From 4b239f458c229de044d6905c2b0f9fe16ed9e01e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 17 Dec 2010 16:58:28 -0800
Subject: x86-64, mm: Put early page table high

While dubug kdump, found current kernel will have problem with crashkernel=512M.

It turns out that initial mapping is to 512M, and later initial mapping to 4G
(acutally is 2040M in my platform), will put page table near 512M.
then initial mapping to 128g will be near 2g.

before this patch:
[    0.000000] initial memory mapped : 0 - 20000000
[    0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff]
[    0.000000]  0000000000 - 007f600000 page 2M
[    0.000000]  007f600000 - 007f750000 page 4k
[    0.000000] kernel direct mapping tables up to 7f750000 @ [0x1fffc000-0x1fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x1fffc000-0x1fffdfff]          PGTABLE
[    0.000000] init_memory_mapping: [0x00000100000000-0x0000207fffffff]
[    0.000000]  0100000000 - 2080000000 page 2M
[    0.000000] kernel direct mapping tables up to 2080000000 @ [0x7bc01000-0x7bc83fff]
[    0.000000]     memblock_x86_reserve_range: [0x7bc01000-0x7bc7efff]          PGTABLE
[    0.000000] RAMDISK: 7bc84000 - 7f745000
[    0.000000] crashkernel reservation failed - No suitable area found.

after patch:
[    0.000000] initial memory mapped : 0 - 20000000
[    0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff]
[    0.000000]  0000000000 - 007f600000 page 2M
[    0.000000]  007f600000 - 007f750000 page 4k
[    0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff]
[    0.000000]     memblock_x86_reserve_range: [0x7f74c000-0x7f74dfff]          PGTABLE
[    0.000000] init_memory_mapping: [0x00000100000000-0x0000207fffffff]
[    0.000000]  0100000000 - 2080000000 page 2M
[    0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x207ff7d000-0x207fffafff]          PGTABLE
[    0.000000] RAMDISK: 7bc84000 - 7f745000
[    0.000000]     memblock_x86_reserve_range: [0x17000000-0x36ffffff]     CRASH KERNEL
[    0.000000] Reserving 512MB of memory at 368MB for crashkernel (System RAM: 133120MB)

It means with the patch, page table for [0, 2g) will need 2g, instead of under 512M,
page table for [4g, 128g) will be near 128g, instead of under 2g.

That would good, if we have lots of memory above 4g, like 1024g, or 2048g or 16T, will not put
related page table under 2g. that would be have chance to fill the under 2g if 1G or 2M page is
not used.

the code change will use add map_low_page() and update unmap_low_page() for 64bit, and use them
to get access the corresponding high memory for page table setting.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D0C0734.7060900@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a1..5863950 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -33,7 +33,7 @@ int direct_gbpages
 static void __init find_early_table_space(unsigned long end, int use_pse,
 					  int use_gbpages)
 {
-	unsigned long puds, pmds, ptes, tables, start;
+	unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
 	phys_addr_t base;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
@@ -73,12 +73,9 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 	 * need roughly 0.5KB per GB.
 	 */
 #ifdef CONFIG_X86_32
-	start = 0x7000;
-#else
-	start = 0x8000;
+	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
-	base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT,
-					tables, PAGE_SIZE);
+	base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
 	if (base == MEMBLOCK_ERROR)
 		panic("Cannot find space for the kernel page tables");
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 71a5929..024847d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -333,12 +333,28 @@ static __ref void *alloc_low_page(unsigned long *phys)
 	return adr;
 }
 
+static __ref void *map_low_page(void *virt)
+{
+	void *adr;
+	unsigned long phys, left;
+
+	if (after_bootmem)
+		return virt;
+
+	phys = __pa(virt);
+	left = phys & (PAGE_SIZE - 1);
+	adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
+	adr = (void *)(((unsigned long)adr) | left);
+
+	return adr;
+}
+
 static __ref void unmap_low_page(void *adr)
 {
 	if (after_bootmem)
 		return;
 
-	early_iounmap(adr, PAGE_SIZE);
+	early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
 }
 
 static unsigned long __meminit
@@ -386,15 +402,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
 }
 
 static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
-		pgprot_t prot)
-{
-	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
-
-	return phys_pte_init(pte, address, end, prot);
-}
-
-static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 	      unsigned long page_size_mask, pgprot_t prot)
 {
@@ -420,8 +427,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 		if (pmd_val(*pmd)) {
 			if (!pmd_large(*pmd)) {
 				spin_lock(&init_mm.page_table_lock);
-				last_map_addr = phys_pte_update(pmd, address,
+				pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+				last_map_addr = phys_pte_init(pte, address,
 								end, prot);
+				unmap_low_page(pte);
 				spin_unlock(&init_mm.page_table_lock);
 				continue;
 			}
@@ -468,18 +477,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 }
 
 static unsigned long __meminit
-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
-		unsigned long page_size_mask, pgprot_t prot)
-{
-	pmd_t *pmd = pmd_offset(pud, 0);
-	unsigned long last_map_addr;
-
-	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
-	__flush_tlb_all();
-	return last_map_addr;
-}
-
-static unsigned long __meminit
 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 			 unsigned long page_size_mask)
 {
@@ -504,8 +501,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 
 		if (pud_val(*pud)) {
 			if (!pud_large(*pud)) {
-				last_map_addr = phys_pmd_update(pud, addr, end,
+				pmd = map_low_page(pmd_offset(pud, 0));
+				last_map_addr = phys_pmd_init(pmd, addr, end,
 							 page_size_mask, prot);
+				unmap_low_page(pmd);
+				__flush_tlb_all();
 				continue;
 			}
 			/*
@@ -553,17 +553,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 	return last_map_addr;
 }
 
-static unsigned long __meminit
-phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
-		 unsigned long page_size_mask)
-{
-	pud_t *pud;
-
-	pud = (pud_t *)pgd_page_vaddr(*pgd);
-
-	return phys_pud_init(pud, addr, end, page_size_mask);
-}
-
 unsigned long __meminit
 kernel_physical_mapping_init(unsigned long start,
 			     unsigned long end,
@@ -587,8 +576,10 @@ kernel_physical_mapping_init(unsigned long start,
 			next = end;
 
 		if (pgd_val(*pgd)) {
-			last_map_addr = phys_pud_update(pgd, __pa(start),
+			pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+			last_map_addr = phys_pud_init(pud, __pa(start),
 						 __pa(end), page_size_mask);
+			unmap_low_page(pud);
 			continue;
 		}
 
-- 
cgit v0.10.2


From 32e3f2b00c529477d26895c5428ed95bba537443 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 17 Dec 2010 16:58:40 -0800
Subject: x86-64, gart: Fix allocation with memblock

When trying to change alloc_bootmem with memblock to go with real top-down
Found one old system:
[    0.000000] Node 0: aperture @ ac000000 size 64 MB
[    0.000000] Aperture pointing to e820 RAM. Ignoring.
[    0.000000] Your BIOS doesn't leave a aperture memory hole
[    0.000000] Please enable the IOMMU option in the BIOS setup
[    0.000000] This costs you 64 MB of RAM
[    0.000000]     memblock_x86_reserve_range: [0x2020000000-0x2023ffffff]       aperture64
[    0.000000] Cannot allocate aperture memory hole (ffff882020000000,65536K)
[    0.000000]        memblock_x86_free_range: [0x2020000000-0x2023ffffff]
[    0.000000] Kernel panic - not syncing: Not enough memory for aperture
[    0.000000] Pid: 0, comm: swapper Not tainted 2.6.37-rc5-tip-yh-06229-gb792dc2-dirty #331
[    0.000000] Call Trace:
[    0.000000]  [<ffffffff81cf50fe>] ? panic+0x91/0x1a3
[    0.000000]  [<ffffffff827c66b2>] ? gart_iommu_hole_init+0x3d7/0x4a3
[    0.000000]  [<ffffffff81d026a9>] ? _etext+0x0/0x3
[    0.000000]  [<ffffffff827ba940>] ? pci_iommu_alloc+0x47/0x71
[    0.000000]  [<ffffffff827c820b>] ? mem_init+0x19/0xec
[    0.000000]  [<ffffffff827b3c40>] ? start_kernel+0x20a/0x3e8
[    0.000000]  [<ffffffff827b32cc>] ? x86_64_start_reservations+0x9c/0xa0
[    0.000000]  [<ffffffff827b33e4>] ? x86_64_start_kernel+0x114/0x11b

it means __alloc_bootmem_nopanic() get too high for that aperture.

Use memblock_find_in_range() with limit directly.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D0C0740.90104@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index dcd7c83..85f66b4 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/mmzone.h>
 #include <linux/pci_ids.h>
 #include <linux/pci.h>
@@ -69,7 +69,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
 static u32 __init allocate_aperture(void)
 {
 	u32 aper_size;
-	void *p;
+	unsigned long addr;
 
 	/* aper_size should <= 1G */
 	if (fallback_aper_order > 5)
@@ -95,27 +95,26 @@ static u32 __init allocate_aperture(void)
 	 * so don't use 512M below as gart iommu, leave the space for kernel
 	 * code for safe
 	 */
-	p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
+	addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
+	if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
+		printk(KERN_ERR
+			"Cannot allocate aperture memory hole (%lx,%uK)\n",
+				addr, aper_size>>10);
+		return 0;
+	}
+	memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
 	/*
 	 * Kmemleak should not scan this block as it may not be mapped via the
 	 * kernel direct mapping.
 	 */
-	kmemleak_ignore(p);
-	if (!p || __pa(p)+aper_size > 0xffffffff) {
-		printk(KERN_ERR
-			"Cannot allocate aperture memory hole (%p,%uK)\n",
-				p, aper_size>>10);
-		if (p)
-			free_bootmem(__pa(p), aper_size);
-		return 0;
-	}
+	kmemleak_ignore(phys_to_virt(addr));
 	printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
-			aper_size >> 10, __pa(p));
-	insert_aperture_resource((u32)__pa(p), aper_size);
-	register_nosave_region((u32)__pa(p) >> PAGE_SHIFT,
-				(u32)__pa(p+aper_size) >> PAGE_SHIFT);
+			aper_size >> 10, addr);
+	insert_aperture_resource((u32)addr, aper_size);
+	register_nosave_region(addr >> PAGE_SHIFT,
+			       (addr+aper_size) >> PAGE_SHIFT);
 
-	return (u32)__pa(p);
+	return (u32)addr;
 }
 
 
-- 
cgit v0.10.2


From 1a4a678b12c84db9ae5dce424e0e97f0559bb57c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 17 Dec 2010 16:59:07 -0800
Subject: memblock: Make find_memory_core_early() find from top-down

That is used for find ram in node or bootmem type.

We should make it top-down so it will be consistent to memblock_find,
and to avoid allocating potentially valuable low memory before we
actually need it.

Suggested-by: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D0C075B.3040501@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07a6544..19413bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3555,6 +3555,34 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
 	return -1;
 }
 
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+	int i;
+
+	for (i = nr_nodemap_entries - 1; i >= 0; i--)
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+
+	return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+	for (index = index - 1; index >= 0; index--)
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+
+	return -1;
+}
+
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 /*
  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -3606,6 +3634,10 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 	for (i = first_active_region_index_in_nid(nid); i != -1; \
 				i = next_active_region_index_in_nid(i, nid))
 
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+	for (i = last_active_region_index_in_nid(nid); i != -1; \
+				i = previous_active_region_index_in_nid(i, nid))
+
 /**
  * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -3644,7 +3676,7 @@ u64 __init find_memory_core_early(int nid, u64 size, u64 align,
 	int i;
 
 	/* Need to go over early_node_map to find out good range for node */
-	for_each_active_range_index_in_nid(i, nid) {
+	for_each_active_range_index_in_nid_reverse(i, nid) {
 		u64 addr;
 		u64 ei_start, ei_last;
 		u64 final_start, final_end;
-- 
cgit v0.10.2


From 45635ab5e41bcde94a82f9a05d660ef77fe38c1b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Dec 2010 16:47:54 -0800
Subject: x86: Change get_max_mapped() to inline

Move it into head file. to prepare use it in other files.

[ hpa: added missing <linux/types.h> and changed type to phys_addr_t. ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D1933BA.8000508@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 1df6621..93626e6 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_PAGE_DEFS_H
 
 #include <linux/const.h>
+#include <linux/types.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
@@ -45,6 +46,11 @@ extern int devmem_is_allowed(unsigned long pagenr);
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
+static inline phys_addr_t get_max_mapped(void)
+{
+	return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
+}
+
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index df172c1..3def8c9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -669,15 +669,6 @@ static int __init parse_reservelow(char *p)
 
 early_param("reservelow", parse_reservelow);
 
-static u64 __init get_max_mapped(void)
-{
-	u64 end = max_pfn_mapped;
-
-	end <<= PAGE_SHIFT;
-
-	return end;
-}
-
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
-- 
cgit v0.10.2


From dbef7b56d2fc5115f26f72a0b080283bbf972cab Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Dec 2010 16:48:08 -0800
Subject: x86-64, numa: Allocate memnodemap under max_pfn_mapped

We need to access it right way, so make sure that it is mapped already.

Prepare to put page table on local node, and nodemap is used before that.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D1933C8.7060105@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7762a51..02d36ff 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -87,7 +87,7 @@ static int __init allocate_cachealigned_memnodemap(void)
 
 	addr = 0x8000;
 	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
-	nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT,
+	nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
 				      nodemap_size, L1_CACHE_BYTES);
 	if (nodemap_addr == MEMBLOCK_ERROR) {
 		printk(KERN_ERR
-- 
cgit v0.10.2


From 1411e0ec3123ae4c4ead6bfc9fe3ee5a3ae5c327 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Dec 2010 16:48:17 -0800
Subject: x86-64, numa: Put pgtable to local node memory

Introduce init_memory_mapping_high(), and use it with 64bit.

It will go with every memory segment above 4g to create page table to the
memory range itself.

before this patch all page tables was on one node.

with this patch, one RED-PEN is killed

debug out for 8 sockets system after patch
[    0.000000] initial memory mapped : 0 - 20000000
[    0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff]
[    0.000000]  0000000000 - 007f600000 page 2M
[    0.000000]  007f600000 - 007f750000 page 4k
[    0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff]
[    0.000000] RAMDISK: 7bc84000 - 7f745000
....
[    0.000000] Adding active range (0, 0x10, 0x95) 0 entries of 3200 used
[    0.000000] Adding active range (0, 0x100, 0x7f750) 1 entries of 3200 used
[    0.000000] Adding active range (0, 0x100000, 0x1080000) 2 entries of 3200 used
[    0.000000] Adding active range (1, 0x1080000, 0x2080000) 3 entries of 3200 used
[    0.000000] Adding active range (2, 0x2080000, 0x3080000) 4 entries of 3200 used
[    0.000000] Adding active range (3, 0x3080000, 0x4080000) 5 entries of 3200 used
[    0.000000] Adding active range (4, 0x4080000, 0x5080000) 6 entries of 3200 used
[    0.000000] Adding active range (5, 0x5080000, 0x6080000) 7 entries of 3200 used
[    0.000000] Adding active range (6, 0x6080000, 0x7080000) 8 entries of 3200 used
[    0.000000] Adding active range (7, 0x7080000, 0x8080000) 9 entries of 3200 used
[    0.000000] init_memory_mapping: [0x00000100000000-0x0000107fffffff]
[    0.000000]  0100000000 - 1080000000 page 2M
[    0.000000] kernel direct mapping tables up to 1080000000 @ [0x107ffbd000-0x107fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x107ffc2000-0x107fffffff]          PGTABLE
[    0.000000] init_memory_mapping: [0x00001080000000-0x0000207fffffff]
[    0.000000]  1080000000 - 2080000000 page 2M
[    0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x207ffc0000-0x207fffffff]          PGTABLE
[    0.000000] init_memory_mapping: [0x00002080000000-0x0000307fffffff]
[    0.000000]  2080000000 - 3080000000 page 2M
[    0.000000] kernel direct mapping tables up to 3080000000 @ [0x307ff3d000-0x307fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x307ffc0000-0x307fffffff]          PGTABLE
[    0.000000] init_memory_mapping: [0x00003080000000-0x0000407fffffff]
[    0.000000]  3080000000 - 4080000000 page 2M
[    0.000000] kernel direct mapping tables up to 4080000000 @ [0x407fefd000-0x407fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x407ffc0000-0x407fffffff]          PGTABLE
[    0.000000] init_memory_mapping: [0x00004080000000-0x0000507fffffff]
[    0.000000]  4080000000 - 5080000000 page 2M
[    0.000000] kernel direct mapping tables up to 5080000000 @ [0x507febd000-0x507fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x507ffc0000-0x507fffffff]          PGTABLE
[    0.000000] init_memory_mapping: [0x00005080000000-0x0000607fffffff]
[    0.000000]  5080000000 - 6080000000 page 2M
[    0.000000] kernel direct mapping tables up to 6080000000 @ [0x607fe7d000-0x607fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x607ffc0000-0x607fffffff]          PGTABLE
[    0.000000] init_memory_mapping: [0x00006080000000-0x0000707fffffff]
[    0.000000]  6080000000 - 7080000000 page 2M
[    0.000000] kernel direct mapping tables up to 7080000000 @ [0x707fe3d000-0x707fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x707ffc0000-0x707fffffff]          PGTABLE
[    0.000000] init_memory_mapping: [0x00007080000000-0x0000807fffffff]
[    0.000000]  7080000000 - 8080000000 page 2M
[    0.000000] kernel direct mapping tables up to 8080000000 @ [0x807fdfc000-0x807fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x807ffbf000-0x807fffffff]          PGTABLE
[    0.000000] Initmem setup node 0 [0000000000000000-000000107fffffff]
[    0.000000]   NODE_DATA [0x0000107ffbd000-0x0000107ffc1fff]
[    0.000000] Initmem setup node 1 [0000001080000000-000000207fffffff]
[    0.000000]   NODE_DATA [0x0000207ffbb000-0x0000207ffbffff]
[    0.000000] Initmem setup node 2 [0000002080000000-000000307fffffff]
[    0.000000]   NODE_DATA [0x0000307ffbb000-0x0000307ffbffff]
[    0.000000] Initmem setup node 3 [0000003080000000-000000407fffffff]
[    0.000000]   NODE_DATA [0x0000407ffbb000-0x0000407ffbffff]
[    0.000000] Initmem setup node 4 [0000004080000000-000000507fffffff]
[    0.000000]   NODE_DATA [0x0000507ffbb000-0x0000507ffbffff]
[    0.000000] Initmem setup node 5 [0000005080000000-000000607fffffff]
[    0.000000]   NODE_DATA [0x0000607ffbb000-0x0000607ffbffff]
[    0.000000] Initmem setup node 6 [0000006080000000-000000707fffffff]
[    0.000000]   NODE_DATA [0x0000707ffbb000-0x0000707ffbffff]
[    0.000000] Initmem setup node 7 [0000007080000000-000000807fffffff]
[    0.000000]   NODE_DATA [0x0000807ffba000-0x0000807ffbefff]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D1933D1.9020609@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 93626e6..731d211 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -54,6 +54,8 @@ static inline phys_addr_t get_max_mapped(void)
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
+void init_memory_mapping_high(void);
+
 extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 				int acpi, int k8);
 extern void free_initmem(void);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3def8c9..fc0fe74 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -931,14 +931,6 @@ void __init setup_arch(char **cmdline_p)
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 	max_pfn_mapped = max_low_pfn_mapped;
 
-#ifdef CONFIG_X86_64
-	if (max_pfn > max_low_pfn) {
-		max_pfn_mapped = init_memory_mapping(1UL<<32,
-						     max_pfn<<PAGE_SHIFT);
-		/* can we preseve max_low_pfn ?*/
-		max_low_pfn = max_pfn;
-	}
-#endif
 	memblock.current_limit = get_max_mapped();
 
 	/*
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 51fae9c..ae6ad69 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -221,12 +221,14 @@ int __init amd_scan_nodes(void)
 		apicid_base = boot_cpu_physical_apicid;
 	}
 
-	for_each_node_mask(i, node_possible_map) {
-		int j;
-
+	for_each_node_mask(i, node_possible_map)
 		memblock_x86_register_active_regions(i,
 				nodes[i].start >> PAGE_SHIFT,
 				nodes[i].end >> PAGE_SHIFT);
+	init_memory_mapping_high();
+	for_each_node_mask(i, node_possible_map) {
+		int j;
+
 		for (j = apicid_base; j < cores + apicid_base; j++)
 			apicid_to_node[(i << bits) + j] = i;
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 5863950..fa6fe75 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -65,16 +65,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 #ifdef CONFIG_X86_32
 	/* for fixmap */
 	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
 
-	/*
-	 * RED-PEN putting page tables only on node 0 could
-	 * cause a hotspot and fill up ZONE_DMA. The page tables
-	 * need roughly 0.5KB per GB.
-	 */
-#ifdef CONFIG_X86_32
 	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
+
 	base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
 	if (base == MEMBLOCK_ERROR)
 		panic("Cannot find space for the kernel page tables");
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 024847d..194f273 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -607,9 +607,63 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 				int acpi, int k8)
 {
 	memblock_x86_register_active_regions(0, start_pfn, end_pfn);
+	init_memory_mapping_high();
 }
 #endif
 
+struct mapping_work_data {
+	unsigned long start;
+	unsigned long end;
+	unsigned long pfn_mapped;
+};
+
+static int __init_refok
+mapping_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax)
+{
+	struct mapping_work_data *data = datax;
+	unsigned long pfn_mapped;
+	unsigned long final_start, final_end;
+
+	final_start = max_t(unsigned long, start_pfn<<PAGE_SHIFT, data->start);
+	final_end = min_t(unsigned long, end_pfn<<PAGE_SHIFT, data->end);
+
+	if (final_end <= final_start)
+		return 0;
+
+	pfn_mapped = init_memory_mapping(final_start, final_end);
+
+	if (pfn_mapped > data->pfn_mapped)
+		data->pfn_mapped = pfn_mapped;
+
+	return 0;
+}
+
+static unsigned long __init_refok
+init_memory_mapping_active_regions(unsigned long start, unsigned long end)
+{
+	struct mapping_work_data data;
+
+	data.start = start;
+	data.end = end;
+	data.pfn_mapped = 0;
+
+	work_with_active_regions(MAX_NUMNODES, mapping_work_fn, &data);
+
+	return data.pfn_mapped;
+}
+
+void __init_refok init_memory_mapping_high(void)
+{
+	if (max_pfn > max_low_pfn) {
+		max_pfn_mapped = init_memory_mapping_active_regions(1UL<<32,
+							 max_pfn<<PAGE_SHIFT);
+		/* can we preserve max_low_pfn ? */
+		max_low_pfn = max_pfn;
+
+		memblock.current_limit = get_max_mapped();
+	}
+}
+
 void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 02d36ff..7cc26ae 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -590,11 +590,12 @@ static int __init numa_emulation(unsigned long start_pfn,
 	 * the e820 memory map.
 	 */
 	remove_all_active_ranges();
-	for_each_node_mask(i, node_possible_map) {
+	for_each_node_mask(i, node_possible_map)
 		memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
+	init_memory_mapping_high();
+	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	}
 	acpi_fake_nodes(nodes, num_nodes);
 	numa_init_array();
 	return 0;
@@ -645,6 +646,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	for (i = 0; i < nr_cpu_ids; i++)
 		numa_set_node(i, 0);
 	memblock_x86_register_active_regions(0, start_pfn, last_pfn);
+	init_memory_mapping_high();
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
 }
 
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d..0b961c8 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -433,6 +433,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 	}
 
+	init_memory_mapping_high();
+
 	/* Account for nodes with cpus and no memory */
 	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
 
-- 
cgit v0.10.2


From f005fe12b90c5b9fe180a09209a893e09affa8aa Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Dec 2010 16:48:32 -0800
Subject: x86-64: Move out cleanup higmap [_brk_end, _end) out of
 init_memory_mapping()

It is not related to init_memory_mapping(),  and init_memory_mapping() is
getting more bigger.

So make it as seperated function and call it from reserve_brk() and that is
point when _brk_end is concluded.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D1933E0.7090305@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fc0fe74..635185b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -293,10 +293,32 @@ static void __init init_gbpages(void)
 	else
 		direct_gbpages = 0;
 }
+
+static void __init cleanup_highmap_brk_end(void)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	mmu_cr4_features = read_cr4();
+
+	/*
+	 * _brk_end cannot change anymore, but it and _end may be
+	 * located on different 2M pages. cleanup_highmap(), however,
+	 * can only consider _end when it runs, so destroy any
+	 * mappings beyond _brk_end here.
+	 */
+	pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
+	pmd = pmd_offset(pud, _brk_end - 1);
+	while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
+		pmd_clear(pmd);
+}
 #else
 static inline void init_gbpages(void)
 {
 }
+static inline void cleanup_highmap_brk_end(void)
+{
+}
 #endif
 
 static void __init reserve_brk(void)
@@ -307,6 +329,8 @@ static void __init reserve_brk(void)
 	/* Mark brk area as locked down and no longer taking any
 	   new allocations */
 	_brk_start = 0;
+
+	cleanup_highmap_brk_end();
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fa6fe75..35ee75d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -270,25 +270,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	load_cr3(swapper_pg_dir);
 #endif
 
-#ifdef CONFIG_X86_64
-	if (!after_bootmem && !start) {
-		pud_t *pud;
-		pmd_t *pmd;
-
-		mmu_cr4_features = read_cr4();
-
-		/*
-		 * _brk_end cannot change anymore, but it and _end may be
-		 * located on different 2M pages. cleanup_highmap(), however,
-		 * can only consider _end when it runs, so destroy any
-		 * mappings beyond _brk_end here.
-		 */
-		pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
-		pmd = pmd_offset(pud, _brk_end - 1);
-		while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
-			pmd_clear(pmd);
-	}
-#endif
 	__flush_tlb_all();
 
 	if (!after_bootmem && e820_table_end > e820_table_start)
-- 
cgit v0.10.2


From c9e358dfc4a8cb2227172ef77908c2e0ee17bcb9 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Fri, 21 Jan 2011 09:24:48 -0700
Subject: driver-core: remove conditionals around devicetree pointers

Having conditional around the of_match_table and the of_node pointers
turns out to make driver code use ugly #ifdef blocks.  Drop the
conditionals and remove the #ifdef blocks from the affected drivers.

Also tidy up minor whitespace issues within the same hunks.

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index ef3bcb1..c2ef5ce 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -330,9 +330,7 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev)
 	i2c->adap = ocores_adapter;
 	i2c_set_adapdata(&i2c->adap, i2c);
 	i2c->adap.dev.parent = &pdev->dev;
-#ifdef CONFIG_OF
 	i2c->adap.dev.of_node = pdev->dev.of_node;
-#endif
 
 	/* add i2c adapter to i2c tree */
 	ret = i2c_add_adapter(&i2c->adap);
@@ -390,15 +388,11 @@ static int ocores_i2c_resume(struct platform_device *pdev)
 #define ocores_i2c_resume	NULL
 #endif
 
-#ifdef CONFIG_OF
 static struct of_device_id ocores_i2c_match[] = {
-        {
-                .compatible = "opencores,i2c-ocores",
-        },
-        {},
+	{ .compatible = "opencores,i2c-ocores", },
+	{},
 };
 MODULE_DEVICE_TABLE(of, ocores_i2c_match);
-#endif
 
 /* work with hotplug and coldplug */
 MODULE_ALIAS("platform:ocores-i2c");
@@ -411,9 +405,7 @@ static struct platform_driver ocores_i2c_driver = {
 	.driver  = {
 		.owner = THIS_MODULE,
 		.name = "ocores-i2c",
-#ifdef CONFIG_OF
-                .of_match_table = ocores_i2c_match,
-#endif
+		.of_match_table = ocores_i2c_match,
 	},
 };
 
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index f0bd5bc..045ba6e 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -537,9 +537,7 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
 	client->dev.parent = &client->adapter->dev;
 	client->dev.bus = &i2c_bus_type;
 	client->dev.type = &i2c_client_type;
-#ifdef CONFIG_OF
 	client->dev.of_node = info->of_node;
-#endif
 
 	dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap),
 		     client->addr);
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index fd877f6..2f7fc0c 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -1516,21 +1516,17 @@ static int __devexit mmc_spi_remove(struct spi_device *spi)
 	return 0;
 }
 
-#if defined(CONFIG_OF)
 static struct of_device_id mmc_spi_of_match_table[] __devinitdata = {
 	{ .compatible = "mmc-spi-slot", },
 	{},
 };
-#endif
 
 static struct spi_driver mmc_spi_driver = {
 	.driver = {
 		.name =		"mmc_spi",
 		.bus =		&spi_bus_type,
 		.owner =	THIS_MODULE,
-#if defined(CONFIG_OF)
 		.of_match_table = mmc_spi_of_match_table,
-#endif
 	},
 	.probe =	mmc_spi_probe,
 	.remove =	__devexit_p(mmc_spi_remove),
diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c
index b79d7e1..db0290f 100644
--- a/drivers/net/ethoc.c
+++ b/drivers/net/ethoc.c
@@ -1163,15 +1163,11 @@ static int ethoc_resume(struct platform_device *pdev)
 # define ethoc_resume  NULL
 #endif
 
-#ifdef CONFIG_OF
 static struct of_device_id ethoc_match[] = {
-	{
-		.compatible = "opencores,ethoc",
-	},
+	{ .compatible = "opencores,ethoc", },
 	{},
 };
 MODULE_DEVICE_TABLE(of, ethoc_match);
-#endif
 
 static struct platform_driver ethoc_driver = {
 	.probe   = ethoc_probe,
@@ -1181,9 +1177,7 @@ static struct platform_driver ethoc_driver = {
 	.driver  = {
 		.name = "ethoc",
 		.owner = THIS_MODULE,
-#ifdef CONFIG_OF
 		.of_match_table = ethoc_match,
-#endif
 	},
 };
 
diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c
index 9592883..a429b01 100644
--- a/drivers/spi/pxa2xx_spi.c
+++ b/drivers/spi/pxa2xx_spi.c
@@ -1557,9 +1557,7 @@ static int __devinit pxa2xx_spi_probe(struct platform_device *pdev)
 	drv_data->ssp = ssp;
 
 	master->dev.parent = &pdev->dev;
-#ifdef CONFIG_OF
 	master->dev.of_node = pdev->dev.of_node;
-#endif
 	/* the spi->mode bits understood by this driver: */
 	master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
 
diff --git a/drivers/spi/pxa2xx_spi_pci.c b/drivers/spi/pxa2xx_spi_pci.c
index 351d8a375..b6589bb 100644
--- a/drivers/spi/pxa2xx_spi_pci.c
+++ b/drivers/spi/pxa2xx_spi_pci.c
@@ -98,9 +98,7 @@ static int __devinit ce4100_spi_probe(struct pci_dev *dev,
 	pdev->dev.parent = &dev->dev;
 	pdev->dev.platform_data = &spi_info->spi_pdata;
 
-#ifdef CONFIG_OF
 	pdev->dev.of_node = dev->dev.of_node;
-#endif
 	pdev->dev.release = plat_dev_release;
 
 	spi_pdata->num_chipselect = dev->devfn;
diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c
index 7adaef6..4d2c75d 100644
--- a/drivers/spi/xilinx_spi.c
+++ b/drivers/spi/xilinx_spi.c
@@ -351,14 +351,12 @@ static irqreturn_t xilinx_spi_irq(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-#ifdef CONFIG_OF
 static const struct of_device_id xilinx_spi_of_match[] = {
 	{ .compatible = "xlnx,xps-spi-2.00.a", },
 	{ .compatible = "xlnx,xps-spi-2.00.b", },
 	{}
 };
 MODULE_DEVICE_TABLE(of, xilinx_spi_of_match);
-#endif
 
 struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem,
 	u32 irq, s16 bus_num, int num_cs, int little_endian, int bits_per_word)
@@ -394,9 +392,7 @@ struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem,
 
 	master->bus_num = bus_num;
 	master->num_chipselect = num_cs;
-#ifdef CONFIG_OF
 	master->dev.of_node = dev->of_node;
-#endif
 
 	xspi->mem = *mem;
 	xspi->irq = irq;
@@ -539,9 +535,7 @@ static struct platform_driver xilinx_spi_driver = {
 	.driver = {
 		.name = XILINX_SPI_NAME,
 		.owner = THIS_MODULE,
-#ifdef CONFIG_OF
 		.of_match_table = xilinx_spi_of_match,
-#endif
 	},
 };
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 1bf5cf0..ca5d252 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -128,9 +128,7 @@ struct device_driver {
 
 	bool suppress_bind_attrs;	/* disables bind/unbind via sysfs */
 
-#if defined(CONFIG_OF)
 	const struct of_device_id	*of_match_table;
-#endif
 
 	int (*probe) (struct device *dev);
 	int (*remove) (struct device *dev);
@@ -441,9 +439,8 @@ struct device {
 					     override */
 	/* arch specific additions */
 	struct dev_archdata	archdata;
-#ifdef CONFIG_OF
-	struct device_node	*of_node;
-#endif
+
+	struct device_node	*of_node; /* associated device tree node */
 
 	dev_t			devt;	/* dev_t, creates the sysfs "dev" */
 
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 903576d..06a8d9c 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -258,9 +258,7 @@ struct i2c_board_info {
 	unsigned short	addr;
 	void		*platform_data;
 	struct dev_archdata	*archdata;
-#ifdef CONFIG_OF
 	struct device_node *of_node;
-#endif
 	int		irq;
 };
 
diff --git a/include/linux/of.h b/include/linux/of.h
index cad7cf0..d9dd664 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -23,8 +23,6 @@
 
 #include <asm/byteorder.h>
 
-#ifdef CONFIG_OF
-
 typedef u32 phandle;
 typedef u32 ihandle;
 
@@ -65,6 +63,8 @@ struct device_node {
 #endif
 };
 
+#ifdef CONFIG_OF
+
 /* Pointer for first entry in chain of all nodes. */
 extern struct device_node *allnodes;
 extern struct device_node *of_chosen;
-- 
cgit v0.10.2


From cd7eab44e9946c28d595abe3e9a43e945bc49141 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 19 Jan 2011 21:01:44 +0000
Subject: genirq: Add IRQ affinity notifiers

When initiating I/O on a multiqueue and multi-IRQ device, we may want
to select a queue for which the response will be handled on the same
or a nearby CPU.  This requires a reverse-map of IRQ affinity.  Add a
notification mechanism to support this.

This is based closely on work by Thomas Gleixner <tglx@linutronix.de>.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Cc: linux-net-drivers@solarflare.com
Cc: Tom Herbert <therbert@google.com>
Cc: David Miller <davem@davemloft.net>
LKML-Reference: <1295470904.11126.84.camel@bwh-desktop>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d42..63c5ad7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -14,6 +14,8 @@
 #include <linux/smp.h>
 #include <linux/percpu.h>
 #include <linux/hrtimer.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
 
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
@@ -240,6 +242,35 @@ extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
 
 extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
+
+/**
+ * struct irq_affinity_notify - context for notification of IRQ affinity changes
+ * @irq:		Interrupt to which notification applies
+ * @kref:		Reference count, for internal use
+ * @work:		Work item, for internal use
+ * @notify:		Function to be called on change.  This will be
+ *			called in process context.
+ * @release:		Function to be called on release.  This will be
+ *			called in process context.  Once registered, the
+ *			structure must only be freed when this function is
+ *			called or later.
+ */
+struct irq_affinity_notify {
+	unsigned int irq;
+	struct kref kref;
+	struct work_struct work;
+	void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
+	void (*release)(struct kref *ref);
+};
+
+extern int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
+
+static inline void irq_run_affinity_notifiers(void)
+{
+	flush_scheduled_work();
+}
+
 #else /* CONFIG_SMP */
 
 static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
@@ -255,7 +286,7 @@ static inline int irq_can_set_affinity(unsigned int irq)
 static inline int irq_select_affinity(unsigned int irq)  { return 0; }
 
 static inline int irq_set_affinity_hint(unsigned int irq,
-                                        const struct cpumask *m)
+					const struct cpumask *m)
 {
 	return -EINVAL;
 }
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index c1a95b7..bfef56d 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -8,6 +8,7 @@
  * For now it's included from <linux/irq.h>
  */
 
+struct irq_affinity_notify;
 struct proc_dir_entry;
 struct timer_rand_state;
 /**
@@ -24,6 +25,7 @@ struct timer_rand_state;
  * @last_unhandled:	aging timer for unhandled count
  * @irqs_unhandled:	stats field for spurious unhandled interrupts
  * @lock:		locking for SMP
+ * @affinity_notify:	context for notification of affinity changes
  * @pending_mask:	pending rebalanced interrupts
  * @threads_active:	number of irqaction threads currently running
  * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
@@ -70,6 +72,7 @@ struct irq_desc {
 	raw_spinlock_t		lock;
 #ifdef CONFIG_SMP
 	const struct cpumask	*affinity_hint;
+	struct irq_affinity_notify *affinity_notify;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_var_t		pending_mask;
 #endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f..0587c5c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -134,6 +134,10 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 		irq_set_thread_affinity(desc);
 	}
 #endif
+	if (desc->affinity_notify) {
+		kref_get(&desc->affinity_notify->kref);
+		schedule_work(&desc->affinity_notify->work);
+	}
 	desc->status |= IRQ_AFFINITY_SET;
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
@@ -155,6 +159,79 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
 }
 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
 
+static void irq_affinity_notify(struct work_struct *work)
+{
+	struct irq_affinity_notify *notify =
+		container_of(work, struct irq_affinity_notify, work);
+	struct irq_desc *desc = irq_to_desc(notify->irq);
+	cpumask_var_t cpumask;
+	unsigned long flags;
+
+	if (!desc)
+		goto out;
+
+	if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+		goto out;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	if (desc->status & IRQ_MOVE_PENDING)
+		cpumask_copy(cpumask, desc->pending_mask);
+	else
+#endif
+		cpumask_copy(cpumask, desc->affinity);
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	notify->notify(notify, cpumask);
+
+	free_cpumask_var(cpumask);
+out:
+	kref_put(&notify->kref, notify->release);
+}
+
+/**
+ *	irq_set_affinity_notifier - control notification of IRQ affinity changes
+ *	@irq:		Interrupt for which to enable/disable notification
+ *	@notify:	Context for notification, or %NULL to disable
+ *			notification.  Function pointers must be initialised;
+ *			the other fields will be initialised by this function.
+ *
+ *	Must be called in process context.  Notification may only be enabled
+ *	after the IRQ is allocated and must be disabled before the IRQ is
+ *	freed using free_irq().
+ */
+int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_affinity_notify *old_notify;
+	unsigned long flags;
+
+	/* The release function is promised process context */
+	might_sleep();
+
+	if (!desc)
+		return -EINVAL;
+
+	/* Complete initialisation of *notify */
+	if (notify) {
+		notify->irq = irq;
+		kref_init(&notify->kref);
+		INIT_WORK(&notify->work, irq_affinity_notify);
+	}
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	old_notify = desc->affinity_notify;
+	desc->affinity_notify = notify;
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	if (old_notify)
+		kref_put(&old_notify->kref, old_notify->release);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
+
 #ifndef CONFIG_AUTO_IRQ_AFFINITY
 /*
  * Generic version of the affinity autoselector.
@@ -1004,6 +1081,11 @@ void free_irq(unsigned int irq, void *dev_id)
 	if (!desc)
 		return;
 
+#ifdef CONFIG_SMP
+	if (WARN_ON(desc->affinity_notify))
+		desc->affinity_notify = NULL;
+#endif
+
 	chip_bus_lock(desc);
 	kfree(__free_irq(irq, dev_id));
 	chip_bus_sync_unlock(desc);
-- 
cgit v0.10.2


From 361c99a661a78ed22264649440e87fe4fe8da1f2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Jan 2011 20:56:53 -0200
Subject: perf evsel: Introduce perf_evlist

Killing two more perf wide global variables: nr_counters and evsel_list
as a list_head.

There are more operations that will need more fields in perf_evlist,
like the pollfd for polling all the fds in a list of evsel instances.

Use option->value to pass the evsel_list to parse_{events,filters}.

LKML-Reference: <new-submission>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 7141c42..f20bc6f 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -402,6 +402,7 @@ LIB_H += util/debug.h
 LIB_H += util/debugfs.h
 LIB_H += util/event.h
 LIB_H += util/evsel.h
+LIB_H += util/evlist.h
 LIB_H += util/exec_cmd.h
 LIB_H += util/types.h
 LIB_H += util/levenshtein.h
@@ -440,6 +441,7 @@ LIB_OBJS += $(OUTPUT)util/ctype.o
 LIB_OBJS += $(OUTPUT)util/debugfs.o
 LIB_OBJS += $(OUTPUT)util/environment.o
 LIB_OBJS += $(OUTPUT)util/event.o
+LIB_OBJS += $(OUTPUT)util/evlist.o
 LIB_OBJS += $(OUTPUT)util/evsel.o
 LIB_OBJS += $(OUTPUT)util/exec_cmd.o
 LIB_OBJS += $(OUTPUT)util/help.o
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index b2f729f..252ace8 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -18,6 +18,7 @@
 
 #include "util/header.h"
 #include "util/event.h"
+#include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/debug.h"
 #include "util/session.h"
@@ -66,6 +67,7 @@ static bool			sample_address			=  false;
 static bool			sample_time			=  false;
 static bool			no_buildid			=  false;
 static bool			no_buildid_cache		=  false;
+static struct perf_evlist	*evsel_list;
 
 static long			samples				=      0;
 static u64			bytes_written			=      0;
@@ -229,7 +231,8 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
 	return h_attr;
 }
 
-static void create_counter(struct perf_evsel *evsel, int cpu)
+static void create_counter(struct perf_evlist *evlist,
+			   struct perf_evsel *evsel, int cpu)
 {
 	char *filter = evsel->filter;
 	struct perf_event_attr *attr = &evsel->attr;
@@ -263,7 +266,7 @@ static void create_counter(struct perf_evsel *evsel, int cpu)
 
 	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
-	if (nr_counters > 1)
+	if (evlist->nr_entries > 1)
 		attr->sample_type |= PERF_SAMPLE_ID;
 
 	/*
@@ -410,7 +413,7 @@ try_again:
 
 		if (evsel->idx || thread_index) {
 			struct perf_evsel *first;
-			first = list_entry(evsel_list.next, struct perf_evsel, node);
+			first = list_entry(evlist->entries.next, struct perf_evsel, node);
 			ret = ioctl(FD(evsel, nr_cpu, thread_index),
 				    PERF_EVENT_IOC_SET_OUTPUT,
 				    FD(first, nr_cpu, 0));
@@ -449,14 +452,14 @@ try_again:
 		sample_type = attr->sample_type;
 }
 
-static void open_counters(int cpu)
+static void open_counters(struct perf_evlist *evlist, int cpu)
 {
 	struct perf_evsel *pos;
 
 	group_fd = -1;
 
-	list_for_each_entry(pos, &evsel_list, node)
-		create_counter(pos, cpu);
+	list_for_each_entry(pos, &evlist->entries, node)
+		create_counter(evlist, pos, cpu);
 
 	nr_cpu++;
 }
@@ -481,9 +484,9 @@ static void atexit_header(void)
 
 		if (!no_buildid)
 			process_buildids();
-		perf_header__write(&session->header, output, true);
+		perf_header__write(&session->header, evsel_list, output, true);
 		perf_session__delete(session);
-		perf_evsel_list__delete();
+		perf_evlist__delete(evsel_list);
 		symbol__exit();
 	}
 }
@@ -611,7 +614,7 @@ static int __cmd_record(int argc, const char **argv)
 			goto out_delete_session;
 	}
 
-	if (have_tracepoints(&evsel_list))
+	if (have_tracepoints(&evsel_list->entries))
 		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
 
 	/*
@@ -674,10 +677,10 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (!system_wide && no_inherit && !cpu_list) {
-		open_counters(-1);
+		open_counters(evsel_list, -1);
 	} else {
 		for (i = 0; i < cpus->nr; i++)
-			open_counters(cpus->map[i]);
+			open_counters(evsel_list, cpus->map[i]);
 	}
 
 	perf_session__set_sample_type(session, sample_type);
@@ -687,7 +690,8 @@ static int __cmd_record(int argc, const char **argv)
 		if (err < 0)
 			return err;
 	} else if (file_new) {
-		err = perf_header__write(&session->header, output, false);
+		err = perf_header__write(&session->header, evsel_list,
+					 output, false);
 		if (err < 0)
 			return err;
 	}
@@ -712,7 +716,7 @@ static int __cmd_record(int argc, const char **argv)
 			return err;
 		}
 
-		if (have_tracepoints(&evsel_list)) {
+		if (have_tracepoints(&evsel_list->entries)) {
 			/*
 			 * FIXME err <= 0 here actually means that
 			 * there were no tracepoints so its not really
@@ -721,7 +725,7 @@ static int __cmd_record(int argc, const char **argv)
 			 * return this more properly and also
 			 * propagate errors that now are calling die()
 			 */
-			err = event__synthesize_tracing_data(output, &evsel_list,
+			err = event__synthesize_tracing_data(output, evsel_list,
 							     process_synthesized_event,
 							     session);
 			if (err <= 0) {
@@ -797,7 +801,7 @@ static int __cmd_record(int argc, const char **argv)
 			for (i = 0; i < nr_cpu; i++) {
 				struct perf_evsel *pos;
 
-				list_for_each_entry(pos, &evsel_list, node) {
+				list_for_each_entry(pos, &evsel_list->entries, node) {
 					for (thread = 0;
 						thread < threads->nr;
 						thread++)
@@ -838,10 +842,10 @@ static const char * const record_usage[] = {
 static bool force, append_file;
 
 const struct option record_options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
+	OPT_CALLBACK('e', "event", &evsel_list, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events),
-	OPT_CALLBACK(0, "filter", NULL, "filter",
+	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
 		     "event filter", parse_filter),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "record events on existing process id"),
@@ -892,6 +896,10 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	int err = -ENOMEM;
 	struct perf_evsel *pos;
 
+	evsel_list = perf_evlist__new();
+	if (evsel_list == NULL)
+		return -ENOMEM;
+
 	argc = parse_options(argc, argv, record_options, record_usage,
 			    PARSE_OPT_STOP_AT_NON_OPTION);
 	if (!argc && target_pid == -1 && target_tid == -1 &&
@@ -913,7 +921,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	if (no_buildid_cache || no_buildid)
 		disable_buildid_cache();
 
-	if (list_empty(&evsel_list) && perf_evsel_list__create_default() < 0) {
+	if (evsel_list->nr_entries == 0 &&
+	    perf_evlist__add_default(evsel_list) < 0) {
 		pr_err("Not enough memory for event selector list\n");
 		goto out_symbol_exit;
 	}
@@ -933,7 +942,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		return -1;
 	}
 
-	list_for_each_entry(pos, &evsel_list, node) {
+	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
 			goto out_free_fd;
 		if (perf_header__push_event(pos->attr.config, event_name(pos)))
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a482a19..da90902 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -43,6 +43,7 @@
 #include "util/parse-options.h"
 #include "util/parse-events.h"
 #include "util/event.h"
+#include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/debug.h"
 #include "util/header.h"
@@ -71,6 +72,8 @@ static struct perf_event_attr default_attrs[] = {
 
 };
 
+struct perf_evlist		*evsel_list;
+
 static bool			system_wide			=  false;
 static struct cpu_map		*cpus;
 static int			run_idx				=  0;
@@ -309,7 +312,7 @@ static int run_perf_stat(int argc __used, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
-	list_for_each_entry(counter, &evsel_list, node) {
+	list_for_each_entry(counter, &evsel_list->entries, node) {
 		if (create_perf_stat_counter(counter) < 0) {
 			if (errno == -EPERM || errno == -EACCES) {
 				error("You may not have permission to collect %sstats.\n"
@@ -347,12 +350,12 @@ static int run_perf_stat(int argc __used, const char **argv)
 	update_stats(&walltime_nsecs_stats, t1 - t0);
 
 	if (no_aggr) {
-		list_for_each_entry(counter, &evsel_list, node) {
+		list_for_each_entry(counter, &evsel_list->entries, node) {
 			read_counter(counter);
 			perf_evsel__close_fd(counter, cpus->nr, 1);
 		}
 	} else {
-		list_for_each_entry(counter, &evsel_list, node) {
+		list_for_each_entry(counter, &evsel_list->entries, node) {
 			read_counter_aggr(counter);
 			perf_evsel__close_fd(counter, cpus->nr, threads->nr);
 		}
@@ -555,10 +558,10 @@ static void print_stat(int argc, const char **argv)
 	}
 
 	if (no_aggr) {
-		list_for_each_entry(counter, &evsel_list, node)
+		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter(counter);
 	} else {
-		list_for_each_entry(counter, &evsel_list, node)
+		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter_aggr(counter);
 	}
 
@@ -610,7 +613,7 @@ static int stat__set_big_num(const struct option *opt __used,
 }
 
 static const struct option options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
+	OPT_CALLBACK('e', "event", &evsel_list, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events),
 	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
@@ -648,6 +651,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 
 	setlocale(LC_ALL, "");
 
+	evsel_list = perf_evlist__new();
+	if (evsel_list == NULL)
+		return -ENOMEM;
+
 	argc = parse_options(argc, argv, options, stat_usage,
 		PARSE_OPT_STOP_AT_NON_OPTION);
 
@@ -679,17 +686,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(stat_usage, options);
 
 	/* Set attrs and nr_counters if no event is selected and !null_run */
-	if (!null_run && !nr_counters) {
+	if (!null_run && !evsel_list->nr_entries) {
 		size_t c;
 
-		nr_counters = ARRAY_SIZE(default_attrs);
-
 		for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) {
-			pos = perf_evsel__new(&default_attrs[c],
-					      nr_counters);
+			pos = perf_evsel__new(&default_attrs[c], c);
 			if (pos == NULL)
 				goto out;
-			list_add(&pos->node, &evsel_list);
+			perf_evlist__add(evsel_list, pos);
 		}
 	}
 
@@ -713,7 +717,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 		return -1;
 	}
 
-	list_for_each_entry(pos, &evsel_list, node) {
+	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_stat_priv(pos) < 0 ||
 		    perf_evsel__alloc_counts(pos, cpus->nr) < 0 ||
 		    perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
@@ -741,9 +745,9 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 	if (status != -1)
 		print_stat(argc, argv);
 out_free_fd:
-	list_for_each_entry(pos, &evsel_list, node)
+	list_for_each_entry(pos, &evsel_list->entries, node)
 		perf_evsel__free_stat_priv(pos);
-	perf_evsel_list__delete();
+	perf_evlist__delete(evsel_list);
 out:
 	thread_map__delete(threads);
 	threads = NULL;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index b6998e0..216b62e 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -21,6 +21,7 @@
 #include "perf.h"
 
 #include "util/color.h"
+#include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/session.h"
 #include "util/symbol.h"
@@ -60,6 +61,8 @@
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
 
+struct perf_evlist		*evsel_list;
+
 static bool			system_wide			=  false;
 
 static int			default_interval		=      0;
@@ -267,7 +270,7 @@ static void __zero_source_counters(struct sym_entry *syme)
 
 	line = syme->src->lines;
 	while (line) {
-		for (i = 0; i < nr_counters; i++)
+		for (i = 0; i < evsel_list->nr_entries; i++)
 			line->count[i] = 0;
 		line = line->next;
 	}
@@ -414,7 +417,7 @@ static double sym_weight(const struct sym_entry *sym)
 	if (!display_weighted)
 		return weight;
 
-	for (counter = 1; counter < nr_counters-1; counter++)
+	for (counter = 1; counter < evsel_list->nr_entries - 1; counter++)
 		weight *= sym->count[counter];
 
 	weight /= (sym->count[counter] + 1);
@@ -501,7 +504,7 @@ static void print_sym_table(void)
 			rb_insert_active_sym(&tmp, syme);
 			sum_ksamples += syme->snap_count;
 
-			for (j = 0; j < nr_counters; j++)
+			for (j = 0; j < evsel_list->nr_entries; j++)
 				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
 		} else
 			list_remove_active_sym(syme);
@@ -535,9 +538,9 @@ static void print_sym_table(void)
 			esamples_percent);
 	}
 
-	if (nr_counters == 1 || !display_weighted) {
+	if (evsel_list->nr_entries == 1 || !display_weighted) {
 		struct perf_evsel *first;
-		first = list_entry(evsel_list.next, struct perf_evsel, node);
+		first = list_entry(evsel_list->entries.next, struct perf_evsel, node);
 		printf("%" PRIu64, (uint64_t)first->attr.sample_period);
 		if (freq)
 			printf("Hz ");
@@ -547,7 +550,7 @@ static void print_sym_table(void)
 
 	if (!display_weighted)
 		printf("%s", event_name(sym_evsel));
-	else list_for_each_entry(counter, &evsel_list, node) {
+	else list_for_each_entry(counter, &evsel_list->entries, node) {
 		if (counter->idx)
 			printf("/");
 
@@ -606,7 +609,7 @@ static void print_sym_table(void)
 			sym_width = winsize.ws_col - dso_width - 29;
 	}
 	putchar('\n');
-	if (nr_counters == 1)
+	if (evsel_list->nr_entries == 1)
 		printf("             samples  pcnt");
 	else
 		printf("   weight    samples  pcnt");
@@ -615,7 +618,7 @@ static void print_sym_table(void)
 		printf("         RIP       ");
 	printf(" %-*.*s DSO\n", sym_width, sym_width, "function");
 	printf("   %s    _______ _____",
-	       nr_counters == 1 ? "      " : "______");
+	       evsel_list->nr_entries == 1 ? "      " : "______");
 	if (verbose)
 		printf(" ________________");
 	printf(" %-*.*s", sym_width, sym_width, graph_line);
@@ -634,7 +637,7 @@ static void print_sym_table(void)
 		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
 					 sum_ksamples));
 
-		if (nr_counters == 1 || !display_weighted)
+		if (evsel_list->nr_entries == 1 || !display_weighted)
 			printf("%20.2f ", syme->weight);
 		else
 			printf("%9.1f %10ld ", syme->weight, syme->snap_count);
@@ -744,7 +747,7 @@ static void print_mapped_keys(void)
 	fprintf(stdout, "\t[d]     display refresh delay.             \t(%d)\n", delay_secs);
 	fprintf(stdout, "\t[e]     display entries (lines).           \t(%d)\n", print_entries);
 
-	if (nr_counters > 1)
+	if (evsel_list->nr_entries > 1)
 		fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(sym_evsel));
 
 	fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", count_filter);
@@ -753,7 +756,7 @@ static void print_mapped_keys(void)
 	fprintf(stdout, "\t[s]     annotate symbol.                   \t(%s)\n", name?: "NULL");
 	fprintf(stdout, "\t[S]     stop annotation.\n");
 
-	if (nr_counters > 1)
+	if (evsel_list->nr_entries > 1)
 		fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0);
 
 	fprintf(stdout,
@@ -783,7 +786,7 @@ static int key_mapped(int c)
 			return 1;
 		case 'E':
 		case 'w':
-			return nr_counters > 1 ? 1 : 0;
+			return evsel_list->nr_entries > 1 ? 1 : 0;
 		default:
 			break;
 	}
@@ -831,22 +834,22 @@ static void handle_keypress(struct perf_session *session, int c)
 				signal(SIGWINCH, SIG_DFL);
 			break;
 		case 'E':
-			if (nr_counters > 1) {
+			if (evsel_list->nr_entries > 1) {
 				fprintf(stderr, "\nAvailable events:");
 
-				list_for_each_entry(sym_evsel, &evsel_list, node)
+				list_for_each_entry(sym_evsel, &evsel_list->entries, node)
 					fprintf(stderr, "\n\t%d %s", sym_evsel->idx, event_name(sym_evsel));
 
 				prompt_integer(&sym_counter, "Enter details event counter");
 
-				if (sym_counter >= nr_counters) {
-					sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node);
+				if (sym_counter >= evsel_list->nr_entries) {
+					sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
 					sym_counter = 0;
 					fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(sym_evsel));
 					sleep(1);
 					break;
 				}
-				list_for_each_entry(sym_evsel, &evsel_list, node)
+				list_for_each_entry(sym_evsel, &evsel_list->entries, node)
 					if (sym_evsel->idx == sym_counter)
 						break;
 			} else sym_counter = 0;
@@ -1198,7 +1201,7 @@ static void perf_session__mmap_read(struct perf_session *self)
 	int i, thread_index;
 
 	for (i = 0; i < cpus->nr; i++) {
-		list_for_each_entry(counter, &evsel_list, node) {
+		list_for_each_entry(counter, &evsel_list->entries, node) {
 			for (thread_index = 0;
 				thread_index < threads->nr;
 				thread_index++) {
@@ -1312,7 +1315,7 @@ static int __cmd_top(void)
 
 	for (i = 0; i < cpus->nr; i++) {
 		group_fd = -1;
-		list_for_each_entry(counter, &evsel_list, node)
+		list_for_each_entry(counter, &evsel_list->entries, node)
 			start_counter(i, counter);
 	}
 
@@ -1354,7 +1357,7 @@ static const char * const top_usage[] = {
 };
 
 static const struct option options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
+	OPT_CALLBACK('e', "event", &evsel_list, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events),
 	OPT_INTEGER('c', "count", &default_interval,
@@ -1404,6 +1407,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	struct perf_evsel *pos;
 	int status = -ENOMEM;
 
+	evsel_list = perf_evlist__new();
+	if (evsel_list == NULL)
+		return -ENOMEM;
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	argc = parse_options(argc, argv, options, top_usage, 0);
@@ -1431,7 +1438,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		cpu_list = NULL;
 	}
 
-	if (!nr_counters && perf_evsel_list__create_default() < 0) {
+	if (!evsel_list->nr_entries &&
+	    perf_evlist__add_default(evsel_list) < 0) {
 		pr_err("Not enough memory for event selector list\n");
 		return -ENOMEM;
 	}
@@ -1459,7 +1467,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	if (cpus == NULL)
 		usage_with_options(top_usage, options);
 
-	list_for_each_entry(pos, &evsel_list, node) {
+	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 ||
 		    perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
 			goto out_free_fd;
@@ -1472,10 +1480,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		pos->attr.sample_period = default_interval;
 	}
 
-	sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node);
+	sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
 
 	symbol_conf.priv_size = (sizeof(struct sym_entry) +
-				 (nr_counters + 1) * sizeof(unsigned long));
+				 (evsel_list->nr_entries + 1) * sizeof(unsigned long));
 
 	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
 	if (symbol__init() < 0)
@@ -1489,9 +1497,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 
 	status = __cmd_top();
 out_free_fd:
-	list_for_each_entry(pos, &evsel_list, node)
+	list_for_each_entry(pos, &evsel_list->entries, node)
 		perf_evsel__free_mmap(pos);
-	perf_evsel_list__delete();
+	perf_evlist__delete(evsel_list);
 
 	return status;
 }
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
new file mode 100644
index 0000000..7b4faec
--- /dev/null
+++ b/tools/perf/util/evlist.c
@@ -0,0 +1,53 @@
+#include "evlist.h"
+#include "evsel.h"
+#include "util.h"
+
+struct perf_evlist *perf_evlist__new(void)
+{
+	struct perf_evlist *evlist = zalloc(sizeof(*evlist));
+
+	if (evlist != NULL) {
+		INIT_LIST_HEAD(&evlist->entries);
+	}
+
+	return evlist;
+}
+
+static void perf_evlist__purge(struct perf_evlist *evlist)
+{
+	struct perf_evsel *pos, *n;
+
+	list_for_each_entry_safe(pos, n, &evlist->entries, node) {
+		list_del_init(&pos->node);
+		perf_evsel__delete(pos);
+	}
+
+	evlist->nr_entries = 0;
+}
+
+void perf_evlist__delete(struct perf_evlist *evlist)
+{
+	perf_evlist__purge(evlist);
+	free(evlist);
+}
+
+void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry)
+{
+	list_add_tail(&entry->node, &evlist->entries);
+	++evlist->nr_entries;
+}
+
+int perf_evlist__add_default(struct perf_evlist *evlist)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+	};
+	struct perf_evsel *evsel = perf_evsel__new(&attr, 0);
+
+	if (evsel == NULL)
+		return -ENOMEM;
+
+	perf_evlist__add(evlist, evsel);
+	return 0;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
new file mode 100644
index 0000000..48db91a
--- /dev/null
+++ b/tools/perf/util/evlist.h
@@ -0,0 +1,19 @@
+#ifndef __PERF_EVLIST_H
+#define __PERF_EVLIST_H 1
+
+#include <linux/list.h>
+
+struct perf_evlist {
+	struct list_head entries;
+	int		 nr_entries;
+};
+
+struct perf_evsel;
+
+struct perf_evlist *perf_evlist__new(void);
+void perf_evlist__delete(struct perf_evlist *evlist);
+
+void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
+int perf_evlist__add_default(struct perf_evlist *evlist);
+
+#endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index f6a929e..f0138d4 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -8,6 +8,7 @@
 #include <linux/list.h>
 #include <linux/kernel.h>
 
+#include "evlist.h"
 #include "util.h"
 #include "header.h"
 #include "../perf.h"
@@ -428,7 +429,8 @@ static bool perf_session__read_build_ids(struct perf_session *self, bool with_hi
 	return ret;
 }
 
-static int perf_header__adds_write(struct perf_header *self, int fd)
+static int perf_header__adds_write(struct perf_header *self,
+				   struct perf_evlist *evlist, int fd)
 {
 	int nr_sections;
 	struct perf_session *session;
@@ -463,7 +465,7 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
 
 		/* Write trace info */
 		trace_sec->offset = lseek(fd, 0, SEEK_CUR);
-		read_tracing_data(fd, &evsel_list);
+		read_tracing_data(fd, &evlist->entries);
 		trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset;
 	}
 
@@ -513,7 +515,8 @@ int perf_header__write_pipe(int fd)
 	return 0;
 }
 
-int perf_header__write(struct perf_header *self, int fd, bool at_exit)
+int perf_header__write(struct perf_header *self, struct perf_evlist *evlist,
+		       int fd, bool at_exit)
 {
 	struct perf_file_header f_header;
 	struct perf_file_attr   f_attr;
@@ -566,7 +569,7 @@ int perf_header__write(struct perf_header *self, int fd, bool at_exit)
 	self->data_offset = lseek(fd, 0, SEEK_CUR);
 
 	if (at_exit) {
-		err = perf_header__adds_write(self, fd);
+		err = perf_header__adds_write(self, evlist, fd);
 		if (err < 0)
 			return err;
 	}
@@ -1133,7 +1136,7 @@ int event__process_event_type(event_t *self,
 	return 0;
 }
 
-int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
+int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
 				   event__handler_t process,
 				   struct perf_session *session __unused)
 {
@@ -1144,7 +1147,7 @@ int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
 	memset(&ev, 0, sizeof(ev));
 
 	ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA;
-	size = read_tracing_data_size(fd, pattrs);
+	size = read_tracing_data_size(fd, &evlist->entries);
 	if (size <= 0)
 		return size;
 	aligned_size = ALIGN(size, sizeof(u64));
@@ -1154,7 +1157,7 @@ int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
 
 	process(&ev, NULL, session);
 
-	err = read_tracing_data(fd, pattrs);
+	err = read_tracing_data(fd, &evlist->entries);
 	write_padded(fd, NULL, 0, padding);
 
 	return aligned_size;
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 33f16be..65afd7f 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -65,8 +65,11 @@ struct perf_header {
 int perf_header__init(struct perf_header *self);
 void perf_header__exit(struct perf_header *self);
 
+struct perf_evlist;
+
 int perf_header__read(struct perf_session *session, int fd);
-int perf_header__write(struct perf_header *self, int fd, bool at_exit);
+int perf_header__write(struct perf_header *self, struct perf_evlist *evlist,
+		       int fd, bool at_exit);
 int perf_header__write_pipe(int fd);
 
 int perf_header__add_attr(struct perf_header *self,
@@ -113,7 +116,7 @@ int event__synthesize_event_types(event__handler_t process,
 int event__process_event_type(event_t *self,
 			      struct perf_session *session);
 
-int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
+int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
 				   event__handler_t process,
 				   struct perf_session *session);
 int event__process_tracing_data(event_t *self,
diff --git a/tools/perf/util/include/linux/list.h b/tools/perf/util/include/linux/list.h
index f5ca26e..356c7e4 100644
--- a/tools/perf/util/include/linux/list.h
+++ b/tools/perf/util/include/linux/list.h
@@ -1,3 +1,4 @@
+#include <linux/kernel.h>
 #include "../../../../include/linux/list.h"
 
 #ifndef PERF_LIST_H
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 135f69b..d3086ce 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1,6 +1,7 @@
 #include "../../../include/linux/hw_breakpoint.h"
 #include "util.h"
 #include "../perf.h"
+#include "evlist.h"
 #include "evsel.h"
 #include "parse-options.h"
 #include "parse-events.h"
@@ -11,10 +12,6 @@
 #include "header.h"
 #include "debugfs.h"
 
-int				nr_counters;
-
-LIST_HEAD(evsel_list);
-
 struct event_symbol {
 	u8		type;
 	u64		config;
@@ -778,8 +775,9 @@ modifier:
 	return ret;
 }
 
-int parse_events(const struct option *opt __used, const char *str, int unset __used)
+int parse_events(const struct option *opt, const char *str, int unset __used)
 {
+	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
 	struct perf_event_attr attr;
 	enum event_result ret;
 
@@ -794,12 +792,10 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
 
 		if (ret != EVT_HANDLED_ALL) {
 			struct perf_evsel *evsel;
-			evsel = perf_evsel__new(&attr,
-						nr_counters);
+			evsel = perf_evsel__new(&attr, evlist->nr_entries);
 			if (evsel == NULL)
 				return -1;
-			list_add_tail(&evsel->node, &evsel_list);
-			++nr_counters;
+			perf_evlist__add(evlist, evsel);
 		}
 
 		if (*str == 0)
@@ -813,13 +809,14 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
 	return 0;
 }
 
-int parse_filter(const struct option *opt __used, const char *str,
+int parse_filter(const struct option *opt, const char *str,
 		 int unset __used)
 {
+	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
 	struct perf_evsel *last = NULL;
 
-	if (!list_empty(&evsel_list))
-		last = list_entry(evsel_list.prev, struct perf_evsel, node);
+	if (evlist->nr_entries > 0)
+		last = list_entry(evlist->entries.prev, struct perf_evsel, node);
 
 	if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) {
 		fprintf(stderr,
@@ -981,33 +978,3 @@ void print_events(void)
 
 	exit(129);
 }
-
-int perf_evsel_list__create_default(void)
-{
-	struct perf_evsel *evsel;
-	struct perf_event_attr attr;
-
-	memset(&attr, 0, sizeof(attr));
-	attr.type = PERF_TYPE_HARDWARE;
-	attr.config = PERF_COUNT_HW_CPU_CYCLES;
-
-	evsel = perf_evsel__new(&attr, 0);
-
-	if (evsel == NULL)
-		return -ENOMEM;
-
-	list_add(&evsel->node, &evsel_list);
-	++nr_counters;
-	return 0;
-}
-
-void perf_evsel_list__delete(void)
-{
-	struct perf_evsel *pos, *n;
-
-	list_for_each_entry_safe(pos, n, &evsel_list, node) {
-		list_del_init(&pos->node);
-		perf_evsel__delete(pos);
-	}
-	nr_counters = 0;
-}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 458e3ec..cf7e94a 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -9,11 +9,6 @@
 struct list_head;
 struct perf_evsel;
 
-extern struct list_head evsel_list;
-
-int perf_evsel_list__create_default(void);
-void perf_evsel_list__delete(void);
-
 struct option;
 
 struct tracepoint_path {
@@ -25,8 +20,6 @@ struct tracepoint_path {
 extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
 extern bool have_tracepoints(struct list_head *evlist);
 
-extern int			nr_counters;
-
 const char *event_name(struct perf_evsel *event);
 extern const char *__event_name(int type, u64 config);
 
-- 
cgit v0.10.2


From 5c581041cf97aa7980b442de81ddea8273d6dcde Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Jan 2011 22:30:02 -0200
Subject: perf evlist: Adopt the pollfd array

Allocating just the space needed for nr_cpus * nr_threads * nr_evsels,
not the MAX_NR_CPUS and counters.

LKML-Reference: <new-submission>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 252ace8..1614d89 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -72,9 +72,6 @@ static struct perf_evlist	*evsel_list;
 static long			samples				=      0;
 static u64			bytes_written			=      0;
 
-static struct pollfd		*event_array;
-
-static int			nr_poll				=      0;
 static int			nr_cpu				=      0;
 
 static int			file_new			=      1;
@@ -432,9 +429,9 @@ try_again:
 				exit(-1);
 			}
 
-			event_array[nr_poll].fd = FD(evsel, nr_cpu, thread_index);
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
+			evlist->pollfd[evlist->nr_fds].fd = FD(evsel, nr_cpu, thread_index);
+			evlist->pollfd[evlist->nr_fds].events = POLLIN;
+			evlist->nr_fds++;
 		}
 
 		if (filter != NULL) {
@@ -793,7 +790,7 @@ static int __cmd_record(int argc, const char **argv)
 		if (hits == samples) {
 			if (done)
 				break;
-			err = poll(event_array, nr_poll, -1);
+			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
 			waking++;
 		}
 
@@ -948,9 +945,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		if (perf_header__push_event(pos->attr.config, event_name(pos)))
 			goto out_free_fd;
 	}
-	event_array = malloc((sizeof(struct pollfd) * MAX_NR_CPUS *
-			      MAX_COUNTERS * threads->nr));
-	if (!event_array)
+
+	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0)
 		goto out_free_fd;
 
 	if (user_interval != ULLONG_MAX)
@@ -968,13 +964,11 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	} else {
 		fprintf(stderr, "frequency and count are zero, aborting\n");
 		err = -EINVAL;
-		goto out_free_event_array;
+		goto out_free_fd;
 	}
 
 	err = __cmd_record(argc, argv);
 
-out_free_event_array:
-	free(event_array);
 out_free_fd:
 	thread_map__delete(threads);
 	threads = NULL;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 216b62e..1bc4652 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1193,8 +1193,6 @@ static void perf_session__mmap_read_counter(struct perf_session *self,
 	md->prev = old;
 }
 
-static struct pollfd *event_array;
-
 static void perf_session__mmap_read(struct perf_session *self)
 {
 	struct perf_evsel *counter;
@@ -1212,10 +1210,10 @@ static void perf_session__mmap_read(struct perf_session *self)
 	}
 }
 
-int nr_poll;
 int group_fd;
 
-static void start_counter(int i, struct perf_evsel *evsel)
+static void start_counter(int i, struct perf_evlist *evlist,
+			  struct perf_evsel *evsel)
 {
 	struct xyarray *mmap_array = evsel->priv;
 	struct mmap_data *mm;
@@ -1281,9 +1279,9 @@ try_again:
 		if (group && group_fd == -1)
 			group_fd = FD(evsel, i, thread_index);
 
-		event_array[nr_poll].fd = FD(evsel, i, thread_index);
-		event_array[nr_poll].events = POLLIN;
-		nr_poll++;
+		evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index);
+		evlist->pollfd[evlist->nr_fds].events = POLLIN;
+		evlist->nr_fds++;
 
 		mm = xyarray__entry(mmap_array, i, thread_index);
 		mm->prev = 0;
@@ -1316,11 +1314,11 @@ static int __cmd_top(void)
 	for (i = 0; i < cpus->nr; i++) {
 		group_fd = -1;
 		list_for_each_entry(counter, &evsel_list->entries, node)
-			start_counter(i, counter);
+			start_counter(i, evsel_list, counter);
 	}
 
 	/* Wait for a minimal set of events before starting the snapshot */
-	poll(&event_array[0], nr_poll, 100);
+	poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
 
 	perf_session__mmap_read(session);
 
@@ -1345,7 +1343,7 @@ static int __cmd_top(void)
 		perf_session__mmap_read(session);
 
 		if (hits == samples)
-			ret = poll(event_array, nr_poll, 100);
+			ret = poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
 	}
 
 	return 0;
@@ -1426,11 +1424,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(top_usage, options);
 	}
 
-	event_array = malloc((sizeof(struct pollfd) *
-			      MAX_NR_CPUS * MAX_COUNTERS * threads->nr));
-	if (!event_array)
-		return -ENOMEM;
-
 	/* CPU and PID are mutually exclusive */
 	if (target_tid > 0 && cpu_list) {
 		printf("WARNING: PID switch overriding CPU\n");
@@ -1480,6 +1473,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		pos->attr.sample_period = default_interval;
 	}
 
+	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0)
+		goto out_free_fd;
+
 	sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
 
 	symbol_conf.priv_size = (sizeof(struct sym_entry) +
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 7b4faec..2abf949 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1,3 +1,4 @@
+#include <poll.h>
 #include "evlist.h"
 #include "evsel.h"
 #include "util.h"
@@ -28,6 +29,7 @@ static void perf_evlist__purge(struct perf_evlist *evlist)
 void perf_evlist__delete(struct perf_evlist *evlist)
 {
 	perf_evlist__purge(evlist);
+	free(evlist->pollfd);
 	free(evlist);
 }
 
@@ -51,3 +53,10 @@ int perf_evlist__add_default(struct perf_evlist *evlist)
 	perf_evlist__add(evlist, evsel);
 	return 0;
 }
+
+int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads)
+{
+	int nfds = ncpus * nthreads * evlist->nr_entries;
+	evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
+	return evlist->pollfd != NULL ? 0 : -ENOMEM;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 48db91a..a7d7e12 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -3,9 +3,13 @@
 
 #include <linux/list.h>
 
+struct pollfd;
+
 struct perf_evlist {
 	struct list_head entries;
 	int		 nr_entries;
+	int		 nr_fds;
+	struct pollfd	 *pollfd;
 };
 
 struct perf_evsel;
@@ -16,4 +20,6 @@ void perf_evlist__delete(struct perf_evlist *evlist);
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
 int perf_evlist__add_default(struct perf_evlist *evlist);
 
+int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads);
+
 #endif /* __PERF_EVLIST_H */
-- 
cgit v0.10.2


From f08199d314458610d4ca52f8e86e0a4ec7a7bc54 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Jan 2011 23:42:19 -0200
Subject: perf evsel: Support event groups

The perf_evsel__open now have an extra boolean argument specifying if
event grouping is desired.

The first file descriptor created on a CPU becomes the group leader.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index da90902..b5fe522 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -169,7 +169,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 				    PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide)
-		return perf_evsel__open_per_cpu(evsel, cpus);
+		return perf_evsel__open_per_cpu(evsel, cpus, false);
 
 	attr->inherit = !no_inherit;
 	if (target_pid == -1 && target_tid == -1) {
@@ -177,7 +177,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 		attr->enable_on_exec = 1;
 	}
 
-	return perf_evsel__open_per_thread(evsel, threads);
+	return perf_evsel__open_per_thread(evsel, threads, false);
 }
 
 /*
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 5dcdba6..4282d67 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -289,7 +289,7 @@ static int test__open_syscall_event(void)
 		goto out_thread_map_delete;
 	}
 
-	if (perf_evsel__open_per_thread(evsel, threads) < 0) {
+	if (perf_evsel__open_per_thread(evsel, threads, false) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
 			 strerror(errno));
@@ -364,7 +364,7 @@ static int test__open_syscall_event_on_all_cpus(void)
 		goto out_thread_map_delete;
 	}
 
-	if (perf_evsel__open(evsel, cpus, threads) < 0) {
+	if (perf_evsel__open(evsel, cpus, threads, false) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
 			 strerror(errno));
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index f5cfed6..da473ec 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -128,7 +128,7 @@ int __perf_evsel__read(struct perf_evsel *evsel,
 }
 
 static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-			      struct thread_map *threads)
+			      struct thread_map *threads, bool group)
 {
 	int cpu, thread;
 
@@ -137,12 +137,18 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 		return -1;
 
 	for (cpu = 0; cpu < cpus->nr; cpu++) {
+		int group_fd = -1;
+
 		for (thread = 0; thread < threads->nr; thread++) {
 			FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
 								     threads->map[thread],
-								     cpus->map[cpu], -1, 0);
+								     cpus->map[cpu],
+								     group_fd, 0);
 			if (FD(evsel, cpu, thread) < 0)
 				goto out_close;
+
+			if (group && group_fd == -1)
+				group_fd = FD(evsel, cpu, thread);
 		}
 	}
 
@@ -175,10 +181,9 @@ static struct {
 	.threads = { -1, },
 };
 
-int perf_evsel__open(struct perf_evsel *evsel,
-		     struct cpu_map *cpus, struct thread_map *threads)
+int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
+		     struct thread_map *threads, bool group)
 {
-
 	if (cpus == NULL) {
 		/* Work around old compiler warnings about strict aliasing */
 		cpus = &empty_cpu_map.map;
@@ -187,15 +192,17 @@ int perf_evsel__open(struct perf_evsel *evsel,
 	if (threads == NULL)
 		threads = &empty_thread_map.map;
 
-	return __perf_evsel__open(evsel, cpus, threads);
+	return __perf_evsel__open(evsel, cpus, threads, group);
 }
 
-int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus)
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
+			     struct cpu_map *cpus, bool group)
 {
-	return __perf_evsel__open(evsel, cpus, &empty_thread_map.map);
+	return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group);
 }
 
-int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads)
+int perf_evsel__open_per_thread(struct perf_evsel *evsel,
+				struct thread_map *threads, bool group)
 {
-	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads);
+	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group);
 }
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index b2d755f..0962b50 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -45,10 +45,12 @@ int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
 void perf_evsel__free_fd(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 
-int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus);
-int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads);
-int perf_evsel__open(struct perf_evsel *evsel, 
-		     struct cpu_map *cpus, struct thread_map *threads);
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
+			     struct cpu_map *cpus, bool group);
+int perf_evsel__open_per_thread(struct perf_evsel *evsel,
+				struct thread_map *threads, bool group);
+int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
+		     struct thread_map *threads, bool group);
 
 #define perf_evsel__match(evsel, t, c)		\
 	(evsel->attr.type == PERF_TYPE_##t &&	\
-- 
cgit v0.10.2


From 9d04f1781772e11bd58806391555fc23ebb54377 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 00:08:18 -0200
Subject: perf evsel: Allow specifying if the inherit bit should be set

As this is a per-cpu attribute, we can't set it up in advance and use it
for all the calls.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index b5fe522..e2a2d02 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -169,7 +169,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 				    PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide)
-		return perf_evsel__open_per_cpu(evsel, cpus, false);
+		return perf_evsel__open_per_cpu(evsel, cpus, false, false);
 
 	attr->inherit = !no_inherit;
 	if (target_pid == -1 && target_tid == -1) {
@@ -177,7 +177,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 		attr->enable_on_exec = 1;
 	}
 
-	return perf_evsel__open_per_thread(evsel, threads, false);
+	return perf_evsel__open_per_thread(evsel, threads, false, false);
 }
 
 /*
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 4282d67..7287158 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -289,7 +289,7 @@ static int test__open_syscall_event(void)
 		goto out_thread_map_delete;
 	}
 
-	if (perf_evsel__open_per_thread(evsel, threads, false) < 0) {
+	if (perf_evsel__open_per_thread(evsel, threads, false, false) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
 			 strerror(errno));
@@ -364,7 +364,7 @@ static int test__open_syscall_event_on_all_cpus(void)
 		goto out_thread_map_delete;
 	}
 
-	if (perf_evsel__open(evsel, cpus, threads, false) < 0) {
+	if (perf_evsel__open(evsel, cpus, threads, false, false) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
 			 strerror(errno));
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index da473ec..82a0053 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -128,7 +128,7 @@ int __perf_evsel__read(struct perf_evsel *evsel,
 }
 
 static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-			      struct thread_map *threads, bool group)
+			      struct thread_map *threads, bool group, bool inherit)
 {
 	int cpu, thread;
 
@@ -139,6 +139,8 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 	for (cpu = 0; cpu < cpus->nr; cpu++) {
 		int group_fd = -1;
 
+		evsel->attr.inherit = (cpus->map[cpu] < 0) && inherit;
+
 		for (thread = 0; thread < threads->nr; thread++) {
 			FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
 								     threads->map[thread],
@@ -182,7 +184,7 @@ static struct {
 };
 
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-		     struct thread_map *threads, bool group)
+		     struct thread_map *threads, bool group, bool inherit)
 {
 	if (cpus == NULL) {
 		/* Work around old compiler warnings about strict aliasing */
@@ -192,17 +194,17 @@ int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 	if (threads == NULL)
 		threads = &empty_thread_map.map;
 
-	return __perf_evsel__open(evsel, cpus, threads, group);
+	return __perf_evsel__open(evsel, cpus, threads, group, inherit);
 }
 
 int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
-			     struct cpu_map *cpus, bool group)
+			     struct cpu_map *cpus, bool group, bool inherit)
 {
-	return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group);
+	return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group, inherit);
 }
 
 int perf_evsel__open_per_thread(struct perf_evsel *evsel,
-				struct thread_map *threads, bool group)
+				struct thread_map *threads, bool group, bool inherit)
 {
-	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group);
+	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit);
 }
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 0962b50..1594696 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -46,11 +46,11 @@ void perf_evsel__free_fd(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 
 int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
-			     struct cpu_map *cpus, bool group);
+			     struct cpu_map *cpus, bool group, bool inherit);
 int perf_evsel__open_per_thread(struct perf_evsel *evsel,
-				struct thread_map *threads, bool group);
+				struct thread_map *threads, bool group, bool inherit);
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-		     struct thread_map *threads, bool group);
+		     struct thread_map *threads, bool group, bool inherit);
 
 #define perf_evsel__match(evsel, t, c)		\
 	(evsel->attr.type == PERF_TYPE_##t &&	\
-- 
cgit v0.10.2


From 72cb7013e08dec29631e0447f9496b7bacd3e14b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 10:52:47 -0200
Subject: perf top: Use perf_evsel__open

Now that it handles group_fd and inherit we can use it, sharing it with
stat.

Next step: 'perf record' should use, then move the mmap_array out of
->priv and into perf_evsel, with top and record sharing this, and at the
same time, write a 'perf test' stress test.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 1bc4652..15d89be 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1210,39 +1210,50 @@ static void perf_session__mmap_read(struct perf_session *self)
 	}
 }
 
-int group_fd;
-
 static void start_counter(int i, struct perf_evlist *evlist,
 			  struct perf_evsel *evsel)
 {
 	struct xyarray *mmap_array = evsel->priv;
 	struct mmap_data *mm;
-	struct perf_event_attr *attr;
-	int cpu = -1;
 	int thread_index;
 
-	if (target_tid == -1)
-		cpu = cpus->map[i];
-
-	attr = &evsel->attr;
+	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
+		assert(FD(evsel, i, thread_index) >= 0);
+		fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK);
 
-	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+		evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index);
+		evlist->pollfd[evlist->nr_fds].events = POLLIN;
+		evlist->nr_fds++;
 
-	if (freq) {
-		attr->sample_type	|= PERF_SAMPLE_PERIOD;
-		attr->freq		= 1;
-		attr->sample_freq	= freq;
+		mm = xyarray__entry(mmap_array, i, thread_index);
+		mm->prev = 0;
+		mm->mask = mmap_pages*page_size - 1;
+		mm->base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0);
+		if (mm->base == MAP_FAILED)
+			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 	}
+}
+
+static void start_counters(struct perf_evlist *evlist)
+{
+	struct perf_evsel *counter;
+	int i;
 
-	attr->inherit		= (cpu < 0) && inherit;
-	attr->mmap		= 1;
+	list_for_each_entry(counter, &evlist->entries, node) {
+		struct perf_event_attr *attr = &counter->attr;
 
-	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
-try_again:
-		FD(evsel, i, thread_index) = sys_perf_event_open(attr,
-				threads->map[thread_index], cpu, group_fd, 0);
+		attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+
+		if (freq) {
+			attr->sample_type |= PERF_SAMPLE_PERIOD;
+			attr->freq	  = 1;
+			attr->sample_freq = freq;
+		}
 
-		if (FD(evsel, i, thread_index) < 0) {
+		attr->mmap = 1;
+try_again:
+		if (perf_evsel__open(counter, cpus, threads, group, inherit) < 0) {
 			int err = errno;
 
 			if (err == EPERM || err == EACCES)
@@ -1254,8 +1265,8 @@ try_again:
 			 * based cpu-clock-tick sw counter, which
 			 * is always available even if no PMU support:
 			 */
-			if (attr->type == PERF_TYPE_HARDWARE
-					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+			if (attr->type == PERF_TYPE_HARDWARE &&
+			    attr->config == PERF_COUNT_HW_CPU_CYCLES) {
 
 				if (verbose)
 					warning(" ... trying to fall back to cpu-clock-ticks\n");
@@ -1265,39 +1276,24 @@ try_again:
 				goto try_again;
 			}
 			printf("\n");
-			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
-					FD(evsel, i, thread_index), strerror(err));
+			error("sys_perf_event_open() syscall returned with %d "
+			      "(%s).  /bin/dmesg may provide additional information.\n",
+			      err, strerror(err));
 			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
 			exit(-1);
 		}
-		assert(FD(evsel, i, thread_index) >= 0);
-		fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK);
-
-		/*
-		 * First counter acts as the group leader:
-		 */
-		if (group && group_fd == -1)
-			group_fd = FD(evsel, i, thread_index);
-
-		evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index);
-		evlist->pollfd[evlist->nr_fds].events = POLLIN;
-		evlist->nr_fds++;
+	}
 
-		mm = xyarray__entry(mmap_array, i, thread_index);
-		mm->prev = 0;
-		mm->mask = mmap_pages*page_size - 1;
-		mm->base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0);
-		if (mm->base == MAP_FAILED)
-			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+	for (i = 0; i < cpus->nr; i++) {
+		list_for_each_entry(counter, &evlist->entries, node)
+			start_counter(i, evsel_list, counter);
 	}
 }
 
 static int __cmd_top(void)
 {
 	pthread_t thread;
-	struct perf_evsel *counter;
-	int i, ret;
+	int ret;
 	/*
 	 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
 	 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
@@ -1311,11 +1307,7 @@ static int __cmd_top(void)
 	else
 		event__synthesize_threads(event__process, session);
 
-	for (i = 0; i < cpus->nr; i++) {
-		group_fd = -1;
-		list_for_each_entry(counter, &evsel_list->entries, node)
-			start_counter(i, evsel_list, counter);
-	}
+	start_counters(evsel_list);
 
 	/* Wait for a minimal set of events before starting the snapshot */
 	poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
-- 
cgit v0.10.2


From dd7927f4f8ee75b032ff15aeef4bda49719a443a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 14:28:51 -0200
Subject: perf record: Use perf_evsel__open

Now its time to factor out the mmap handling bits into the perf_evsel
class.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1614d89..ec43f2e 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -72,8 +72,6 @@ static struct perf_evlist	*evsel_list;
 static long			samples				=      0;
 static u64			bytes_written			=      0;
 
-static int			nr_cpu				=      0;
-
 static int			file_new			=      1;
 static off_t			post_processing_offset;
 
@@ -208,8 +206,6 @@ static void sig_atexit(void)
 	kill(getpid(), signr);
 }
 
-static int group_fd;
-
 static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
 {
 	struct perf_header_attr *h_attr;
@@ -234,7 +230,6 @@ static void create_counter(struct perf_evlist *evlist,
 	char *filter = evsel->filter;
 	struct perf_event_attr *attr = &evsel->attr;
 	struct perf_header_attr *h_attr;
-	int track = !evsel->idx; /* only the first counter needs these */
 	int thread_index;
 	int ret;
 	struct {
@@ -243,19 +238,77 @@ static void create_counter(struct perf_evlist *evlist,
 		u64 time_running;
 		u64 id;
 	} read_data;
-	/*
- 	 * Check if parse_single_tracepoint_event has already asked for
- 	 * PERF_SAMPLE_TIME.
- 	 *
-	 * XXX this is kludgy but short term fix for problems introduced by
-	 * eac23d1c that broke 'perf script' by having different sample_types
-	 * when using multiple tracepoint events when we use a perf binary
-	 * that tries to use sample_id_all on an older kernel.
- 	 *
- 	 * We need to move counter creation to perf_session, support
- 	 * different sample_types, etc.
- 	 */
-	bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
+
+	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
+		h_attr = get_header_attr(attr, evsel->idx);
+		if (h_attr == NULL)
+			die("nomem\n");
+
+		if (!file_new) {
+			if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
+				fprintf(stderr, "incompatible append\n");
+				exit(-1);
+			}
+		}
+
+		if (read(FD(evsel, cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
+			perror("Unable to read perf file descriptor");
+			exit(-1);
+		}
+
+		if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
+			pr_warning("Not enough memory to add id\n");
+			exit(-1);
+		}
+
+		assert(FD(evsel, cpu, thread_index) >= 0);
+		fcntl(FD(evsel, cpu, thread_index), F_SETFL, O_NONBLOCK);
+
+		if (evsel->idx || thread_index) {
+			struct perf_evsel *first;
+			first = list_entry(evlist->entries.next, struct perf_evsel, node);
+			ret = ioctl(FD(evsel, cpu, thread_index),
+				    PERF_EVENT_IOC_SET_OUTPUT,
+				    FD(first, cpu, 0));
+			if (ret) {
+				error("failed to set output: %d (%s)\n", errno,
+						strerror(errno));
+				exit(-1);
+			}
+		} else {
+			mmap_array[cpu].prev = 0;
+			mmap_array[cpu].mask = mmap_pages*page_size - 1;
+			mmap_array[cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, cpu, thread_index), 0);
+			if (mmap_array[cpu].base == MAP_FAILED) {
+				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+				exit(-1);
+			}
+
+			evlist->pollfd[evlist->nr_fds].fd = FD(evsel, cpu, thread_index);
+			evlist->pollfd[evlist->nr_fds].events = POLLIN;
+			evlist->nr_fds++;
+		}
+
+		if (filter != NULL) {
+			ret = ioctl(FD(evsel, cpu, thread_index),
+				    PERF_EVENT_IOC_SET_FILTER, filter);
+			if (ret) {
+				error("failed to set filter with %d (%s)\n", errno,
+						strerror(errno));
+				exit(-1);
+			}
+		}
+	}
+
+	if (!sample_type)
+		sample_type = attr->sample_type;
+}
+
+static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
+{
+	struct perf_event_attr *attr = &evsel->attr;
+	int track = !evsel->idx; /* only the first counter needs these */
 
 	attr->read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
 				  PERF_FORMAT_TOTAL_TIME_RUNNING |
@@ -315,19 +368,39 @@ static void create_counter(struct perf_evlist *evlist,
 
 	attr->mmap		= track;
 	attr->comm		= track;
-	attr->inherit		= !no_inherit;
+
 	if (target_pid == -1 && target_tid == -1 && !system_wide) {
 		attr->disabled = 1;
 		attr->enable_on_exec = 1;
 	}
-retry_sample_id:
-	attr->sample_id_all = sample_id_all_avail ? 1 : 0;
+}
 
-	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
-try_again:
-		FD(evsel, nr_cpu, thread_index) = sys_perf_event_open(attr, threads->map[thread_index], cpu, group_fd, 0);
+static void open_counters(struct perf_evlist *evlist)
+{
+	struct perf_evsel *pos;
+	int cpu;
+
+	list_for_each_entry(pos, &evlist->entries, node) {
+		struct perf_event_attr *attr = &pos->attr;
+		/*
+		 * Check if parse_single_tracepoint_event has already asked for
+		 * PERF_SAMPLE_TIME.
+		 *
+		 * XXX this is kludgy but short term fix for problems introduced by
+		 * eac23d1c that broke 'perf script' by having different sample_types
+		 * when using multiple tracepoint events when we use a perf binary
+		 * that tries to use sample_id_all on an older kernel.
+		 *
+		 * We need to move counter creation to perf_session, support
+		 * different sample_types, etc.
+		 */
+		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
 
-		if (FD(evsel, nr_cpu, thread_index) < 0) {
+		config_attr(pos, evlist);
+retry_sample_id:
+		attr->sample_id_all = sample_id_all_avail ? 1 : 0;
+try_again:
+		if (perf_evsel__open(pos, cpus, threads, group, !no_inherit) < 0) {
 			int err = errno;
 
 			if (err == EPERM || err == EACCES)
@@ -364,7 +437,7 @@ try_again:
 			}
 			printf("\n");
 			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
-			      FD(evsel, nr_cpu, thread_index), strerror(err));
+			      err, strerror(err));
 
 #if defined(__i386__) || defined(__x86_64__)
 			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
@@ -375,90 +448,13 @@ try_again:
 #endif
 
 			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
-			exit(-1);
-		}
-
-		h_attr = get_header_attr(attr, evsel->idx);
-		if (h_attr == NULL)
-			die("nomem\n");
-
-		if (!file_new) {
-			if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
-				fprintf(stderr, "incompatible append\n");
-				exit(-1);
-			}
-		}
-
-		if (read(FD(evsel, nr_cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
-			perror("Unable to read perf file descriptor");
-			exit(-1);
-		}
-
-		if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
-			pr_warning("Not enough memory to add id\n");
-			exit(-1);
-		}
-
-		assert(FD(evsel, nr_cpu, thread_index) >= 0);
-		fcntl(FD(evsel, nr_cpu, thread_index), F_SETFL, O_NONBLOCK);
-
-		/*
-		 * First counter acts as the group leader:
-		 */
-		if (group && group_fd == -1)
-			group_fd = FD(evsel, nr_cpu, thread_index);
-
-		if (evsel->idx || thread_index) {
-			struct perf_evsel *first;
-			first = list_entry(evlist->entries.next, struct perf_evsel, node);
-			ret = ioctl(FD(evsel, nr_cpu, thread_index),
-				    PERF_EVENT_IOC_SET_OUTPUT,
-				    FD(first, nr_cpu, 0));
-			if (ret) {
-				error("failed to set output: %d (%s)\n", errno,
-						strerror(errno));
-				exit(-1);
-			}
-		} else {
-			mmap_array[nr_cpu].prev = 0;
-			mmap_array[nr_cpu].mask = mmap_pages*page_size - 1;
-			mmap_array[nr_cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, nr_cpu, thread_index), 0);
-			if (mmap_array[nr_cpu].base == MAP_FAILED) {
-				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-				exit(-1);
-			}
-
-			evlist->pollfd[evlist->nr_fds].fd = FD(evsel, nr_cpu, thread_index);
-			evlist->pollfd[evlist->nr_fds].events = POLLIN;
-			evlist->nr_fds++;
-		}
-
-		if (filter != NULL) {
-			ret = ioctl(FD(evsel, nr_cpu, thread_index),
-				    PERF_EVENT_IOC_SET_FILTER, filter);
-			if (ret) {
-				error("failed to set filter with %d (%s)\n", errno,
-						strerror(errno));
-				exit(-1);
-			}
 		}
 	}
 
-	if (!sample_type)
-		sample_type = attr->sample_type;
-}
-
-static void open_counters(struct perf_evlist *evlist, int cpu)
-{
-	struct perf_evsel *pos;
-
-	group_fd = -1;
-
-	list_for_each_entry(pos, &evlist->entries, node)
-		create_counter(evlist, pos, cpu);
-
-	nr_cpu++;
+	for (cpu = 0; cpu < cpus->nr; ++cpu) {
+		list_for_each_entry(pos, &evlist->entries, node)
+			create_counter(evlist, pos, cpu);
+	}
 }
 
 static int process_buildids(void)
@@ -533,7 +529,7 @@ static void mmap_read_all(void)
 {
 	int i;
 
-	for (i = 0; i < nr_cpu; i++) {
+	for (i = 0; i < cpus->nr; i++) {
 		if (mmap_array[i].base)
 			mmap_read(&mmap_array[i]);
 	}
@@ -673,12 +669,7 @@ static int __cmd_record(int argc, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
-	if (!system_wide && no_inherit && !cpu_list) {
-		open_counters(evsel_list, -1);
-	} else {
-		for (i = 0; i < cpus->nr; i++)
-			open_counters(evsel_list, cpus->map[i]);
-	}
+	open_counters(evsel_list);
 
 	perf_session__set_sample_type(session, sample_type);
 
@@ -795,7 +786,7 @@ static int __cmd_record(int argc, const char **argv)
 		}
 
 		if (done) {
-			for (i = 0; i < nr_cpu; i++) {
+			for (i = 0; i < cpus->nr; i++) {
 				struct perf_evsel *pos;
 
 				list_for_each_entry(pos, &evsel_list->entries, node) {
@@ -933,11 +924,13 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(record_usage, record_options);
 	}
 
-	cpus = cpu_map__new(cpu_list);
-	if (cpus == NULL) {
-		perror("failed to parse CPUs map");
-		return -1;
-	}
+	if (target_tid != -1)
+		cpus = cpu_map__dummy_new();
+	else
+		cpus = cpu_map__new(cpu_list);
+
+	if (cpus == NULL)
+		usage_with_options(record_usage, record_options);
 
 	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
-- 
cgit v0.10.2


From 70082dd92c4b288bd723a77897e2b555f0e63113 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 17:03:24 -0200
Subject: perf evsel: Introduce mmap support

Out of the code in 'perf top'. Record is next in line.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 15d89be..7d723ad 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1095,43 +1095,12 @@ static void event__process_sample(const event_t *self,
 	}
 }
 
-struct mmap_data {
-	void			*base;
-	int			mask;
-	unsigned int		prev;
-};
-
-static int perf_evsel__alloc_mmap_per_thread(struct perf_evsel *evsel,
-					     int ncpus, int nthreads)
-{
-	evsel->priv = xyarray__new(ncpus, nthreads, sizeof(struct mmap_data));
-	return evsel->priv != NULL ? 0 : -ENOMEM;
-}
-
-static void perf_evsel__free_mmap(struct perf_evsel *evsel)
-{
-	xyarray__delete(evsel->priv);
-	evsel->priv = NULL;
-}
-
-static unsigned int mmap_read_head(struct mmap_data *md)
-{
-	struct perf_event_mmap_page *pc = md->base;
-	int head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
-
 static void perf_session__mmap_read_counter(struct perf_session *self,
 					    struct perf_evsel *evsel,
 					    int cpu, int thread_idx)
 {
-	struct xyarray *mmap_array = evsel->priv;
-	struct mmap_data *md = xyarray__entry(mmap_array, cpu, thread_idx);
-	unsigned int head = mmap_read_head(md);
+	struct perf_mmap *md = xyarray__entry(evsel->mmap, cpu, thread_idx);
+	unsigned int head = perf_mmap__read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
 	struct sample_data sample;
@@ -1210,35 +1179,9 @@ static void perf_session__mmap_read(struct perf_session *self)
 	}
 }
 
-static void start_counter(int i, struct perf_evlist *evlist,
-			  struct perf_evsel *evsel)
-{
-	struct xyarray *mmap_array = evsel->priv;
-	struct mmap_data *mm;
-	int thread_index;
-
-	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
-		assert(FD(evsel, i, thread_index) >= 0);
-		fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK);
-
-		evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index);
-		evlist->pollfd[evlist->nr_fds].events = POLLIN;
-		evlist->nr_fds++;
-
-		mm = xyarray__entry(mmap_array, i, thread_index);
-		mm->prev = 0;
-		mm->mask = mmap_pages*page_size - 1;
-		mm->base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0);
-		if (mm->base == MAP_FAILED)
-			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
-	}
-}
-
 static void start_counters(struct perf_evlist *evlist)
 {
 	struct perf_evsel *counter;
-	int i;
 
 	list_for_each_entry(counter, &evlist->entries, node) {
 		struct perf_event_attr *attr = &counter->attr;
@@ -1282,11 +1225,9 @@ try_again:
 			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
 			exit(-1);
 		}
-	}
 
-	for (i = 0; i < cpus->nr; i++) {
-		list_for_each_entry(counter, &evlist->entries, node)
-			start_counter(i, evsel_list, counter);
+		if (perf_evsel__mmap(counter, cpus, threads, mmap_pages, evlist) < 0)
+			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 	}
 }
 
@@ -1453,7 +1394,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(top_usage, options);
 
 	list_for_each_entry(pos, &evsel_list->entries, node) {
-		if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 ||
+		if (perf_evsel__alloc_mmap(pos, cpus->nr, threads->nr) < 0 ||
 		    perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
 			goto out_free_fd;
 		/*
@@ -1485,8 +1426,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 
 	status = __cmd_top();
 out_free_fd:
-	list_for_each_entry(pos, &evsel_list->entries, node)
-		perf_evsel__free_mmap(pos);
 	perf_evlist__delete(evsel_list);
 
 	return status;
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 95aaf56..5fb5e1f 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -94,6 +94,20 @@ void get_term_dimensions(struct winsize *ws);
 #include "util/types.h"
 #include <stdbool.h>
 
+struct perf_mmap {
+	void			*base;
+	int			mask;
+	unsigned int		prev;
+};
+
+static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm)
+{
+	struct perf_event_mmap_page *pc = mm->base;
+	int head = pc->data_head;
+	rmb();
+	return head;
+}
+
 /*
  * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
  * counters in the current task.
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 2abf949..6d41292 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -60,3 +60,11 @@ int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthread
 	evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
 	return evlist->pollfd != NULL ? 0 : -ENOMEM;
 }
+
+void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd)
+{
+	fcntl(fd, F_SETFL, O_NONBLOCK);
+	evlist->pollfd[evlist->nr_fds].fd = fd;
+	evlist->pollfd[evlist->nr_fds].events = POLLIN;
+	evlist->nr_fds++;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index a7d7e12..16bbfcb 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -21,5 +21,6 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
 int perf_evlist__add_default(struct perf_evlist *evlist);
 
 int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads);
+void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
 #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 82a0053..f500695 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1,9 +1,13 @@
 #include "evsel.h"
+#include "evlist.h"
 #include "../perf.h"
 #include "util.h"
 #include "cpumap.h"
 #include "thread.h"
 
+#include <unistd.h>
+#include <sys/mman.h>
+
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
@@ -49,10 +53,32 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 		}
 }
 
+void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+	struct perf_mmap *mm;
+	int cpu, thread;
+
+	for (cpu = 0; cpu < ncpus; cpu++)
+		for (thread = 0; thread < nthreads; ++thread) {
+			mm = xyarray__entry(evsel->mmap, cpu, thread);
+			if (mm->base != NULL) {
+				munmap(mm->base, evsel->mmap_len);
+				mm->base = NULL;
+			}
+		}
+}
+
+int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+	evsel->mmap = xyarray__new(ncpus, nthreads, sizeof(struct perf_mmap));
+	return evsel->mmap != NULL ? 0 : -ENOMEM;
+}
+
 void perf_evsel__delete(struct perf_evsel *evsel)
 {
 	assert(list_empty(&evsel->node));
 	xyarray__delete(evsel->fd);
+	xyarray__delete(evsel->mmap);
 	free(evsel);
 }
 
@@ -208,3 +234,48 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 {
 	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit);
 }
+
+int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus,
+		     struct thread_map *threads, int pages,
+		     struct perf_evlist *evlist)
+{
+	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
+	int mask = pages * page_size - 1, cpu;
+	struct perf_mmap *mm;
+	int thread;
+
+	if (evsel->mmap == NULL &&
+	    perf_evsel__alloc_mmap(evsel, cpus->nr, threads->nr) < 0)
+		return -ENOMEM;
+
+	evsel->mmap_len = (pages + 1) * page_size;
+
+	for (cpu = 0; cpu < cpus->nr; cpu++) {
+		for (thread = 0; thread < threads->nr; thread++) {
+			mm = xyarray__entry(evsel->mmap, cpu, thread);
+			mm->prev = 0;
+			mm->mask = mask;
+			mm->base = mmap(NULL, evsel->mmap_len, PROT_READ,
+					MAP_SHARED, FD(evsel, cpu, thread), 0);
+			if (mm->base == MAP_FAILED)
+				goto out_unmap;
+
+			if (evlist != NULL)
+				 perf_evlist__add_pollfd(evlist, FD(evsel, cpu, thread));
+		}
+	}
+
+	return 0;
+
+out_unmap:
+	do {
+		while (--thread >= 0) {
+			mm = xyarray__entry(evsel->mmap, cpu, thread);
+			munmap(mm->base, evsel->mmap_len);
+			mm->base = NULL;
+		}
+		thread = threads->nr;
+	} while (--cpu >= 0);
+
+	return -1;
+}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 1594696..c8fbef2 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -29,19 +29,23 @@ struct perf_evsel {
 	struct perf_event_attr	attr;
 	char			*filter;
 	struct xyarray		*fd;
+	struct xyarray		*mmap;
 	struct perf_counts	*counts;
+	size_t			mmap_len;
 	int			idx;
 	void			*priv;
 };
 
 struct cpu_map;
 struct thread_map;
+struct perf_evlist;
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
 void perf_evsel__delete(struct perf_evsel *evsel);
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
+int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads);
 void perf_evsel__free_fd(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 
@@ -51,6 +55,10 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 				struct thread_map *threads, bool group, bool inherit);
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 		     struct thread_map *threads, bool group, bool inherit);
+int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus,
+		     struct thread_map *threads, int pages,
+		     struct perf_evlist *evlist);
+void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads);
 
 #define perf_evsel__match(evsel, t, c)		\
 	(evsel->attr.type == PERF_TYPE_##t &&	\
-- 
cgit v0.10.2


From 744bd8aa3c8b43447f689a27872fa95e700b8a4f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 17:07:28 -0200
Subject: perf record: Use struct perf_mmap and helpers

Paving the way to using perf_evsel->mmap, do this to reduce the patch
noise in the next ones.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index ec43f2e..d89e2f1 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -78,26 +78,9 @@ static off_t			post_processing_offset;
 static struct perf_session	*session;
 static const char		*cpu_list;
 
-struct mmap_data {
-	void			*base;
-	unsigned int		mask;
-	unsigned int		prev;
-};
-
-static struct mmap_data		mmap_array[MAX_NR_CPUS];
-
-static unsigned long mmap_read_head(struct mmap_data *md)
-{
-	struct perf_event_mmap_page *pc = md->base;
-	long head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
+static struct perf_mmap		mmap_array[MAX_NR_CPUS];
 
-static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
+static void mmap_write_tail(struct perf_mmap *md, unsigned long tail)
 {
 	struct perf_event_mmap_page *pc = md->base;
 
@@ -136,9 +119,9 @@ static int process_synthesized_event(event_t *event,
 	return 0;
 }
 
-static void mmap_read(struct mmap_data *md)
+static void mmap_read(struct perf_mmap *md)
 {
-	unsigned int head = mmap_read_head(md);
+	unsigned int head = perf_mmap__read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
 	unsigned long size;
-- 
cgit v0.10.2


From 115d2d8963a426670ac3ce983fc4c4e001703943 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 17:11:53 -0200
Subject: perf record: Move perf_mmap__write_tail to perf.h

Close to perf_mmap__read_head() and the perf_mmap struct definition.
This is useful for any recorder, and we will need it in 'perf test'.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index d89e2f1..109f3b2 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -80,17 +80,6 @@ static const char		*cpu_list;
 
 static struct perf_mmap		mmap_array[MAX_NR_CPUS];
 
-static void mmap_write_tail(struct perf_mmap *md, unsigned long tail)
-{
-	struct perf_event_mmap_page *pc = md->base;
-
-	/*
-	 * ensure all reads are done before we write the tail out.
-	 */
-	/* mb(); */
-	pc->data_tail = tail;
-}
-
 static void advance_output(size_t size)
 {
 	bytes_written += size;
@@ -165,7 +154,7 @@ static void mmap_read(struct perf_mmap *md)
 	write_output(buf, size);
 
 	md->prev = old;
-	mmap_write_tail(md, old);
+	perf_mmap__write_tail(md, old);
 }
 
 static volatile int done = 0;
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 5fb5e1f..a5fc660 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -108,6 +108,18 @@ static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm)
 	return head;
 }
 
+static inline void perf_mmap__write_tail(struct perf_mmap *md,
+					 unsigned long tail)
+{
+	struct perf_event_mmap_page *pc = md->base;
+
+	/*
+	 * ensure all reads are done before we write the tail out.
+	 */
+	/* mb(); */
+	pc->data_tail = tail;
+}
+
 /*
  * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
  * counters in the current task.
-- 
cgit v0.10.2


From 70db7533caef02350ec8d6852e589491bca3a951 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 22:39:13 -0200
Subject: perf evlist: Move the mmap array from perf_evsel

Adopting the new model used in 'perf record', where we don't have a map
per thread per cpu, instead we have an mmap per cpu, established on the
first fd for that cpu and ask the kernel using the
PERF_EVENT_IOC_SET_OUTPUT ioctl to send events for the other fds on that
cpu for the one with the mmap.

The methods moved from perf_evsel to perf_evlist, but for easing review
they were modified in place, in evsel.c, the next patch will move the
migrated methods to evlist.c.

With this 'perf top' now uses the same mmap model used by 'perf record'
and the next patches will make 'perf record' use these new routines,
establishing a common codebase for both tools.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 7d723ad..df85c1f 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -78,7 +78,7 @@ static struct cpu_map		*cpus;
 static int			realtime_prio			=      0;
 static bool			group				=  false;
 static unsigned int		page_size;
-static unsigned int		mmap_pages			=     16;
+static unsigned int		mmap_pages			=    128;
 static int			freq				=   1000; /* 1 KHz */
 
 static int			delay_secs			=      2;
@@ -991,8 +991,7 @@ static int symbol_filter(struct map *map, struct symbol *sym)
 
 static void event__process_sample(const event_t *self,
 				  struct sample_data *sample,
-				  struct perf_session *session,
-				  struct perf_evsel *evsel)
+				  struct perf_session *session)
 {
 	u64 ip = self->ip.ip;
 	struct sym_entry *syme;
@@ -1085,8 +1084,12 @@ static void event__process_sample(const event_t *self,
 
 	syme = symbol__priv(al.sym);
 	if (!syme->skip) {
-		syme->count[evsel->idx]++;
+		struct perf_evsel *evsel;
+
 		syme->origin = origin;
+		evsel = perf_evlist__id2evsel(evsel_list, sample->id);
+		assert(evsel != NULL);
+		syme->count[evsel->idx]++;
 		record_precise_ip(syme, evsel->idx, ip);
 		pthread_mutex_lock(&active_symbols_lock);
 		if (list_empty(&syme->node) || !syme->node.next)
@@ -1095,11 +1098,9 @@ static void event__process_sample(const event_t *self,
 	}
 }
 
-static void perf_session__mmap_read_counter(struct perf_session *self,
-					    struct perf_evsel *evsel,
-					    int cpu, int thread_idx)
+static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
 {
-	struct perf_mmap *md = xyarray__entry(evsel->mmap, cpu, thread_idx);
+	struct perf_mmap *md = &evsel_list->mmap[cpu];
 	unsigned int head = perf_mmap__read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
@@ -1153,7 +1154,7 @@ static void perf_session__mmap_read_counter(struct perf_session *self,
 
 		event__parse_sample(event, self, &sample);
 		if (event->header.type == PERF_RECORD_SAMPLE)
-			event__process_sample(event, &sample, self, evsel);
+			event__process_sample(event, &sample, self);
 		else
 			event__process(event, &sample, self);
 		old += size;
@@ -1164,19 +1165,10 @@ static void perf_session__mmap_read_counter(struct perf_session *self,
 
 static void perf_session__mmap_read(struct perf_session *self)
 {
-	struct perf_evsel *counter;
-	int i, thread_index;
-
-	for (i = 0; i < cpus->nr; i++) {
-		list_for_each_entry(counter, &evsel_list->entries, node) {
-			for (thread_index = 0;
-				thread_index < threads->nr;
-				thread_index++) {
-				perf_session__mmap_read_counter(self,
-					counter, i, thread_index);
-			}
-		}
-	}
+	int i;
+
+	for (i = 0; i < cpus->nr; i++)
+		perf_session__mmap_read_cpu(self, i);
 }
 
 static void start_counters(struct perf_evlist *evlist)
@@ -1194,6 +1186,11 @@ static void start_counters(struct perf_evlist *evlist)
 			attr->sample_freq = freq;
 		}
 
+		if (evlist->nr_entries > 1) {
+			attr->sample_type |= PERF_SAMPLE_ID;
+			attr->read_format |= PERF_FORMAT_ID;
+		}
+
 		attr->mmap = 1;
 try_again:
 		if (perf_evsel__open(counter, cpus, threads, group, inherit) < 0) {
@@ -1225,15 +1222,16 @@ try_again:
 			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
 			exit(-1);
 		}
-
-		if (perf_evsel__mmap(counter, cpus, threads, mmap_pages, evlist) < 0)
-			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 	}
+
+	if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, true) < 0)
+		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 }
 
 static int __cmd_top(void)
 {
 	pthread_t thread;
+	struct perf_evsel *first;
 	int ret;
 	/*
 	 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
@@ -1249,6 +1247,8 @@ static int __cmd_top(void)
 		event__synthesize_threads(event__process, session);
 
 	start_counters(evsel_list);
+	first = list_entry(evsel_list->entries.next, struct perf_evsel, node);
+	perf_session__set_sample_type(session, first->attr.sample_type);
 
 	/* Wait for a minimal set of events before starting the snapshot */
 	poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
@@ -1394,8 +1394,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(top_usage, options);
 
 	list_for_each_entry(pos, &evsel_list->entries, node) {
-		if (perf_evsel__alloc_mmap(pos, cpus->nr, threads->nr) < 0 ||
-		    perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
 			goto out_free_fd;
 		/*
 		 * Fill in the ones not specifically initialized via -c:
@@ -1406,7 +1405,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		pos->attr.sample_period = default_interval;
 	}
 
-	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0)
+	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0 ||
+	    perf_evlist__alloc_mmap(evsel_list, cpus->nr) < 0)
 		goto out_free_fd;
 
 	sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 6d41292..deb82a4 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -3,11 +3,18 @@
 #include "evsel.h"
 #include "util.h"
 
+#include <linux/bitops.h>
+#include <linux/hash.h>
+
 struct perf_evlist *perf_evlist__new(void)
 {
 	struct perf_evlist *evlist = zalloc(sizeof(*evlist));
 
 	if (evlist != NULL) {
+		int i;
+
+		for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
+			INIT_HLIST_HEAD(&evlist->heads[i]);
 		INIT_LIST_HEAD(&evlist->entries);
 	}
 
@@ -29,6 +36,7 @@ static void perf_evlist__purge(struct perf_evlist *evlist)
 void perf_evlist__delete(struct perf_evlist *evlist)
 {
 	perf_evlist__purge(evlist);
+	free(evlist->mmap);
 	free(evlist->pollfd);
 	free(evlist);
 }
@@ -68,3 +76,22 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd)
 	evlist->pollfd[evlist->nr_fds].events = POLLIN;
 	evlist->nr_fds++;
 }
+
+struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
+{
+	struct hlist_head *head;
+	struct hlist_node *pos;
+	struct perf_sample_id *sid;
+	int hash;
+
+	if (evlist->nr_entries == 1)
+		return list_entry(evlist->entries.next, struct perf_evsel, node);
+
+	hash = hash_64(id, PERF_EVLIST__HLIST_BITS);
+	head = &evlist->heads[hash];
+
+	hlist_for_each_entry(sid, pos, head, node)
+		if (sid->id == id)
+			return sid->evsel;
+	return NULL;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 16bbfcb..dbfcc79 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -2,13 +2,20 @@
 #define __PERF_EVLIST_H 1
 
 #include <linux/list.h>
+#include "../perf.h"
 
 struct pollfd;
 
+#define PERF_EVLIST__HLIST_BITS 8
+#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)
+
 struct perf_evlist {
 	struct list_head entries;
+	struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
 	int		 nr_entries;
 	int		 nr_fds;
+	int		 mmap_len;
+	struct perf_mmap *mmap;
 	struct pollfd	 *pollfd;
 };
 
@@ -23,4 +30,6 @@ int perf_evlist__add_default(struct perf_evlist *evlist);
 int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads);
 void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
+struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
+
 #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index f500695..ee49035 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -8,7 +8,11 @@
 #include <unistd.h>
 #include <sys/mman.h>
 
+#include <linux/bitops.h>
+#include <linux/hash.h>
+
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
+#define SID(e, x, y) xyarray__entry(e->id, x, y)
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
 {
@@ -29,6 +33,12 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 	return evsel->fd != NULL ? 0 : -ENOMEM;
 }
 
+int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+	evsel->id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id));
+	return evsel->id != NULL ? 0 : -ENOMEM;
+}
+
 int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus)
 {
 	evsel->counts = zalloc((sizeof(*evsel->counts) +
@@ -42,6 +52,12 @@ void perf_evsel__free_fd(struct perf_evsel *evsel)
 	evsel->fd = NULL;
 }
 
+void perf_evsel__free_id(struct perf_evsel *evsel)
+{
+	xyarray__delete(evsel->id);
+	evsel->id = NULL;
+}
+
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
 	int cpu, thread;
@@ -53,32 +69,29 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 		}
 }
 
-void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus)
 {
-	struct perf_mmap *mm;
-	int cpu, thread;
+	int cpu;
 
-	for (cpu = 0; cpu < ncpus; cpu++)
-		for (thread = 0; thread < nthreads; ++thread) {
-			mm = xyarray__entry(evsel->mmap, cpu, thread);
-			if (mm->base != NULL) {
-				munmap(mm->base, evsel->mmap_len);
-				mm->base = NULL;
-			}
+	for (cpu = 0; cpu < ncpus; cpu++) {
+		if (evlist->mmap[cpu].base != NULL) {
+			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
+			evlist->mmap[cpu].base = NULL;
 		}
+	}
 }
 
-int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus)
 {
-	evsel->mmap = xyarray__new(ncpus, nthreads, sizeof(struct perf_mmap));
-	return evsel->mmap != NULL ? 0 : -ENOMEM;
+	evlist->mmap = zalloc(ncpus * sizeof(struct perf_mmap));
+	return evlist->mmap != NULL ? 0 : -ENOMEM;
 }
 
 void perf_evsel__delete(struct perf_evsel *evsel)
 {
 	assert(list_empty(&evsel->node));
 	xyarray__delete(evsel->fd);
-	xyarray__delete(evsel->mmap);
+	xyarray__delete(evsel->id);
 	free(evsel);
 }
 
@@ -235,47 +248,110 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit);
 }
 
-int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus,
-		     struct thread_map *threads, int pages,
-		     struct perf_evlist *evlist)
+static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot,
+			       int mask, int fd)
+{
+	evlist->mmap[cpu].prev = 0;
+	evlist->mmap[cpu].mask = mask;
+	evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot,
+				      MAP_SHARED, fd, 0);
+	if (evlist->mmap[cpu].base == MAP_FAILED)
+		return -1;
+
+	perf_evlist__add_pollfd(evlist, fd);
+	return 0;
+}
+
+static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
+			       int cpu, int thread, int fd)
+{
+	struct perf_sample_id *sid;
+	u64 read_data[4] = { 0, };
+	int hash, id_idx = 1; /* The first entry is the counter value */
+
+	if (!(evsel->attr.read_format & PERF_FORMAT_ID) ||
+	    read(fd, &read_data, sizeof(read_data)) == -1)
+		return -1;
+
+	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		++id_idx;
+	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		++id_idx;
+
+	sid = SID(evsel, cpu, thread);
+	sid->id = read_data[id_idx];
+	sid->evsel = evsel;
+	hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
+	hlist_add_head(&sid->node, &evlist->heads[hash]);
+	return 0;
+}
+
+/** perf_evlist__mmap - Create per cpu maps to receive events
+ *
+ * @evlist - list of events
+ * @cpus - cpu map being monitored
+ * @threads - threads map being monitored
+ * @pages - map length in pages
+ * @overwrite - overwrite older events?
+ *
+ * If overwrite is false the user needs to signal event consuption using:
+ *
+ *	struct perf_mmap *m = &evlist->mmap[cpu];
+ *	unsigned int head = perf_mmap__read_head(m);
+ *
+ *	perf_mmap__write_tail(m, head)
+ */
+int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
+		      struct thread_map *threads, int pages, bool overwrite)
 {
 	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
 	int mask = pages * page_size - 1, cpu;
-	struct perf_mmap *mm;
-	int thread;
+	struct perf_evsel *first_evsel, *evsel;
+	int thread, prot = PROT_READ | (overwrite ? 0 : PROT_WRITE);
 
-	if (evsel->mmap == NULL &&
-	    perf_evsel__alloc_mmap(evsel, cpus->nr, threads->nr) < 0)
+	if (evlist->mmap == NULL &&
+	    perf_evlist__alloc_mmap(evlist, cpus->nr) < 0)
 		return -ENOMEM;
 
-	evsel->mmap_len = (pages + 1) * page_size;
+	if (evlist->pollfd == NULL &&
+	    perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0)
+		return -ENOMEM;
 
-	for (cpu = 0; cpu < cpus->nr; cpu++) {
-		for (thread = 0; thread < threads->nr; thread++) {
-			mm = xyarray__entry(evsel->mmap, cpu, thread);
-			mm->prev = 0;
-			mm->mask = mask;
-			mm->base = mmap(NULL, evsel->mmap_len, PROT_READ,
-					MAP_SHARED, FD(evsel, cpu, thread), 0);
-			if (mm->base == MAP_FAILED)
-				goto out_unmap;
-
-			if (evlist != NULL)
-				 perf_evlist__add_pollfd(evlist, FD(evsel, cpu, thread));
+	evlist->mmap_len = (pages + 1) * page_size;
+	first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
+
+	list_for_each_entry(evsel, &evlist->entries, node) {
+		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
+		    evsel->id == NULL &&
+		    perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0)
+			return -ENOMEM;
+
+		for (cpu = 0; cpu < cpus->nr; cpu++) {
+			for (thread = 0; thread < threads->nr; thread++) {
+				int fd = FD(evsel, cpu, thread);
+
+				if (evsel->idx || thread) {
+					if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
+						  FD(first_evsel, cpu, 0)) != 0)
+						goto out_unmap;
+				} else if (__perf_evlist__mmap(evlist, cpu, prot, mask, fd) < 0)
+					goto out_unmap;
+
+				if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
+				    perf_evlist__id_hash(evlist, evsel, cpu, thread, fd) < 0)
+					goto out_unmap;
+			}
 		}
 	}
 
 	return 0;
 
 out_unmap:
-	do {
-		while (--thread >= 0) {
-			mm = xyarray__entry(evsel->mmap, cpu, thread);
-			munmap(mm->base, evsel->mmap_len);
-			mm->base = NULL;
+	for (cpu = 0; cpu < cpus->nr; cpu++) {
+		if (evlist->mmap[cpu].base != NULL) {
+			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
+			evlist->mmap[cpu].base = NULL;
 		}
-		thread = threads->nr;
-	} while (--cpu >= 0);
-
+	}
 	return -1;
 }
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index c8fbef2..667ee4e 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -24,14 +24,25 @@ struct perf_counts {
 	struct perf_counts_values cpu[];
 };
 
+struct perf_evsel;
+
+/*
+ * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there are
+ * more than one entry in the evlist.
+ */
+struct perf_sample_id {
+	struct hlist_node 	node;
+	u64		 	id;
+	struct perf_evsel	*evsel;
+};
+
 struct perf_evsel {
 	struct list_head	node;
 	struct perf_event_attr	attr;
 	char			*filter;
 	struct xyarray		*fd;
-	struct xyarray		*mmap;
+	struct xyarray		*id;
 	struct perf_counts	*counts;
-	size_t			mmap_len;
 	int			idx;
 	void			*priv;
 };
@@ -44,9 +55,11 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
 void perf_evsel__delete(struct perf_evsel *evsel);
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
+int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
 int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
-int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads);
+int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus);
 void perf_evsel__free_fd(struct perf_evsel *evsel);
+void perf_evsel__free_id(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 
 int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
@@ -55,10 +68,9 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 				struct thread_map *threads, bool group, bool inherit);
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 		     struct thread_map *threads, bool group, bool inherit);
-int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus,
-		     struct thread_map *threads, int pages,
-		     struct perf_evlist *evlist);
-void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads);
+int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
+		      struct thread_map *threads, int pages, bool overwrite);
+void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus);
 
 #define perf_evsel__match(evsel, t, c)		\
 	(evsel->attr.type == PERF_TYPE_##t &&	\
-- 
cgit v0.10.2


From 0a27d7f9f417c0305f7efa70631764a53c7af219 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 Jan 2011 15:50:51 -0200
Subject: perf record: Use perf_evlist__mmap

There is more stuff that can go to the perf_ev{sel,list} layer, like
detecting if sample_id_all is available, etc, but lets try using this in
'perf test' first.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 109f3b2..45a3689 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -30,6 +30,7 @@
 #include <sys/mman.h>
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
+#define SID(e, x, y) xyarray__entry(e->id, x, y)
 
 enum write_mode_t {
 	WRITE_FORCE,
@@ -78,8 +79,6 @@ static off_t			post_processing_offset;
 static struct perf_session	*session;
 static const char		*cpu_list;
 
-static struct perf_mmap		mmap_array[MAX_NR_CPUS];
-
 static void advance_output(size_t size)
 {
 	bytes_written += size;
@@ -196,20 +195,14 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
 	return h_attr;
 }
 
-static void create_counter(struct perf_evlist *evlist,
-			   struct perf_evsel *evsel, int cpu)
+static void create_counter(struct perf_evsel *evsel, int cpu)
 {
 	char *filter = evsel->filter;
 	struct perf_event_attr *attr = &evsel->attr;
 	struct perf_header_attr *h_attr;
+	struct perf_sample_id *sid;
 	int thread_index;
 	int ret;
-	struct {
-		u64 count;
-		u64 time_enabled;
-		u64 time_running;
-		u64 id;
-	} read_data;
 
 	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
 		h_attr = get_header_attr(attr, evsel->idx);
@@ -223,45 +216,12 @@ static void create_counter(struct perf_evlist *evlist,
 			}
 		}
 
-		if (read(FD(evsel, cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
-			perror("Unable to read perf file descriptor");
-			exit(-1);
-		}
-
-		if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
+		sid = SID(evsel, cpu, thread_index);
+		if (perf_header_attr__add_id(h_attr, sid->id) < 0) {
 			pr_warning("Not enough memory to add id\n");
 			exit(-1);
 		}
 
-		assert(FD(evsel, cpu, thread_index) >= 0);
-		fcntl(FD(evsel, cpu, thread_index), F_SETFL, O_NONBLOCK);
-
-		if (evsel->idx || thread_index) {
-			struct perf_evsel *first;
-			first = list_entry(evlist->entries.next, struct perf_evsel, node);
-			ret = ioctl(FD(evsel, cpu, thread_index),
-				    PERF_EVENT_IOC_SET_OUTPUT,
-				    FD(first, cpu, 0));
-			if (ret) {
-				error("failed to set output: %d (%s)\n", errno,
-						strerror(errno));
-				exit(-1);
-			}
-		} else {
-			mmap_array[cpu].prev = 0;
-			mmap_array[cpu].mask = mmap_pages*page_size - 1;
-			mmap_array[cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, cpu, thread_index), 0);
-			if (mmap_array[cpu].base == MAP_FAILED) {
-				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-				exit(-1);
-			}
-
-			evlist->pollfd[evlist->nr_fds].fd = FD(evsel, cpu, thread_index);
-			evlist->pollfd[evlist->nr_fds].events = POLLIN;
-			evlist->nr_fds++;
-		}
-
 		if (filter != NULL) {
 			ret = ioctl(FD(evsel, cpu, thread_index),
 				    PERF_EVENT_IOC_SET_FILTER, filter);
@@ -423,9 +383,12 @@ try_again:
 		}
 	}
 
+	if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, false) < 0)
+		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+
 	for (cpu = 0; cpu < cpus->nr; ++cpu) {
 		list_for_each_entry(pos, &evlist->entries, node)
-			create_counter(evlist, pos, cpu);
+			create_counter(pos, cpu);
 	}
 }
 
@@ -502,8 +465,8 @@ static void mmap_read_all(void)
 	int i;
 
 	for (i = 0; i < cpus->nr; i++) {
-		if (mmap_array[i].base)
-			mmap_read(&mmap_array[i]);
+		if (evsel_list->mmap[i].base)
+			mmap_read(&evsel_list->mmap[i]);
 	}
 
 	if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
-- 
cgit v0.10.2


From 915fce20ecf8f7ff4189d0fff42b62aebf6a57cc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 Jan 2011 16:19:12 -0200
Subject: perf tools: Add missing cpu_map__delete()

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 3ccaa10..6893eec 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -177,3 +177,8 @@ struct cpu_map *cpu_map__dummy_new(void)
 
 	return cpus;
 }
+
+void cpu_map__delete(struct cpu_map *map)
+{
+	free(map);
+}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index f7a4f42..072c0a3 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -8,6 +8,6 @@ struct cpu_map {
 
 struct cpu_map *cpu_map__new(const char *cpu_list);
 struct cpu_map *cpu_map__dummy_new(void);
-void *cpu_map__delete(struct cpu_map *map);
+void cpu_map__delete(struct cpu_map *map);
 
 #endif /* __PERF_CPUMAP_H */
-- 
cgit v0.10.2


From d2af9687c96f3864178de1860e6d83873aeef224 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 Jan 2011 16:24:49 -0200
Subject: perf test: Check counts on all cpus in
 test__open_syscall_event_on_all_cpus

We were bailing out after the first count mismatch, do it in all to see
if only some CPUs are not getting the expected number of events.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 7287158..7cc6b20 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -408,6 +408,8 @@ static int test__open_syscall_event_on_all_cpus(void)
 		goto out_close_fd;
 	}
 
+	err = 0;
+
 	for (cpu = 0; cpu < cpus->nr; ++cpu) {
 		unsigned int expected;
 
@@ -416,18 +418,18 @@ static int test__open_syscall_event_on_all_cpus(void)
 
 		if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) {
 			pr_debug("perf_evsel__open_read_on_cpu\n");
-			goto out_close_fd;
+			err = -1;
+			break;
 		}
 
 		expected = nr_open_calls + cpu;
 		if (evsel->counts->cpu[cpu].val != expected) {
 			pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n",
 				 expected, cpus->map[cpu], evsel->counts->cpu[cpu].val);
-			goto out_close_fd;
+			err = -1;
 		}
 	}
 
-	err = 0;
 out_close_fd:
 	perf_evsel__close_fd(evsel, 1, threads->nr);
 out_evsel_delete:
-- 
cgit v0.10.2


From 98d77b78504a423fca911a26a17bee00ef2fdda2 Mon Sep 17 00:00:00 2001
From: Han Pingtian <phan@redhat.com>
Date: Sat, 15 Jan 2011 07:00:50 +0800
Subject: perf test: check if cpu_map__new() return NULL

It looks like we should check if cpus is NULL after

	cpus = cpu_map__new(NULL);

in test__open_syscall_event_on_all_cpus().

LKML-Reference: <20110114230050.GA7011@localhost>
Signed-off-by: Han Pingtian <phan@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 7cc6b20..5a50e47 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -347,9 +347,9 @@ static int test__open_syscall_event_on_all_cpus(void)
 	}
 
 	cpus = cpu_map__new(NULL);
-	if (threads == NULL) {
-		pr_debug("thread_map__new\n");
-		return -1;
+	if (cpus == NULL) {
+		pr_debug("cpu_map__new\n");
+		goto out_thread_map_delete;
 	}
 
 
-- 
cgit v0.10.2


From 04391debc3e1195222a4dbb162ace6542dd89c1c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 Jan 2011 10:40:59 -0200
Subject: perf evlist: Steal mmap reading routine from 'perf top'

Will be used in the upcoming 'perf test' entry for the evlist mmap
routines.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index df85c1f..58352ad 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1100,67 +1100,17 @@ static void event__process_sample(const event_t *self,
 
 static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
 {
-	struct perf_mmap *md = &evsel_list->mmap[cpu];
-	unsigned int head = perf_mmap__read_head(md);
-	unsigned int old = md->prev;
-	unsigned char *data = md->base + page_size;
 	struct sample_data sample;
-	int diff;
-
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and mess up the samples under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff > md->mask / 2 || diff < 0) {
-		fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
-
-		/*
-		 * head points to a known good entry, start there.
-		 */
-		old = head;
-	}
-
-	for (; old != head;) {
-		event_t *event = (event_t *)&data[old & md->mask];
-
-		event_t event_copy;
-
-		size_t size = event->header.size;
-
-		/*
-		 * Event straddles the mmap boundary -- header should always
-		 * be inside due to u64 alignment of output.
-		 */
-		if ((old & md->mask) + size != ((old + size) & md->mask)) {
-			unsigned int offset = old;
-			unsigned int len = min(sizeof(*event), size), cpy;
-			void *dst = &event_copy;
-
-			do {
-				cpy = min(md->mask + 1 - (offset & md->mask), len);
-				memcpy(dst, &data[offset & md->mask], cpy);
-				offset += cpy;
-				dst += cpy;
-				len -= cpy;
-			} while (len);
-
-			event = &event_copy;
-		}
+	event_t *event;
 
+	while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) {
 		event__parse_sample(event, self, &sample);
+
 		if (event->header.type == PERF_RECORD_SAMPLE)
 			event__process_sample(event, &sample, self);
 		else
 			event__process(event, &sample, self);
-		old += size;
 	}
-
-	md->prev = old;
 }
 
 static void perf_session__mmap_read(struct perf_session *self)
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index deb82a4..4b3b84c 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -95,3 +95,65 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
 			return sid->evsel;
 	return NULL;
 }
+
+event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
+{
+	/* XXX Move this to perf.c, making it generally available */
+	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
+	struct perf_mmap *md = &evlist->mmap[cpu];
+	unsigned int head = perf_mmap__read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	event_t *event = NULL;
+	int diff;
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and mess up the samples under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	if (old != head) {
+		size_t size;
+
+		event = (event_t *)&data[old & md->mask];
+		size = event->header.size;
+
+		/*
+		 * Event straddles the mmap boundary -- header should always
+		 * be inside due to u64 alignment of output.
+		 */
+		if ((old & md->mask) + size != ((old + size) & md->mask)) {
+			unsigned int offset = old;
+			unsigned int len = min(sizeof(*event), size), cpy;
+			void *dst = &evlist->event_copy;
+
+			do {
+				cpy = min(md->mask + 1 - (offset & md->mask), len);
+				memcpy(dst, &data[offset & md->mask], cpy);
+				offset += cpy;
+				dst += cpy;
+				len -= cpy;
+			} while (len);
+
+			event = &evlist->event_copy;
+		}
+
+		old += size;
+	}
+
+	md->prev = old;
+	return event;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index dbfcc79..2871206 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -3,6 +3,7 @@
 
 #include <linux/list.h>
 #include "../perf.h"
+#include "event.h"
 
 struct pollfd;
 
@@ -15,6 +16,7 @@ struct perf_evlist {
 	int		 nr_entries;
 	int		 nr_fds;
 	int		 mmap_len;
+	event_t		 event_copy;
 	struct perf_mmap *mmap;
 	struct pollfd	 *pollfd;
 };
@@ -32,4 +34,6 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
 struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
 
+event_t *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu);
+
 #endif /* __PERF_EVLIST_H */
-- 
cgit v0.10.2


From de5fa3a8a05cd60f59622e88cfeb90416760d78e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 Jan 2011 10:42:46 -0200
Subject: perf test: Add test for the evlist mmap routines

This test will generate random numbers of calls to some getpid syscalls,
then establish an mmap for a group of events that are created to monitor
these syscalls.

It will receive the events, using mmap, use its PERF_SAMPLE_ID generated
sample.id field to map back to its respective perf_evsel instance.

Then it checks if the number of syscalls reported as perf events by the
kernel corresponds to the number of syscalls made.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 5a50e47..4fd3453 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -7,7 +7,9 @@
 
 #include "util/cache.h"
 #include "util/debug.h"
+#include "util/evlist.h"
 #include "util/parse-options.h"
+#include "util/parse-events.h"
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/thread.h"
@@ -238,14 +240,14 @@ out:
 #include "util/evsel.h"
 #include <sys/types.h>
 
-static int trace_event__id(const char *event_name)
+static int trace_event__id(const char *evname)
 {
 	char *filename;
 	int err = -1, fd;
 
 	if (asprintf(&filename,
 		     "/sys/kernel/debug/tracing/events/syscalls/%s/id",
-		     event_name) < 0)
+		     evname) < 0)
 		return -1;
 
 	fd = open(filename, O_RDONLY);
@@ -439,6 +441,167 @@ out_thread_map_delete:
 	return err;
 }
 
+/*
+ * This test will generate random numbers of calls to some getpid syscalls,
+ * then establish an mmap for a group of events that are created to monitor
+ * the syscalls.
+ *
+ * It will receive the events, using mmap, use its PERF_SAMPLE_ID generated
+ * sample.id field to map back to its respective perf_evsel instance.
+ *
+ * Then it checks if the number of syscalls reported as perf events by
+ * the kernel corresponds to the number of syscalls made.
+ */
+static int test__basic_mmap(void)
+{
+	int err = -1;
+	event_t *event;
+	struct thread_map *threads;
+	struct perf_session session;
+	struct cpu_map *cpus;
+	struct perf_evlist *evlist;
+	struct perf_event_attr attr = {
+		.type		= PERF_TYPE_TRACEPOINT,
+		.read_format	= PERF_FORMAT_ID,
+		.sample_type	= PERF_SAMPLE_ID,
+		.watermark	= 0,
+	};
+	cpu_set_t cpu_set;
+	const char *syscall_names[] = { "getsid", "getppid", "getpgrp",
+					"getpgid", };
+	pid_t (*syscalls[])(void) = { (void *)getsid, getppid, getpgrp,
+				      (void*)getpgid };
+#define nsyscalls ARRAY_SIZE(syscall_names)
+	int ids[nsyscalls];
+	unsigned int nr_events[nsyscalls],
+		     expected_nr_events[nsyscalls], i, j;
+	struct perf_evsel *evsels[nsyscalls], *evsel;
+
+	for (i = 0; i < nsyscalls; ++i) {
+		char name[64];
+
+		snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]);
+		ids[i] = trace_event__id(name);
+		if (ids[i] < 0) {
+			pr_debug("Is debugfs mounted on /sys/kernel/debug?\n");
+			return -1;
+		}
+		nr_events[i] = 0;
+		expected_nr_events[i] = random() % 257;
+	}
+
+	threads = thread_map__new(-1, getpid());
+	if (threads == NULL) {
+		pr_debug("thread_map__new\n");
+		return -1;
+	}
+
+	cpus = cpu_map__new(NULL);
+	if (threads == NULL) {
+		pr_debug("thread_map__new\n");
+		goto out_free_threads;
+	}
+
+	CPU_ZERO(&cpu_set);
+	CPU_SET(cpus->map[0], &cpu_set);
+	sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
+	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
+		pr_debug("sched_setaffinity() failed on CPU %d: %s ",
+			 cpus->map[0], strerror(errno));
+		goto out_free_cpus;
+	}
+
+	evlist = perf_evlist__new();
+	if (threads == NULL) {
+		pr_debug("perf_evlist__new\n");
+		goto out_free_cpus;
+	}
+
+	/* anonymous union fields, can't be initialized above */
+	attr.wakeup_events = 1;
+	attr.sample_period = 1;
+
+	/*
+ 	 * FIXME: use evsel->attr.sample_type in event__parse_sample.
+ 	 * 	  This will nicely remove the requirement that we have
+ 	 * 	  all the events with the same sample_type.
+ 	 */
+	session.sample_type = attr.sample_type;
+
+	for (i = 0; i < nsyscalls; ++i) {
+		attr.config = ids[i];
+		evsels[i] = perf_evsel__new(&attr, i);
+		if (evsels[i] == NULL) {
+			pr_debug("perf_evsel__new\n");
+			goto out_free_evlist;
+		}
+
+		perf_evlist__add(evlist, evsels[i]);
+
+		if (perf_evsel__open(evsels[i], cpus, threads, false, false) < 0) {
+			pr_debug("failed to open counter: %s, "
+				 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
+				 strerror(errno));
+			goto out_close_fd;
+		}
+	}
+
+	if (perf_evlist__mmap(evlist, cpus, threads, 128, true) < 0) {
+		pr_debug("failed to mmap events: %d (%s)\n", errno,
+			 strerror(errno));
+		goto out_close_fd;
+	}
+
+	for (i = 0; i < nsyscalls; ++i)
+		for (j = 0; j < expected_nr_events[i]; ++j) {
+			int foo = syscalls[i]();
+			++foo;
+		}
+
+	while ((event = perf_evlist__read_on_cpu(evlist, 0)) != NULL) {
+		struct sample_data sample;
+
+		if (event->header.type != PERF_RECORD_SAMPLE) {
+			pr_debug("unexpected %s event\n",
+				 event__get_event_name(event->header.type));
+			goto out_munmap;
+		}
+
+		event__parse_sample(event, &session, &sample);
+		evsel = perf_evlist__id2evsel(evlist, sample.id);
+		if (evsel == NULL) {
+			pr_debug("event with id %" PRIu64
+				 " doesn't map to an evsel\n", sample.id);
+			goto out_munmap;
+		}
+		nr_events[evsel->idx]++;
+	}
+
+	list_for_each_entry(evsel, &evlist->entries, node) {
+		if (nr_events[evsel->idx] != expected_nr_events[evsel->idx]) {
+			pr_debug("expected %d %s events, got %d\n",
+				 expected_nr_events[evsel->idx],
+				 event_name(evsel), nr_events[evsel->idx]);
+			goto out_munmap;
+		}
+	}
+
+	err = 0;
+out_munmap:
+	perf_evlist__munmap(evlist, 1);
+out_close_fd:
+	for (i = 0; i < nsyscalls; ++i)
+		perf_evsel__close_fd(evsels[i], 1, threads->nr);
+out_free_evlist:
+	perf_evlist__delete(evlist);
+out_free_cpus:
+	cpu_map__delete(cpus);
+out_free_threads:
+	thread_map__delete(threads);
+	return err;
+#undef nsyscalls
+}
+
 static struct test {
 	const char *desc;
 	int (*func)(void);
@@ -456,6 +619,10 @@ static struct test {
 		.func = test__open_syscall_event_on_all_cpus,
 	},
 	{
+		.desc = "read samples using the mmap interface",
+		.func = test__basic_mmap,
+	},
+	{
 		.func = NULL,
 	},
 };
-- 
cgit v0.10.2


From 1b3a0e9592ebf174af934b3908a2bf6a6fa86169 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 14 Jan 2011 04:51:58 +0100
Subject: perf callchain: Feed callchains into a cursor

The callchains are fed with an array of a fixed size.
As a result we iterate over each callchains three times:

- 1st to resolve symbols
- 2nd to filter out context boundaries
- 3rd for the insertion into the tree

This also involves some pairs of memory allocation/deallocation
everytime we insert a callchain, for the filtered out array of
addresses and for the array of symbols that comes along.

Instead, feed the callchains through a linked list with persistent
allocations. It brings several pros like:

- Merge the 1st and 2nd iterations in one. That was possible before
but in a way that would involve allocating an array slightly taller
than necessary because we don't know in advance the number of context
boundaries to filter out.

- Much lesser allocations/deallocations. The linked list keeps
persistent empty entries for the next usages and is extendable at
will.

- Makes it easier for multiple sources of callchains to feed a
stacktrace together. This is deemed to pave the way for cfi based
callchains wherein traditional frame pointer based kernel
stacktraces will precede cfi based user ones, producing an overall
callchain which size is hardly predictable. This requirement
makes the static array obsolete and makes a linked list based
iterator a much more flexible fit.

Basic testing on a big perf file containing callchains (~ 176 MB)
has shown a throughput gain of about 11% with perf report.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294977121-5700-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index c27e31f..c95599a 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -81,18 +81,17 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 					struct addr_location *al,
 					struct sample_data *data)
 {
-	struct map_symbol *syms = NULL;
 	struct symbol *parent = NULL;
-	int err = -ENOMEM;
+	int err = 0;
 	struct hist_entry *he;
 	struct hists *hists;
 	struct perf_event_attr *attr;
 
 	if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain) {
-		syms = perf_session__resolve_callchain(self, al->thread,
-						       data->callchain, &parent);
-		if (syms == NULL)
-			return -ENOMEM;
+		err = perf_session__resolve_callchain(self, al->thread,
+						      data->callchain, &parent);
+		if (err)
+			return err;
 	}
 
 	attr = perf_header__find_attr(data->id, &self->header);
@@ -101,16 +100,17 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	else
 		hists = perf_session__hists_findnew(self, data->id, 0, 0);
 	if (hists == NULL)
-		goto out_free_syms;
+		return -ENOMEM;
+
 	he = __hists__add_entry(hists, al, parent, data->period);
 	if (he == NULL)
-		goto out_free_syms;
-	err = 0;
+		return -ENOMEM;
+
 	if (symbol_conf.use_callchain) {
-		err = callchain_append(he->callchain, data->callchain, syms,
+		err = callchain_append(he->callchain, &self->callchain_cursor,
 				       data->period);
 		if (err)
-			goto out_free_syms;
+			return err;
 	}
 	/*
 	 * Only in the newt browser we are doing integrated annotation,
@@ -119,8 +119,7 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	 */
 	if (use_browser > 0)
 		err = hist_entry__inc_addr_samples(he, al->addr);
-out_free_syms:
-	free(syms);
+
 	return err;
 }
 
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index e12d539..53a49e0 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2009-2010, Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2009-2011, Frederic Weisbecker <fweisbec@gmail.com>
  *
  * Handle the callchains from the stream in an ad-hoc radix tree and then
  * sort them in an rbtree.
@@ -195,26 +195,21 @@ create_child(struct callchain_node *parent, bool inherit_children)
 }
 
 
-struct resolved_ip {
-	u64		  ip;
-	struct map_symbol ms;
-};
-
-struct resolved_chain {
-	u64			nr;
-	struct resolved_ip	ips[0];
-};
-
-
 /*
  * Fill the node with callchain values
  */
 static void
-fill_node(struct callchain_node *node, struct resolved_chain *chain, int start)
+fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
 {
-	unsigned int i;
+	struct callchain_cursor_node *cursor_node;
+
+	node->val_nr = cursor->nr - cursor->pos;
+	if (!node->val_nr)
+		pr_warning("Warning: empty node in callchain tree\n");
 
-	for (i = start; i < chain->nr; i++) {
+	cursor_node = callchain_cursor_current(cursor);
+
+	while (cursor_node) {
 		struct callchain_list *call;
 
 		call = zalloc(sizeof(*call));
@@ -222,23 +217,25 @@ fill_node(struct callchain_node *node, struct resolved_chain *chain, int start)
 			perror("not enough memory for the code path tree");
 			return;
 		}
-		call->ip = chain->ips[i].ip;
-		call->ms = chain->ips[i].ms;
+		call->ip = cursor_node->ip;
+		call->ms.sym = cursor_node->sym;
+		call->ms.map = cursor_node->map;
 		list_add_tail(&call->list, &node->val);
+
+		callchain_cursor_advance(cursor);
+		cursor_node = callchain_cursor_current(cursor);
 	}
-	node->val_nr = chain->nr - start;
-	if (!node->val_nr)
-		pr_warning("Warning: empty node in callchain tree\n");
 }
 
 static void
-add_child(struct callchain_node *parent, struct resolved_chain *chain,
-	  int start, u64 period)
+add_child(struct callchain_node *parent,
+	  struct callchain_cursor *cursor,
+	  u64 period)
 {
 	struct callchain_node *new;
 
 	new = create_child(parent, false);
-	fill_node(new, chain, start);
+	fill_node(new, cursor);
 
 	new->children_hit = 0;
 	new->hit = period;
@@ -250,9 +247,10 @@ add_child(struct callchain_node *parent, struct resolved_chain *chain,
  * Then create another child to host the given callchain of new branch
  */
 static void
-split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
-		struct callchain_list *to_split, int idx_parents, int idx_local,
-		u64 period)
+split_add_child(struct callchain_node *parent,
+		struct callchain_cursor *cursor,
+		struct callchain_list *to_split,
+		u64 idx_parents, u64 idx_local, u64 period)
 {
 	struct callchain_node *new;
 	struct list_head *old_tail;
@@ -277,9 +275,9 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
 	parent->val_nr = idx_local;
 
 	/* create a new child for the new branch if any */
-	if (idx_total < chain->nr) {
+	if (idx_total < cursor->nr) {
 		parent->hit = 0;
-		add_child(parent, chain, idx_total, period);
+		add_child(parent, cursor, period);
 		parent->children_hit += period;
 	} else {
 		parent->hit = period;
@@ -287,36 +285,41 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
 }
 
 static int
-append_chain(struct callchain_node *root, struct resolved_chain *chain,
-	     unsigned int start, u64 period);
+append_chain(struct callchain_node *root,
+	     struct callchain_cursor *cursor,
+	     u64 period);
 
 static void
-append_chain_children(struct callchain_node *root, struct resolved_chain *chain,
-		      unsigned int start, u64 period)
+append_chain_children(struct callchain_node *root,
+		      struct callchain_cursor *cursor,
+		      u64 period)
 {
 	struct callchain_node *rnode;
 
 	/* lookup in childrens */
 	chain_for_each_child(rnode, root) {
-		unsigned int ret = append_chain(rnode, chain, start, period);
+		unsigned int ret = append_chain(rnode, cursor, period);
 
 		if (!ret)
 			goto inc_children_hit;
 	}
 	/* nothing in children, add to the current node */
-	add_child(root, chain, start, period);
+	add_child(root, cursor, period);
 
 inc_children_hit:
 	root->children_hit += period;
 }
 
 static int
-append_chain(struct callchain_node *root, struct resolved_chain *chain,
-	     unsigned int start, u64 period)
+append_chain(struct callchain_node *root,
+	     struct callchain_cursor *cursor,
+	     u64 period)
 {
+	struct callchain_cursor_node *curr_snap = cursor->curr;
 	struct callchain_list *cnode;
-	unsigned int i = start;
+	u64 start = cursor->pos;
 	bool found = false;
+	u64 matches;
 
 	/*
 	 * Lookup in the current node
@@ -324,114 +327,95 @@ append_chain(struct callchain_node *root, struct resolved_chain *chain,
 	 * anywhere inside a function.
 	 */
 	list_for_each_entry(cnode, &root->val, list) {
+		struct callchain_cursor_node *node;
 		struct symbol *sym;
 
-		if (i == chain->nr)
+		node = callchain_cursor_current(cursor);
+		if (!node)
 			break;
 
-		sym = chain->ips[i].ms.sym;
+		sym = node->sym;
 
 		if (cnode->ms.sym && sym) {
 			if (cnode->ms.sym->start != sym->start)
 				break;
-		} else if (cnode->ip != chain->ips[i].ip)
+		} else if (cnode->ip != node->ip)
 			break;
 
 		if (!found)
 			found = true;
-		i++;
+
+		callchain_cursor_advance(cursor);
 	}
 
 	/* matches not, relay on the parent */
-	if (!found)
+	if (!found) {
+		cursor->curr = curr_snap;
+		cursor->pos = start;
 		return -1;
+	}
+
+	matches = cursor->pos - start;
 
 	/* we match only a part of the node. Split it and add the new chain */
-	if (i - start < root->val_nr) {
-		split_add_child(root, chain, cnode, start, i - start, period);
+	if (matches < root->val_nr) {
+		split_add_child(root, cursor, cnode, start, matches, period);
 		return 0;
 	}
 
 	/* we match 100% of the path, increment the hit */
-	if (i - start == root->val_nr && i == chain->nr) {
+	if (matches == root->val_nr && cursor->pos == cursor->nr) {
 		root->hit += period;
 		return 0;
 	}
 
 	/* We match the node and still have a part remaining */
-	append_chain_children(root, chain, i, period);
+	append_chain_children(root, cursor, period);
 
 	return 0;
 }
 
-static void filter_context(struct ip_callchain *old, struct resolved_chain *new,
-			   struct map_symbol *syms)
-{
-	int i, j = 0;
-
-	for (i = 0; i < (int)old->nr; i++) {
-		if (old->ips[i] >= PERF_CONTEXT_MAX)
-			continue;
-
-		new->ips[j].ip = old->ips[i];
-		new->ips[j].ms = syms[i];
-		j++;
-	}
-
-	new->nr = j;
-}
-
-
-int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
-		     struct map_symbol *syms, u64 period)
+int callchain_append(struct callchain_root *root,
+		     struct callchain_cursor *cursor,
+		     u64 period)
 {
-	struct resolved_chain *filtered;
-
-	if (!chain->nr)
+	if (!cursor->nr)
 		return 0;
 
-	filtered = zalloc(sizeof(*filtered) +
-			  chain->nr * sizeof(struct resolved_ip));
-	if (!filtered)
-		return -ENOMEM;
-
-	filter_context(chain, filtered, syms);
-
-	if (!filtered->nr)
-		goto end;
+	callchain_cursor_commit(cursor);
 
-	append_chain_children(&root->node, filtered, 0, period);
+	append_chain_children(&root->node, cursor, period);
 
-	if (filtered->nr > root->max_depth)
-		root->max_depth = filtered->nr;
-end:
-	free(filtered);
+	if (cursor->nr > root->max_depth)
+		root->max_depth = cursor->nr;
 
 	return 0;
 }
 
 static int
-merge_chain_branch(struct callchain_node *dst, struct callchain_node *src,
-		   struct resolved_chain *chain)
+merge_chain_branch(struct callchain_cursor *cursor,
+		   struct callchain_node *dst, struct callchain_node *src)
 {
+	struct callchain_cursor_node **old_last = cursor->last;
 	struct callchain_node *child, *next_child;
 	struct callchain_list *list, *next_list;
-	int old_pos = chain->nr;
+	int old_pos = cursor->nr;
 	int err = 0;
 
 	list_for_each_entry_safe(list, next_list, &src->val, list) {
-		chain->ips[chain->nr].ip = list->ip;
-		chain->ips[chain->nr].ms = list->ms;
-		chain->nr++;
+		callchain_cursor_append(cursor, list->ip,
+					list->ms.map, list->ms.sym);
 		list_del(&list->list);
 		free(list);
 	}
 
-	if (src->hit)
-		append_chain_children(dst, chain, 0, src->hit);
+	if (src->hit) {
+		callchain_cursor_commit(cursor);
+		append_chain_children(dst, cursor, src->hit);
+	}
 
 	chain_for_each_child_safe(child, next_child, src) {
-		err = merge_chain_branch(dst, child, chain);
+		err = merge_chain_branch(cursor, dst, child);
 		if (err)
 			break;
 
@@ -439,26 +423,38 @@ merge_chain_branch(struct callchain_node *dst, struct callchain_node *src,
 		free(child);
 	}
 
-	chain->nr = old_pos;
+	cursor->nr = old_pos;
+	cursor->last = old_last;
 
 	return err;
 }
 
-int callchain_merge(struct callchain_root *dst, struct callchain_root *src)
+int callchain_merge(struct callchain_cursor *cursor,
+		    struct callchain_root *dst, struct callchain_root *src)
+{
+	return merge_chain_branch(cursor, &dst->node, &src->node);
+}
+
+int callchain_cursor_append(struct callchain_cursor *cursor,
+			    u64 ip, struct map *map, struct symbol *sym)
 {
-	struct resolved_chain *chain;
-	int err;
+	struct callchain_cursor_node *node = *cursor->last;
 
-	chain = malloc(sizeof(*chain) +
-		       src->max_depth * sizeof(struct resolved_ip));
-	if (!chain)
-		return -ENOMEM;
+	if (!node) {
+		node = calloc(sizeof(*node), 1);
+		if (!node)
+			return -ENOMEM;
 
-	chain->nr = 0;
+		*cursor->last = node;
+	}
 
-	err = merge_chain_branch(&dst->node, &src->node, chain);
+	node->ip = ip;
+	node->map = map;
+	node->sym = sym;
 
-	free(chain);
+	cursor->nr++;
 
-	return err;
+	cursor->last = &node->next;
+
+	return 0;
 }
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index c15fb8c..d74a19a 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -49,6 +49,27 @@ struct callchain_list {
 	struct list_head	list;
 };
 
+/*
+ * A callchain cursor is a single linked list that
+ * let one feed a callchain progressively.
+ * It keeps persitent allocated entries to minimize
+ * allocations.
+ */
+struct callchain_cursor_node {
+	u64				ip;
+	struct map			*map;
+	struct symbol			*sym;
+	struct callchain_cursor_node	*next;
+};
+
+struct callchain_cursor {
+	u64				nr;
+	struct callchain_cursor_node	*first;
+	struct callchain_cursor_node	**last;
+	u64				pos;
+	struct callchain_cursor_node	*curr;
+};
+
 static inline void callchain_init(struct callchain_root *root)
 {
 	INIT_LIST_HEAD(&root->node.brothers);
@@ -67,9 +88,48 @@ static inline u64 cumul_hits(struct callchain_node *node)
 }
 
 int register_callchain_param(struct callchain_param *param);
-int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
-		     struct map_symbol *syms, u64 period);
-int callchain_merge(struct callchain_root *dst, struct callchain_root *src);
+int callchain_append(struct callchain_root *root,
+		     struct callchain_cursor *cursor,
+		     u64 period);
+
+int callchain_merge(struct callchain_cursor *cursor,
+		    struct callchain_root *dst, struct callchain_root *src);
 
 bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event);
+
+/*
+ * Initialize a cursor before adding entries inside, but keep
+ * the previously allocated entries as a cache.
+ */
+static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
+{
+	cursor->nr = 0;
+	cursor->last = &cursor->first;
+}
+
+int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
+			    struct map *map, struct symbol *sym);
+
+/* Close a cursor writing session. Initialize for the reader */
+static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
+{
+	cursor->curr = cursor->first;
+	cursor->pos = 0;
+}
+
+/* Cursor reading iteration helpers */
+static inline struct callchain_cursor_node *
+callchain_cursor_current(struct callchain_cursor *cursor)
+{
+	if (cursor->pos == cursor->nr)
+		return NULL;
+
+	return cursor->curr;
+}
+
+static inline void callchain_cursor_advance(struct callchain_cursor *cursor)
+{
+	cursor->curr = cursor->curr->next;
+	cursor->pos++;
+}
 #endif	/* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 32f4f1f..a438a06 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -211,7 +211,9 @@ void hist_entry__free(struct hist_entry *he)
  * collapse the histogram
  */
 
-static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
+static bool hists__collapse_insert_entry(struct hists *self,
+					 struct rb_root *root,
+					 struct hist_entry *he)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
@@ -226,8 +228,11 @@ static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
 
 		if (!cmp) {
 			iter->period += he->period;
-			if (symbol_conf.use_callchain)
-				callchain_merge(iter->callchain, he->callchain);
+			if (symbol_conf.use_callchain) {
+				callchain_cursor_reset(&self->callchain_cursor);
+				callchain_merge(&self->callchain_cursor, iter->callchain,
+						he->callchain);
+			}
 			hist_entry__free(he);
 			return false;
 		}
@@ -262,7 +267,7 @@ void hists__collapse_resort(struct hists *self)
 		next = rb_next(&n->rb_node);
 
 		rb_erase(&n->rb_node, &self->entries);
-		if (collapse__insert_entry(&tmp, n))
+		if (hists__collapse_insert_entry(self, &tmp, n))
 			hists__inc_nr_entries(self, n);
 	}
 
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index ee78985..889559b 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -77,6 +77,8 @@ struct hists {
 	u64			event_stream;
 	u32			type;
 	u16			col_len[HISTC_NR_COLS];
+	/* Best would be to reuse the session callchain cursor */
+	struct callchain_cursor	callchain_cursor;
 };
 
 struct hist_entry *__hists__add_entry(struct hists *self,
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 105f00b..b58a48a 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -242,17 +242,16 @@ static bool symbol__match_parent_regex(struct symbol *sym)
 	return 0;
 }
 
-struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
-						   struct thread *thread,
-						   struct ip_callchain *chain,
-						   struct symbol **parent)
+int perf_session__resolve_callchain(struct perf_session *self,
+				    struct thread *thread,
+				    struct ip_callchain *chain,
+				    struct symbol **parent)
 {
 	u8 cpumode = PERF_RECORD_MISC_USER;
 	unsigned int i;
-	struct map_symbol *syms = calloc(chain->nr, sizeof(*syms));
+	int err;
 
-	if (!syms)
-		return NULL;
+	callchain_cursor_reset(&self->callchain_cursor);
 
 	for (i = 0; i < chain->nr; i++) {
 		u64 ip = chain->ips[i];
@@ -281,12 +280,15 @@ struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
 				*parent = al.sym;
 			if (!symbol_conf.use_callchain)
 				break;
-			syms[i].map = al.map;
-			syms[i].sym = al.sym;
 		}
+
+		err = callchain_cursor_append(&self->callchain_cursor,
+					      ip, al.map, al.sym);
+		if (err)
+			return err;
 	}
 
-	return syms;
+	return 0;
 }
 
 static int process_event_synth_stub(event_t *event __used,
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index decd83f..e815468 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -51,7 +51,8 @@ struct perf_session {
 	int			cwdlen;
 	char			*cwd;
 	struct ordered_samples	ordered_samples;
-	char filename[0];
+	struct callchain_cursor	callchain_cursor;
+	char			filename[0];
 };
 
 struct perf_event_ops;
@@ -94,10 +95,10 @@ int __perf_session__process_events(struct perf_session *self,
 int perf_session__process_events(struct perf_session *self,
 				 struct perf_event_ops *event_ops);
 
-struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
-						   struct thread *thread,
-						   struct ip_callchain *chain,
-						   struct symbol **parent);
+int perf_session__resolve_callchain(struct perf_session *self,
+				    struct thread *thread,
+				    struct ip_callchain *chain,
+				    struct symbol **parent);
 
 bool perf_session__has_traces(struct perf_session *self, const char *msg);
 
-- 
cgit v0.10.2


From f08c3154ac439c4b5762a40107d84e839e08fbc5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 14 Jan 2011 04:51:59 +0100
Subject: perf callchain: Rename cumul_hits into callchain_cumul_hits

That makes the callchain API naming more consistent and
reduce potential naming clashes.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294977121-5700-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 53a49e0..4c6360f 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -38,14 +38,14 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
 	struct callchain_node *rnode;
-	u64 chain_cumul = cumul_hits(chain);
+	u64 chain_cumul = callchain_cumul_hits(chain);
 
 	while (*p) {
 		u64 rnode_cumul;
 
 		parent = *p;
 		rnode = rb_entry(parent, struct callchain_node, rb_node);
-		rnode_cumul = cumul_hits(rnode);
+		rnode_cumul = callchain_cumul_hits(rnode);
 
 		switch (mode) {
 		case CHAIN_FLAT:
@@ -104,7 +104,7 @@ static void __sort_chain_graph_abs(struct callchain_node *node,
 
 	chain_for_each_child(child, node) {
 		__sort_chain_graph_abs(child, min_hit);
-		if (cumul_hits(child) >= min_hit)
+		if (callchain_cumul_hits(child) >= min_hit)
 			rb_insert_callchain(&node->rb_root, child,
 					    CHAIN_GRAPH_ABS);
 	}
@@ -129,7 +129,7 @@ static void __sort_chain_graph_rel(struct callchain_node *node,
 
 	chain_for_each_child(child, node) {
 		__sort_chain_graph_rel(child, min_percent);
-		if (cumul_hits(child) >= min_hit)
+		if (callchain_cumul_hits(child) >= min_hit)
 			rb_insert_callchain(&node->rb_root, child,
 					    CHAIN_GRAPH_REL);
 	}
@@ -270,7 +270,7 @@ split_add_child(struct callchain_node *parent,
 	/* split the hits */
 	new->hit = parent->hit;
 	new->children_hit = parent->children_hit;
-	parent->children_hit = cumul_hits(new);
+	parent->children_hit = callchain_cumul_hits(new);
 	new->val_nr = parent->val_nr - idx_local;
 	parent->val_nr = idx_local;
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index d74a19a..07f71e3 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -82,7 +82,7 @@ static inline void callchain_init(struct callchain_root *root)
 	root->max_depth = 0;
 }
 
-static inline u64 cumul_hits(struct callchain_node *node)
+static inline u64 callchain_cumul_hits(struct callchain_node *node)
 {
 	return node->hit + node->children_hit;
 }
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index a438a06..02ed318 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -430,7 +430,7 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 		u64 cumul;
 
 		child = rb_entry(node, struct callchain_node, rb_node);
-		cumul = cumul_hits(child);
+		cumul = callchain_cumul_hits(child);
 		remaining -= cumul;
 
 		/*
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index 60c463c..8642823 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -377,7 +377,7 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self,
 	while (node) {
 		struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
 		struct rb_node *next = rb_next(node);
-		u64 cumul = cumul_hits(child);
+		u64 cumul = callchain_cumul_hits(child);
 		struct callchain_list *chain;
 		char folded_sign = ' ';
 		int first = true;
-- 
cgit v0.10.2


From 16537f1355017a285b904bfb6bf767464293e69c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 14 Jan 2011 04:52:00 +0100
Subject: perf callchain: Rename register_callchain_param into
 callchain_register_param

To make the callchain API naming more consistent.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294977121-5700-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index c95599a..f6a4349 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -221,7 +221,7 @@ static int perf_session__setup_sample_type(struct perf_session *self)
 	} else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE &&
 		   !symbol_conf.use_callchain) {
 			symbol_conf.use_callchain = true;
-			if (register_callchain_param(&callchain_param) < 0) {
+			if (callchain_register_param(&callchain_param) < 0) {
 				fprintf(stderr, "Can't register callchain"
 						" params\n");
 				return -EINVAL;
@@ -423,7 +423,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
 	if (tok2)
 		callchain_param.print_limit = strtod(tok2, &endptr);
 setup:
-	if (register_callchain_param(&callchain_param) < 0) {
+	if (callchain_register_param(&callchain_param) < 0) {
 		fprintf(stderr, "Can't register callchain params\n");
 		return -1;
 	}
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 4c6360f..5b3f09d 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -143,7 +143,7 @@ sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_root *chain_root,
 	rb_root->rb_node = chain_root->node.rb_root.rb_node;
 }
 
-int register_callchain_param(struct callchain_param *param)
+int callchain_register_param(struct callchain_param *param)
 {
 	switch (param->mode) {
 	case CHAIN_GRAPH_ABS:
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 07f71e3..2bb5403 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -87,7 +87,7 @@ static inline u64 callchain_cumul_hits(struct callchain_node *node)
 	return node->hit + node->children_hit;
 }
 
-int register_callchain_param(struct callchain_param *param);
+int callchain_register_param(struct callchain_param *param);
 int callchain_append(struct callchain_root *root,
 		     struct callchain_cursor *cursor,
 		     u64 period);
-- 
cgit v0.10.2


From 529363b76929beb85b81439c61063130af046a21 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 14 Jan 2011 04:52:01 +0100
Subject: perf callchain: Don't give arbitrary gender to callchain tree nodes

Some little callchain tree nodes shyly asked me if they can have
sisters.

How cute!

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294977121-5700-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 5b3f09d..f8c66d1 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -26,10 +26,10 @@ bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event)
 }
 
 #define chain_for_each_child(child, parent)	\
-	list_for_each_entry(child, &parent->children, brothers)
+	list_for_each_entry(child, &parent->children, siblings)
 
 #define chain_for_each_child_safe(child, next, parent)	\
-	list_for_each_entry_safe(child, next, &parent->children, brothers)
+	list_for_each_entry_safe(child, next, &parent->children, siblings)
 
 static void
 rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
@@ -189,7 +189,7 @@ create_child(struct callchain_node *parent, bool inherit_children)
 		chain_for_each_child(next, new)
 			next->parent = new;
 	}
-	list_add_tail(&new->brothers, &parent->children);
+	list_add_tail(&new->siblings, &parent->children);
 
 	return new;
 }
@@ -419,7 +419,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
 		if (err)
 			break;
 
-		list_del(&child->brothers);
+		list_del(&child->siblings);
 		free(child);
 	}
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 2bb5403..6713725 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -16,7 +16,7 @@ enum chain_mode {
 
 struct callchain_node {
 	struct callchain_node	*parent;
-	struct list_head	brothers;
+	struct list_head	siblings;
 	struct list_head	children;
 	struct list_head	val;
 	struct rb_node		rb_node; /* to sort nodes in an rbtree */
@@ -72,7 +72,7 @@ struct callchain_cursor {
 
 static inline void callchain_init(struct callchain_root *root)
 {
-	INIT_LIST_HEAD(&root->node.brothers);
+	INIT_LIST_HEAD(&root->node.siblings);
 	INIT_LIST_HEAD(&root->node.children);
 	INIT_LIST_HEAD(&root->node.val);
 
-- 
cgit v0.10.2


From b0e8572f3b29c0760b66ba5627a6d5426c88c97d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 16 Jan 2011 17:39:15 -0200
Subject: perf top: Add native_safe_halt to skip symbols

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 58352ad..31fbaf3 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -933,6 +933,7 @@ repeat:
 /* Tag samples to be skipped. */
 static const char *skip_symbols[] = {
 	"default_idle",
+	"native_safe_halt",
 	"cpu_idle",
 	"enter_idle",
 	"exit_idle",
-- 
cgit v0.10.2


From 4cc9cec636e7f78aba7f17606ac13cac07ea5787 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 13 Jan 2011 21:45:58 +0900
Subject: perf probe: Introduce lines walker interface

Introduce die_walk_lines() for walking on the line list of given die, and use
it in line_range finder and probe point finder.

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110113124558.22426.48170.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ committer note: s/%ld/%zd/ for a size_t nlines var that broke f14 x86 build]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index ab83b6a..508c017 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -458,6 +458,124 @@ static Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
 	return die_find_child(sp_die, __die_find_inline_cb, &addr, die_mem);
 }
 
+/* Walker on lines (Note: line number will not be sorted) */
+typedef int (* line_walk_handler_t) (const char *fname, int lineno,
+				     Dwarf_Addr addr, void *data);
+
+struct __line_walk_param {
+	line_walk_handler_t handler;
+	void *data;
+	int retval;
+};
+
+/* Walk on decl lines in given DIE */
+static int __die_walk_funclines(Dwarf_Die *sp_die,
+				line_walk_handler_t handler, void *data)
+{
+	const char *fname;
+	Dwarf_Addr addr;
+	int lineno, ret = 0;
+
+	/* Handle function declaration line */
+	fname = dwarf_decl_file(sp_die);
+	if (fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
+	    dwarf_entrypc(sp_die, &addr) == 0) {
+		ret = handler(fname, lineno, addr, data);
+	}
+
+	return ret;
+}
+
+static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data)
+{
+	struct __line_walk_param *lw = data;
+
+	lw->retval = __die_walk_funclines(sp_die, lw->handler, lw->data);
+	if (lw->retval != 0)
+		return DWARF_CB_ABORT;
+
+	return DWARF_CB_OK;
+}
+
+/*
+ * Walk on lines inside given PDIE. If the PDIE is subprogram, walk only on
+ * the lines inside the subprogram, otherwise PDIE must be a CU DIE.
+ */
+static int die_walk_lines(Dwarf_Die *pdie, line_walk_handler_t handler,
+			  void *data)
+{
+	Dwarf_Lines *lines;
+	Dwarf_Line *line;
+	Dwarf_Addr addr;
+	const char *fname;
+	int lineno, ret = 0;
+	Dwarf_Die die_mem, *cu_die;
+	size_t nlines, i;
+
+	/* Get the CU die */
+	if (dwarf_tag(pdie) == DW_TAG_subprogram)
+		cu_die = dwarf_diecu(pdie, &die_mem, NULL, NULL);
+	else
+		cu_die = pdie;
+	if (!cu_die) {
+		pr_debug2("Failed to get CU from subprogram\n");
+		return -EINVAL;
+	}
+
+	/* Get lines list in the CU */
+	if (dwarf_getsrclines(cu_die, &lines, &nlines) != 0) {
+		pr_debug2("Failed to get source lines on this CU.\n");
+		return -ENOENT;
+	}
+	pr_debug2("Get %zd lines from this CU\n", nlines);
+
+	/* Walk on the lines on lines list */
+	for (i = 0; i < nlines; i++) {
+		line = dwarf_onesrcline(lines, i);
+		if (line == NULL ||
+		    dwarf_lineno(line, &lineno) != 0 ||
+		    dwarf_lineaddr(line, &addr) != 0) {
+			pr_debug2("Failed to get line info. "
+				  "Possible error in debuginfo.\n");
+			continue;
+		}
+		/* Filter lines based on address */
+		if (pdie != cu_die)
+			/*
+			 * Address filtering
+			 * The line is included in given function, and
+			 * no inline block includes it.
+			 */
+			if (!dwarf_haspc(pdie, addr) ||
+			    die_find_inlinefunc(pdie, addr, &die_mem))
+				continue;
+		/* Get source line */
+		fname = dwarf_linesrc(line, NULL, NULL);
+
+		ret = handler(fname, lineno, addr, data);
+		if (ret != 0)
+			return ret;
+	}
+
+	/*
+	 * Dwarf lines doesn't include function declarations and inlined
+	 * subroutines. We have to check functions list or given function.
+	 */
+	if (pdie != cu_die)
+		ret = __die_walk_funclines(pdie, handler, data);
+	else {
+		struct __line_walk_param param = {
+			.handler = handler,
+			.data = data,
+			.retval = 0,
+		};
+		dwarf_getfuncs(cu_die, __die_walk_culines_cb, &param, 0);
+		ret = param.retval;
+	}
+
+	return ret;
+}
+
 struct __find_variable_param {
 	const char *name;
 	Dwarf_Addr addr;
@@ -1050,43 +1168,26 @@ static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf)
 	return ret;
 }
 
-/* Find probe point from its line number */
-static int find_probe_point_by_line(struct probe_finder *pf)
+static int probe_point_line_walker(const char *fname, int lineno,
+				   Dwarf_Addr addr, void *data)
 {
-	Dwarf_Lines *lines;
-	Dwarf_Line *line;
-	size_t nlines, i;
-	Dwarf_Addr addr;
-	int lineno;
-	int ret = 0;
+	struct probe_finder *pf = data;
+	int ret;
 
-	if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
-		pr_warning("No source lines found.\n");
-		return -ENOENT;
-	}
+	if (lineno != pf->lno || strtailcmp(fname, pf->fname) != 0)
+		return 0;
 
-	for (i = 0; i < nlines && ret == 0; i++) {
-		line = dwarf_onesrcline(lines, i);
-		if (dwarf_lineno(line, &lineno) != 0 ||
-		    lineno != pf->lno)
-			continue;
+	pf->addr = addr;
+	ret = call_probe_finder(NULL, pf);
 
-		/* TODO: Get fileno from line, but how? */
-		if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0)
-			continue;
-
-		if (dwarf_lineaddr(line, &addr) != 0) {
-			pr_warning("Failed to get the address of the line.\n");
-			return -ENOENT;
-		}
-		pr_debug("Probe line found: line[%d]:%d addr:0x%jx\n",
-			 (int)i, lineno, (uintmax_t)addr);
-		pf->addr = addr;
+	/* Continue if no error, because the line will be in inline function */
+	return ret < 0 ?: 0;
+}
 
-		ret = call_probe_finder(NULL, pf);
-		/* Continuing, because target line might be inlined. */
-	}
-	return ret;
+/* Find probe point from its line number */
+static int find_probe_point_by_line(struct probe_finder *pf)
+{
+	return die_walk_lines(&pf->cu_die, probe_point_line_walker, pf);
 }
 
 /* Find lines which match lazy pattern */
@@ -1140,15 +1241,31 @@ out_close:
 	return nlines;
 }
 
+static int probe_point_lazy_walker(const char *fname, int lineno,
+				   Dwarf_Addr addr, void *data)
+{
+	struct probe_finder *pf = data;
+	int ret;
+
+	if (!line_list__has_line(&pf->lcache, lineno) ||
+	    strtailcmp(fname, pf->fname) != 0)
+		return 0;
+
+	pr_debug("Probe line found: line:%d addr:0x%llx\n",
+		 lineno, (unsigned long long)addr);
+	pf->addr = addr;
+	ret = call_probe_finder(NULL, pf);
+
+	/*
+	 * Continue if no error, because the lazy pattern will match
+	 * to other lines
+	 */
+	return ret < 0 ?: 0;
+}
+
 /* Find probe points from lazy pattern  */
 static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
-	Dwarf_Lines *lines;
-	Dwarf_Line *line;
-	size_t nlines, i;
-	Dwarf_Addr addr;
-	Dwarf_Die die_mem;
-	int lineno;
 	int ret = 0;
 
 	if (list_empty(&pf->lcache)) {
@@ -1162,45 +1279,7 @@ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 			return ret;
 	}
 
-	if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
-		pr_warning("No source lines found.\n");
-		return -ENOENT;
-	}
-
-	for (i = 0; i < nlines && ret >= 0; i++) {
-		line = dwarf_onesrcline(lines, i);
-
-		if (dwarf_lineno(line, &lineno) != 0 ||
-		    !line_list__has_line(&pf->lcache, lineno))
-			continue;
-
-		/* TODO: Get fileno from line, but how? */
-		if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0)
-			continue;
-
-		if (dwarf_lineaddr(line, &addr) != 0) {
-			pr_debug("Failed to get the address of line %d.\n",
-				 lineno);
-			continue;
-		}
-		if (sp_die) {
-			/* Address filtering 1: does sp_die include addr? */
-			if (!dwarf_haspc(sp_die, addr))
-				continue;
-			/* Address filtering 2: No child include addr? */
-			if (die_find_inlinefunc(sp_die, addr, &die_mem))
-				continue;
-		}
-
-		pr_debug("Probe line found: line[%d]:%d addr:0x%llx\n",
-			 (int)i, lineno, (unsigned long long)addr);
-		pf->addr = addr;
-
-		ret = call_probe_finder(sp_die, pf);
-		/* Continuing, because target line might be inlined. */
-	}
-	/* TODO: deallocate lines, but how? */
-	return ret;
+	return die_walk_lines(sp_die, probe_point_lazy_walker, pf);
 }
 
 /* Callback parameter with return value */
@@ -1644,91 +1723,28 @@ static int line_range_add_line(const char *src, unsigned int lineno,
 	return line_list__add_line(&lr->line_list, lineno);
 }
 
-/* Search function declaration lines */
-static int line_range_funcdecl_cb(Dwarf_Die *sp_die, void *data)
+static int line_range_walk_cb(const char *fname, int lineno,
+			      Dwarf_Addr addr __used,
+			      void *data)
 {
-	struct dwarf_callback_param *param = data;
-	struct line_finder *lf = param->data;
-	const char *src;
-	int lineno;
-
-	src = dwarf_decl_file(sp_die);
-	if (src && strtailcmp(src, lf->fname) != 0)
-		return DWARF_CB_OK;
+	struct line_finder *lf = data;
 
-	if (dwarf_decl_line(sp_die, &lineno) != 0 ||
+	if ((strtailcmp(fname, lf->fname) != 0) ||
 	    (lf->lno_s > lineno || lf->lno_e < lineno))
-		return DWARF_CB_OK;
+		return 0;
 
-	param->retval = line_range_add_line(src, lineno, lf->lr);
-	if (param->retval < 0)
-		return DWARF_CB_ABORT;
-	return DWARF_CB_OK;
-}
+	if (line_range_add_line(fname, lineno, lf->lr) < 0)
+		return -EINVAL;
 
-static int find_line_range_func_decl_lines(struct line_finder *lf)
-{
-	struct dwarf_callback_param param = {.data = (void *)lf, .retval = 0};
-	dwarf_getfuncs(&lf->cu_die, line_range_funcdecl_cb, &param, 0);
-	return param.retval;
+	return 0;
 }
 
 /* Find line range from its line number */
 static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 {
-	Dwarf_Lines *lines;
-	Dwarf_Line *line;
-	size_t nlines, i;
-	Dwarf_Addr addr;
-	int lineno, ret = 0;
-	const char *src;
-	Dwarf_Die die_mem;
-
-	line_list__init(&lf->lr->line_list);
-	if (dwarf_getsrclines(&lf->cu_die, &lines, &nlines) != 0) {
-		pr_warning("No source lines found.\n");
-		return -ENOENT;
-	}
-
-	/* Search probable lines on lines list */
-	for (i = 0; i < nlines; i++) {
-		line = dwarf_onesrcline(lines, i);
-		if (dwarf_lineno(line, &lineno) != 0 ||
-		    (lf->lno_s > lineno || lf->lno_e < lineno))
-			continue;
-
-		if (sp_die) {
-			/* Address filtering 1: does sp_die include addr? */
-			if (dwarf_lineaddr(line, &addr) != 0 ||
-			    !dwarf_haspc(sp_die, addr))
-				continue;
-
-			/* Address filtering 2: No child include addr? */
-			if (die_find_inlinefunc(sp_die, addr, &die_mem))
-				continue;
-		}
-
-		/* TODO: Get fileno from line, but how? */
-		src = dwarf_linesrc(line, NULL, NULL);
-		if (strtailcmp(src, lf->fname) != 0)
-			continue;
-
-		ret = line_range_add_line(src, lineno, lf->lr);
-		if (ret < 0)
-			return ret;
-	}
+	int ret;
 
-	/*
-	 * Dwarf lines doesn't include function declarations. We have to
-	 * check functions list or given function.
-	 */
-	if (sp_die) {
-		src = dwarf_decl_file(sp_die);
-		if (src && dwarf_decl_line(sp_die, &lineno) == 0 &&
-		    (lf->lno_s <= lineno && lf->lno_e >= lineno))
-			ret = line_range_add_line(src, lineno, lf->lr);
-	} else
-		ret = find_line_range_func_decl_lines(lf);
+	ret = die_walk_lines(sp_die ?: &lf->cu_die, line_range_walk_cb, lf);
 
 	/* Update status */
 	if (ret >= 0)
@@ -1758,9 +1774,6 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
 	struct line_finder *lf = param->data;
 	struct line_range *lr = lf->lr;
 
-	pr_debug("find (%llx) %s\n",
-		 (unsigned long long)dwarf_dieoffset(sp_die),
-		 dwarf_diename(sp_die));
 	if (dwarf_tag(sp_die) == DW_TAG_subprogram &&
 	    die_compare_name(sp_die, lr->function)) {
 		lf->fname = dwarf_decl_file(sp_die);
-- 
cgit v0.10.2


From 5069ed86be3c2f28bcdf7fae1374ec0c325aafba Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 13 Jan 2011 21:46:05 +0900
Subject: perf probe: Enable to put probe inline function call site

Enable to put probe inline function call site. This will increase line-based
probe-ability.

<Without this patch>
$ ./perf probe -L schedule:48
<schedule:48>
                pre_schedule(rq, prev);

     50         if (unlikely(!rq->nr_running))
                        idle_balance(cpu, rq);

                put_prev_task(rq, prev);
                next = pick_next_task(rq);

     56         if (likely(prev != next)) {
                        sched_info_switch(prev, next);
                        trace_sched_switch_out(prev, next);
                        perf_event_task_sched_out(prev, next);

<With this patch>
$ ./perf probe -L schedule:48
<schedule:48>
     48         pre_schedule(rq, prev);

     50         if (unlikely(!rq->nr_running))
     51                 idle_balance(cpu, rq);

     53         put_prev_task(rq, prev);
     54         next = pick_next_task(rq);

     56         if (likely(prev != next)) {
     57                 sched_info_switch(prev, next);
     58                 trace_sched_switch_out(prev, next);
     59                 perf_event_task_sched_out(prev, next);

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110113124604.22426.48873.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 508c017..69215bf 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -280,6 +280,19 @@ static bool die_compare_name(Dwarf_Die *dw_die, const char *tname)
 	return name ? (strcmp(tname, name) == 0) : false;
 }
 
+/* Get callsite line number of inline-function instance */
+static int die_get_call_lineno(Dwarf_Die *in_die)
+{
+	Dwarf_Attribute attr;
+	Dwarf_Word ret;
+
+	if (!dwarf_attr(in_die, DW_AT_call_line, &attr))
+		return -ENOENT;
+
+	dwarf_formudata(&attr, &ret);
+	return (int)ret;
+}
+
 /* Get type die */
 static Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem)
 {
@@ -463,27 +476,54 @@ typedef int (* line_walk_handler_t) (const char *fname, int lineno,
 				     Dwarf_Addr addr, void *data);
 
 struct __line_walk_param {
+	const char *fname;
 	line_walk_handler_t handler;
 	void *data;
 	int retval;
 };
 
-/* Walk on decl lines in given DIE */
+static int __die_walk_funclines_cb(Dwarf_Die *in_die, void *data)
+{
+	struct __line_walk_param *lw = data;
+	Dwarf_Addr addr;
+	int lineno;
+
+	if (dwarf_tag(in_die) == DW_TAG_inlined_subroutine) {
+		lineno = die_get_call_lineno(in_die);
+		if (lineno > 0 && dwarf_entrypc(in_die, &addr) == 0) {
+			lw->retval = lw->handler(lw->fname, lineno, addr,
+						 lw->data);
+			if (lw->retval != 0)
+				return DIE_FIND_CB_FOUND;
+		}
+	}
+	return DIE_FIND_CB_SIBLING;
+}
+
+/* Walk on lines of blocks included in given DIE */
 static int __die_walk_funclines(Dwarf_Die *sp_die,
 				line_walk_handler_t handler, void *data)
 {
-	const char *fname;
+	struct __line_walk_param lw = {
+		.handler = handler,
+		.data = data,
+		.retval = 0,
+	};
+	Dwarf_Die die_mem;
 	Dwarf_Addr addr;
-	int lineno, ret = 0;
+	int lineno;
 
 	/* Handle function declaration line */
-	fname = dwarf_decl_file(sp_die);
-	if (fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
+	lw.fname = dwarf_decl_file(sp_die);
+	if (lw.fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
 	    dwarf_entrypc(sp_die, &addr) == 0) {
-		ret = handler(fname, lineno, addr, data);
+		lw.retval = handler(lw.fname, lineno, addr, data);
+		if (lw.retval != 0)
+			goto done;
 	}
-
-	return ret;
+	die_find_child(sp_die, __die_walk_funclines_cb, &lw, &die_mem);
+done:
+	return lw.retval;
 }
 
 static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data)
-- 
cgit v0.10.2


From e80711ca8512c8586da0c3e18e2f1caf73c88731 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 13 Jan 2011 21:46:11 +0900
Subject: perf probe: Add --funcs to show available functions in symtab

Add --funcs to show available functions in symtab.

Originally this feature came from Srikar's uprobes patches
( http://lkml.org/lkml/2010/8/27/244 )

e.g.
...
__ablkcipher_walk_complete
__absent_pages_in_range
__account_scheduler_latency
__add_pages
__alloc_pages_nodemask
__alloc_percpu
__alloc_reserved_percpu
__alloc_skb
__alloc_workqueue_key
__any_online_cpu
__ata_ehi_push_desc
...

This also supports symbols in module, e.g.

...
cleanup_module
cpuid_maxphyaddr
emulate_clts
emulate_instruction
emulate_int_real
emulate_invlpg
emulator_get_dr
emulator_set_dr
emulator_task_switch
emulator_write_emulated
emulator_write_phys
fx_init
...

Original-patch-from: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110113124611.22426.10835.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ committer note: Add missing elf.h for STB_GLOBAL that broke a RHEL4 build ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 86b797a..fcc51fe 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -73,6 +73,10 @@ OPTIONS
 	(Only for --vars) Show external defined variables in addition to local
 	variables.
 
+-F::
+--funcs::
+	Show available functions in given module or kernel.
+
 -f::
 --force::
 	Forcibly add events with existing name.
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index add163c..6cf708a 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -52,6 +52,7 @@ static struct {
 	bool show_lines;
 	bool show_vars;
 	bool show_ext_vars;
+	bool show_funcs;
 	bool mod_events;
 	int nevents;
 	struct perf_probe_event events[MAX_PROBES];
@@ -221,6 +222,8 @@ static const struct option options[] = {
 	OPT__DRY_RUN(&probe_event_dry_run),
 	OPT_INTEGER('\0', "max-probes", &params.max_probe_points,
 		 "Set how many probe points can be found for a probe."),
+	OPT_BOOLEAN('F', "funcs", &params.show_funcs,
+		    "Show potential probe-able functions."),
 	OPT_END()
 };
 
@@ -246,7 +249,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 		params.max_probe_points = MAX_PROBES;
 
 	if ((!params.nevents && !params.dellist && !params.list_events &&
-	     !params.show_lines))
+	     !params.show_lines && !params.show_funcs))
 		usage_with_options(probe_usage, options);
 
 	/*
@@ -267,12 +270,36 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 			pr_err(" Error: Don't use --list with --vars.\n");
 			usage_with_options(probe_usage, options);
 		}
+		if (params.show_funcs) {
+			pr_err("  Error: Don't use --list with --funcs.\n");
+			usage_with_options(probe_usage, options);
+		}
 		ret = show_perf_probe_events();
 		if (ret < 0)
 			pr_err("  Error: Failed to show event list. (%d)\n",
 			       ret);
 		return ret;
 	}
+	if (params.show_funcs) {
+		if (params.nevents != 0 || params.dellist) {
+			pr_err("  Error: Don't use --funcs with"
+			       " --add/--del.\n");
+			usage_with_options(probe_usage, options);
+		}
+		if (params.show_lines) {
+			pr_err("  Error: Don't use --funcs with --line.\n");
+			usage_with_options(probe_usage, options);
+		}
+		if (params.show_vars) {
+			pr_err("  Error: Don't use --funcs with --vars.\n");
+			usage_with_options(probe_usage, options);
+		}
+		ret = show_available_funcs(params.target_module);
+		if (ret < 0)
+			pr_err("  Error: Failed to show functions."
+			       " (%d)\n", ret);
+		return ret;
+	}
 
 #ifdef DWARF_SUPPORT
 	if (params.show_lines) {
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 6e29d9c..859d377 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -31,6 +31,7 @@
 #include <string.h>
 #include <stdarg.h>
 #include <limits.h>
+#include <elf.h>
 
 #undef _GNU_SOURCE
 #include "util.h"
@@ -111,7 +112,25 @@ static struct symbol *__find_kernel_function_by_name(const char *name,
 						     NULL);
 }
 
-const char *kernel_get_module_path(const char *module)
+static struct map *kernel_get_module_map(const char *module)
+{
+	struct rb_node *nd;
+	struct map_groups *grp = &machine.kmaps;
+
+	if (!module)
+		module = "kernel";
+
+	for (nd = rb_first(&grp->maps[MAP__FUNCTION]); nd; nd = rb_next(nd)) {
+		struct map *pos = rb_entry(nd, struct map, rb_node);
+		if (strncmp(pos->dso->short_name + 1, module,
+			    pos->dso->short_name_len - 2) == 0) {
+			return pos;
+		}
+	}
+	return NULL;
+}
+
+static struct dso *kernel_get_module_dso(const char *module)
 {
 	struct dso *dso;
 	struct map *map;
@@ -141,7 +160,13 @@ const char *kernel_get_module_path(const char *module)
 		}
 	}
 found:
-	return dso->long_name;
+	return dso;
+}
+
+const char *kernel_get_module_path(const char *module)
+{
+	struct dso *dso = kernel_get_module_dso(module);
+	return (dso) ? dso->long_name : NULL;
 }
 
 #ifdef DWARF_SUPPORT
@@ -1913,3 +1938,42 @@ int del_perf_probe_events(struct strlist *dellist)
 	return ret;
 }
 
+/*
+ * If a symbol corresponds to a function with global binding return 0.
+ * For all others return 1.
+ */
+static int filter_non_global_functions(struct map *map __unused,
+					struct symbol *sym)
+{
+	if (sym->binding != STB_GLOBAL)
+		return 1;
+
+	return 0;
+}
+
+int show_available_funcs(const char *module)
+{
+	struct map *map;
+	int ret;
+
+	setup_pager();
+
+	ret = init_vmlinux();
+	if (ret < 0)
+		return ret;
+
+	map = kernel_get_module_map(module);
+	if (!map) {
+		pr_err("Failed to find %s map.\n", (module) ? : "kernel");
+		return -EINVAL;
+	}
+	if (map__load(map, filter_non_global_functions)) {
+		pr_err("Failed to load map.\n");
+		return -EINVAL;
+	}
+	if (!dso__sorted_by_name(map->dso, map->type))
+		dso__sort_by_name(map->dso, map->type);
+
+	dso__fprintf_symbols_by_name(map->dso, map->type, stdout);
+	return 0;
+}
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 5accbed..1fb4f18 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -127,6 +127,7 @@ extern int show_line_range(struct line_range *lr, const char *module);
 extern int show_available_vars(struct perf_probe_event *pevs, int npevs,
 			       int max_probe_points, const char *module,
 			       bool externs);
+extern int show_available_funcs(const char *module);
 
 
 /* Maximum index number of event-name postfix */
-- 
cgit v0.10.2


From d7065adb9b4f3384c2615f0a3dbdb6c3aae1eb18 Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <fbuihuu@gmail.com>
Date: Sun, 16 Jan 2011 17:14:45 +0100
Subject: perf record: auto detect when stdout is a pipe

This patch gives the ability to 'perf record' to detect when its stdout
has been redirected to a pipe. There's now no more need to add '-o -'
switch in this case.

However '-o <path>' option has always precedence, that is if specified
and stdout has been connected via a pipe then the output will go into
the specified output.

LKML-Reference: <m3ipxo966i.fsf@gmail.com>
Signed-off-by: Franck Bui-Huu <fbuihuu@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 45a3689..1346d42 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -48,7 +48,7 @@ static unsigned int		user_freq 			= UINT_MAX;
 static int			freq				=   1000;
 static int			output;
 static int			pipe_output			=      0;
-static const char		*output_name			= "perf.data";
+static const char		*output_name			= NULL;
 static int			group				=      0;
 static int			realtime_prio			=      0;
 static bool			nodelay				=  false;
@@ -497,18 +497,26 @@ static int __cmd_record(int argc, const char **argv)
 		exit(-1);
 	}
 
-	if (!strcmp(output_name, "-"))
-		pipe_output = 1;
-	else if (!stat(output_name, &st) && st.st_size) {
-		if (write_mode == WRITE_FORCE) {
-			char oldname[PATH_MAX];
-			snprintf(oldname, sizeof(oldname), "%s.old",
-				 output_name);
-			unlink(oldname);
-			rename(output_name, oldname);
+	if (!output_name) {
+		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
+			pipe_output = 1;
+		else
+			output_name = "perf.data";
+	}
+	if (output_name) {
+		if (!strcmp(output_name, "-"))
+			pipe_output = 1;
+		else if (!stat(output_name, &st) && st.st_size) {
+			if (write_mode == WRITE_FORCE) {
+				char oldname[PATH_MAX];
+				snprintf(oldname, sizeof(oldname), "%s.old",
+					 output_name);
+				unlink(oldname);
+				rename(output_name, oldname);
+			}
+		} else if (write_mode == WRITE_APPEND) {
+			write_mode = WRITE_FORCE;
 		}
-	} else if (write_mode == WRITE_APPEND) {
-		write_mode = WRITE_FORCE;
 	}
 
 	flags = O_CREAT|O_RDWR;
-- 
cgit v0.10.2


From 17ea1b70a87e28457821318341bead2b45563092 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 Jan 2011 14:40:46 -0200
Subject: perf tools: Pass the struct opt to the wildcard parsing routine

It is needed because it will call parse_event for each tracepoint
name that matches, and we pass the perf_evlist via opt->value.

Problem introduced in 4503fdd where my assumption about opt being
always non NULL made me not look at callers of parse_events outside
builtin-*.c.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index d3086ce..cf082da 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -446,8 +446,8 @@ parse_single_tracepoint_event(char *sys_name,
 /* sys + ':' + event + ':' + flags*/
 #define MAX_EVOPT_LEN	(MAX_EVENT_LENGTH * 2 + 2 + 128)
 static enum event_result
-parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp,
-				char *flags)
+parse_multiple_tracepoint_event(const struct option *opt, char *sys_name,
+				const char *evt_exp, char *flags)
 {
 	char evt_path[MAXPATHLEN];
 	struct dirent *evt_ent;
@@ -480,15 +480,16 @@ parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp,
 		if (len < 0)
 			return EVT_FAILED;
 
-		if (parse_events(NULL, event_opt, 0))
+		if (parse_events(opt, event_opt, 0))
 			return EVT_FAILED;
 	}
 
 	return EVT_HANDLED_ALL;
 }
 
-static enum event_result parse_tracepoint_event(const char **strp,
-				    struct perf_event_attr *attr)
+static enum event_result
+parse_tracepoint_event(const struct option *opt, const char **strp,
+		       struct perf_event_attr *attr)
 {
 	const char *evt_name;
 	char *flags = NULL, *comma_loc;
@@ -527,7 +528,7 @@ static enum event_result parse_tracepoint_event(const char **strp,
 		return EVT_FAILED;
 	if (strpbrk(evt_name, "*?")) {
 		*strp += strlen(sys_name) + evt_length + 1; /* 1 == the ':' */
-		return parse_multiple_tracepoint_event(sys_name, evt_name,
+		return parse_multiple_tracepoint_event(opt, sys_name, evt_name,
 						       flags);
 	} else {
 		return parse_single_tracepoint_event(sys_name, evt_name,
@@ -737,11 +738,12 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr)
  * Symbolic names are (almost) exactly matched.
  */
 static enum event_result
-parse_event_symbols(const char **str, struct perf_event_attr *attr)
+parse_event_symbols(const struct option *opt, const char **str,
+		    struct perf_event_attr *attr)
 {
 	enum event_result ret;
 
-	ret = parse_tracepoint_event(str, attr);
+	ret = parse_tracepoint_event(opt, str, attr);
 	if (ret != EVT_FAILED)
 		goto modifier;
 
@@ -783,7 +785,7 @@ int parse_events(const struct option *opt, const char *str, int unset __used)
 
 	for (;;) {
 		memset(&attr, 0, sizeof(attr));
-		ret = parse_event_symbols(&str, &attr);
+		ret = parse_event_symbols(opt, &str, &attr);
 		if (ret == EVT_FAILED)
 			return -1;
 
-- 
cgit v0.10.2


From fd78260b5376173faeb17127bd63b3c99a8e8bfb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 18 Jan 2011 15:15:24 -0200
Subject: perf threads: Move thread_map to separate file

To untangle it from struct thread handling, that is tied to symbols, etc.

Right now in the python bindings I'm working on I need just a subset of
the util/ files, untangling it allows me to do that.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index f20bc6f..638e8e1 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -426,6 +426,7 @@ LIB_H += util/values.h
 LIB_H += util/sort.h
 LIB_H += util/hist.h
 LIB_H += util/thread.h
+LIB_H += util/thread_map.h
 LIB_H += util/trace-event.h
 LIB_H += util/probe-finder.h
 LIB_H += util/probe-event.h
@@ -471,6 +472,7 @@ LIB_OBJS += $(OUTPUT)util/map.o
 LIB_OBJS += $(OUTPUT)util/pstack.o
 LIB_OBJS += $(OUTPUT)util/session.o
 LIB_OBJS += $(OUTPUT)util/thread.o
+LIB_OBJS += $(OUTPUT)util/thread_map.o
 LIB_OBJS += $(OUTPUT)util/trace-event-parse.o
 LIB_OBJS += $(OUTPUT)util/trace-event-read.o
 LIB_OBJS += $(OUTPUT)util/trace-event-info.o
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1346d42..d788630 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -24,6 +24,7 @@
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/cpumap.h"
+#include "util/thread_map.h"
 
 #include <unistd.h>
 #include <sched.h>
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index e2a2d02..8906adf 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -49,6 +49,7 @@
 #include "util/header.h"
 #include "util/cpumap.h"
 #include "util/thread.h"
+#include "util/thread_map.h"
 
 #include <sys/prctl.h>
 #include <math.h>
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 4fd3453..dc91ee0 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -12,7 +12,7 @@
 #include "util/parse-events.h"
 #include "util/session.h"
 #include "util/symbol.h"
-#include "util/thread.h"
+#include "util/thread_map.h"
 
 static long page_size;
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 31fbaf3..d0b16d9 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -26,6 +26,7 @@
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/thread.h"
+#include "util/thread_map.h"
 #include "util/util.h"
 #include <linux/rbtree.h>
 #include "util/parse-options.h"
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ee49035..9a6d942 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -3,7 +3,7 @@
 #include "../perf.h"
 #include "util.h"
 #include "cpumap.h"
-#include "thread.h"
+#include "thread_map.h"
 
 #include <unistd.h>
 #include <sys/mman.h>
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 00f4ead..d5d3b22 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -7,61 +7,6 @@
 #include "util.h"
 #include "debug.h"
 
-/* Skip "." and ".." directories */
-static int filter(const struct dirent *dir)
-{
-	if (dir->d_name[0] == '.')
-		return 0;
-	else
-		return 1;
-}
-
-struct thread_map *thread_map__new_by_pid(pid_t pid)
-{
-	struct thread_map *threads;
-	char name[256];
-	int items;
-	struct dirent **namelist = NULL;
-	int i;
-
-	sprintf(name, "/proc/%d/task", pid);
-	items = scandir(name, &namelist, filter, NULL);
-	if (items <= 0)
-                return NULL;
-
-	threads = malloc(sizeof(*threads) + sizeof(pid_t) * items);
-	if (threads != NULL) {
-		for (i = 0; i < items; i++)
-			threads->map[i] = atoi(namelist[i]->d_name);
-		threads->nr = items;
-	}
-
-	for (i=0; i<items; i++)
-		free(namelist[i]);
-	free(namelist);
-
-	return threads;
-}
-
-struct thread_map *thread_map__new_by_tid(pid_t tid)
-{
-	struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
-
-	if (threads != NULL) {
-		threads->map[0] = tid;
-		threads->nr	= 1;
-	}
-
-	return threads;
-}
-
-struct thread_map *thread_map__new(pid_t pid, pid_t tid)
-{
-	if (pid != -1)
-		return thread_map__new_by_pid(pid);
-	return thread_map__new_by_tid(tid);
-}
-
 static struct thread *thread__new(pid_t pid)
 {
 	struct thread *self = zalloc(sizeof(*self));
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index d757410..e5f2401 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -18,24 +18,10 @@ struct thread {
 	int			comm_len;
 };
 
-struct thread_map {
-	int nr;
-	int map[];
-};
-
 struct perf_session;
 
 void thread__delete(struct thread *self);
 
-struct thread_map *thread_map__new_by_pid(pid_t pid);
-struct thread_map *thread_map__new_by_tid(pid_t tid);
-struct thread_map *thread_map__new(pid_t pid, pid_t tid);
-
-static inline void thread_map__delete(struct thread_map *threads)
-{
-	free(threads);
-}
-
 int thread__set_comm(struct thread *self, const char *comm);
 int thread__comm_len(struct thread *self);
 struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
new file mode 100644
index 0000000..a5df131
--- /dev/null
+++ b/tools/perf/util/thread_map.c
@@ -0,0 +1,64 @@
+#include <dirent.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "thread_map.h"
+
+/* Skip "." and ".." directories */
+static int filter(const struct dirent *dir)
+{
+	if (dir->d_name[0] == '.')
+		return 0;
+	else
+		return 1;
+}
+
+struct thread_map *thread_map__new_by_pid(pid_t pid)
+{
+	struct thread_map *threads;
+	char name[256];
+	int items;
+	struct dirent **namelist = NULL;
+	int i;
+
+	sprintf(name, "/proc/%d/task", pid);
+	items = scandir(name, &namelist, filter, NULL);
+	if (items <= 0)
+                return NULL;
+
+	threads = malloc(sizeof(*threads) + sizeof(pid_t) * items);
+	if (threads != NULL) {
+		for (i = 0; i < items; i++)
+			threads->map[i] = atoi(namelist[i]->d_name);
+		threads->nr = items;
+	}
+
+	for (i=0; i<items; i++)
+		free(namelist[i]);
+	free(namelist);
+
+	return threads;
+}
+
+struct thread_map *thread_map__new_by_tid(pid_t tid)
+{
+	struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
+
+	if (threads != NULL) {
+		threads->map[0] = tid;
+		threads->nr	= 1;
+	}
+
+	return threads;
+}
+
+struct thread_map *thread_map__new(pid_t pid, pid_t tid)
+{
+	if (pid != -1)
+		return thread_map__new_by_pid(pid);
+	return thread_map__new_by_tid(tid);
+}
+
+void thread_map__delete(struct thread_map *threads)
+{
+	free(threads);
+}
diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h
new file mode 100644
index 0000000..3cb9073
--- /dev/null
+++ b/tools/perf/util/thread_map.h
@@ -0,0 +1,15 @@
+#ifndef __PERF_THREAD_MAP_H
+#define __PERF_THREAD_MAP_H
+
+#include <sys/types.h>
+
+struct thread_map {
+	int nr;
+	int map[];
+};
+
+struct thread_map *thread_map__new_by_pid(pid_t pid);
+struct thread_map *thread_map__new_by_tid(pid_t tid);
+struct thread_map *thread_map__new(pid_t pid, pid_t tid);
+void thread_map__delete(struct thread_map *threads);
+#endif	/* __PERF_THREAD_MAP_H */
-- 
cgit v0.10.2


From d0dd74e853a0a6f37e8061d6d50be41c7034c54c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 21 Jan 2011 13:46:41 -0200
Subject: perf tools: Move event__parse_sample to evsel.c

To avoid linking more stuff in the python binding I'm working on, future
csets will make the sample type be taken from the evsel itself, but for
that we need to first have one file per cpu and per sample_type, not a
single perf.data file.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index dc91ee0..231e3e2 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -10,7 +10,6 @@
 #include "util/evlist.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
-#include "util/session.h"
 #include "util/symbol.h"
 #include "util/thread_map.h"
 
@@ -457,7 +456,6 @@ static int test__basic_mmap(void)
 	int err = -1;
 	event_t *event;
 	struct thread_map *threads;
-	struct perf_session session;
 	struct cpu_map *cpus;
 	struct perf_evlist *evlist;
 	struct perf_event_attr attr = {
@@ -521,13 +519,6 @@ static int test__basic_mmap(void)
 	attr.wakeup_events = 1;
 	attr.sample_period = 1;
 
-	/*
- 	 * FIXME: use evsel->attr.sample_type in event__parse_sample.
- 	 * 	  This will nicely remove the requirement that we have
- 	 * 	  all the events with the same sample_type.
- 	 */
-	session.sample_type = attr.sample_type;
-
 	for (i = 0; i < nsyscalls; ++i) {
 		attr.config = ids[i];
 		evsels[i] = perf_evsel__new(&attr, i);
@@ -567,7 +558,7 @@ static int test__basic_mmap(void)
 			goto out_munmap;
 		}
 
-		event__parse_sample(event, &session, &sample);
+		event__parse_sample(event, attr.sample_type, false, &sample);
 		evsel = perf_evlist__id2evsel(evlist, sample.id);
 		if (evsel == NULL) {
 			pr_debug("event with id %" PRIu64
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index d0b16d9..ce2e50c 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1106,7 +1106,7 @@ static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
 	event_t *event;
 
 	while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) {
-		event__parse_sample(event, self, &sample);
+		perf_session__parse_sample(self, event, &sample);
 
 		if (event->header.type == PERF_RECORD_SAMPLE)
 			event__process_sample(event, &sample, self);
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 1478ab4..e4db8b8 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -826,128 +826,3 @@ out_filtered:
 	al->filtered = true;
 	return 0;
 }
-
-static int event__parse_id_sample(const event_t *event,
-				  struct perf_session *session,
-				  struct sample_data *sample)
-{
-	const u64 *array;
-	u64 type;
-
-	sample->cpu = sample->pid = sample->tid = -1;
-	sample->stream_id = sample->id = sample->time = -1ULL;
-
-	if (!session->sample_id_all)
-		return 0;
-
-	array = event->sample.array;
-	array += ((event->header.size -
-		   sizeof(event->header)) / sizeof(u64)) - 1;
-	type = session->sample_type;
-
-	if (type & PERF_SAMPLE_CPU) {
-		u32 *p = (u32 *)array;
-		sample->cpu = *p;
-		array--;
-	}
-
-	if (type & PERF_SAMPLE_STREAM_ID) {
-		sample->stream_id = *array;
-		array--;
-	}
-
-	if (type & PERF_SAMPLE_ID) {
-		sample->id = *array;
-		array--;
-	}
-
-	if (type & PERF_SAMPLE_TIME) {
-		sample->time = *array;
-		array--;
-	}
-
-	if (type & PERF_SAMPLE_TID) {
-		u32 *p = (u32 *)array;
-		sample->pid = p[0];
-		sample->tid = p[1];
-	}
-
-	return 0;
-}
-
-int event__parse_sample(const event_t *event, struct perf_session *session,
-			struct sample_data *data)
-{
-	const u64 *array;
-	u64 type;
-
-	if (event->header.type != PERF_RECORD_SAMPLE)
-		return event__parse_id_sample(event, session, data);
-
-	array = event->sample.array;
-	type = session->sample_type;
-
-	if (type & PERF_SAMPLE_IP) {
-		data->ip = event->ip.ip;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_TID) {
-		u32 *p = (u32 *)array;
-		data->pid = p[0];
-		data->tid = p[1];
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_TIME) {
-		data->time = *array;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_ADDR) {
-		data->addr = *array;
-		array++;
-	}
-
-	data->id = -1ULL;
-	if (type & PERF_SAMPLE_ID) {
-		data->id = *array;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_STREAM_ID) {
-		data->stream_id = *array;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_CPU) {
-		u32 *p = (u32 *)array;
-		data->cpu = *p;
-		array++;
-	} else
-		data->cpu = -1;
-
-	if (type & PERF_SAMPLE_PERIOD) {
-		data->period = *array;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_READ) {
-		pr_debug("PERF_SAMPLE_READ is unsuported for now\n");
-		return -1;
-	}
-
-	if (type & PERF_SAMPLE_CALLCHAIN) {
-		data->callchain = (struct ip_callchain *)array;
-		array += 1 + data->callchain->nr;
-	}
-
-	if (type & PERF_SAMPLE_RAW) {
-		u32 *p = (u32 *)array;
-		data->raw_size = *p;
-		p++;
-		data->raw_data = p;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 2b7e919..d79e4ed 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -169,9 +169,10 @@ struct addr_location;
 int event__preprocess_sample(const event_t *self, struct perf_session *session,
 			     struct addr_location *al, struct sample_data *data,
 			     symbol_filter_t filter);
-int event__parse_sample(const event_t *event, struct perf_session *session,
-			struct sample_data *sample);
 
 const char *event__get_event_name(unsigned int id);
 
+int event__parse_sample(const event_t *event, u64 type, bool sample_id_all,
+			struct sample_data *sample);
+
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 9a6d942..a85ae12 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -355,3 +355,121 @@ out_unmap:
 	}
 	return -1;
 }
+
+static int event__parse_id_sample(const event_t *event, u64 type,
+				  struct sample_data *sample)
+{
+	const u64 *array = event->sample.array;
+
+	array += ((event->header.size -
+		   sizeof(event->header)) / sizeof(u64)) - 1;
+
+	if (type & PERF_SAMPLE_CPU) {
+		u32 *p = (u32 *)array;
+		sample->cpu = *p;
+		array--;
+	}
+
+	if (type & PERF_SAMPLE_STREAM_ID) {
+		sample->stream_id = *array;
+		array--;
+	}
+
+	if (type & PERF_SAMPLE_ID) {
+		sample->id = *array;
+		array--;
+	}
+
+	if (type & PERF_SAMPLE_TIME) {
+		sample->time = *array;
+		array--;
+	}
+
+	if (type & PERF_SAMPLE_TID) {
+		u32 *p = (u32 *)array;
+		sample->pid = p[0];
+		sample->tid = p[1];
+	}
+
+	return 0;
+}
+
+int event__parse_sample(const event_t *event, u64 type, bool sample_id_all,
+			struct sample_data *data)
+{
+	const u64 *array;
+
+	data->cpu = data->pid = data->tid = -1;
+	data->stream_id = data->id = data->time = -1ULL;
+
+	if (event->header.type != PERF_RECORD_SAMPLE) {
+		if (!sample_id_all)
+			return 0;
+		return event__parse_id_sample(event, type, data);
+	}
+
+	array = event->sample.array;
+
+	if (type & PERF_SAMPLE_IP) {
+		data->ip = event->ip.ip;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_TID) {
+		u32 *p = (u32 *)array;
+		data->pid = p[0];
+		data->tid = p[1];
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_TIME) {
+		data->time = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_ADDR) {
+		data->addr = *array;
+		array++;
+	}
+
+	data->id = -1ULL;
+	if (type & PERF_SAMPLE_ID) {
+		data->id = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_STREAM_ID) {
+		data->stream_id = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_CPU) {
+		u32 *p = (u32 *)array;
+		data->cpu = *p;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_PERIOD) {
+		data->period = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_READ) {
+		fprintf(stderr, "PERF_SAMPLE_READ is unsuported for now\n");
+		return -1;
+	}
+
+	if (type & PERF_SAMPLE_CALLCHAIN) {
+		data->callchain = (struct ip_callchain *)array;
+		array += 1 + data->callchain->nr;
+	}
+
+	if (type & PERF_SAMPLE_RAW) {
+		u32 *p = (u32 *)array;
+		data->raw_size = *p;
+		p++;
+		data->raw_data = p;
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index b58a48a..e6a0740 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -496,7 +496,7 @@ static void flush_sample_queue(struct perf_session *s,
 		if (iter->timestamp > limit)
 			break;
 
-		event__parse_sample(iter->event, s, &sample);
+		perf_session__parse_sample(s, iter->event, &sample);
 		perf_session_deliver_event(s, iter->event, &sample, ops,
 					   iter->file_offset);
 
@@ -806,7 +806,7 @@ static int perf_session__process_event(struct perf_session *session,
 	/*
 	 * For all kernel events we get the sample data
 	 */
-	event__parse_sample(event, session, &sample);
+	perf_session__parse_sample(session, event, &sample);
 
 	/* Preprocess sample records - precheck callchains */
 	if (perf_session__preprocess_sample(session, event, &sample))
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index e815468..7823976 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -155,4 +155,13 @@ size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp)
 {
 	return hists__fprintf_nr_events(&self->hists, fp);
 }
+
+static inline int perf_session__parse_sample(struct perf_session *session,
+					     const event_t *event,
+					     struct sample_data *sample)
+{
+	return event__parse_sample(event, session->sample_type,
+				   session->sample_id_all, sample);
+}
+
 #endif /* __PERF_SESSION_H */
-- 
cgit v0.10.2


From ef1d1af28ca37fdbc2745da040529cd2953c1af5 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 18 Jan 2011 21:41:45 -0200
Subject: perf evsel: Introduce perf_evsel__{in,ex}it

Out of the {con,des}structor, as in interpreted language bindings we will
need to go back from the wrapper object to the real thing. In that case
using container_of will save us to have an extra pointer in the perf_evsel
struct.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 4b3b84c..df0610e 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -6,17 +6,21 @@
 #include <linux/bitops.h>
 #include <linux/hash.h>
 
+void perf_evlist__init(struct perf_evlist *evlist)
+{
+	int i;
+
+	for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
+		INIT_HLIST_HEAD(&evlist->heads[i]);
+	INIT_LIST_HEAD(&evlist->entries);
+}
+
 struct perf_evlist *perf_evlist__new(void)
 {
 	struct perf_evlist *evlist = zalloc(sizeof(*evlist));
 
-	if (evlist != NULL) {
-		int i;
-
-		for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
-			INIT_HLIST_HEAD(&evlist->heads[i]);
-		INIT_LIST_HEAD(&evlist->entries);
-	}
+	if (evlist != NULL)
+		perf_evlist__init(evlist);
 
 	return evlist;
 }
@@ -33,11 +37,18 @@ static void perf_evlist__purge(struct perf_evlist *evlist)
 	evlist->nr_entries = 0;
 }
 
-void perf_evlist__delete(struct perf_evlist *evlist)
+void perf_evlist__exit(struct perf_evlist *evlist)
 {
-	perf_evlist__purge(evlist);
 	free(evlist->mmap);
 	free(evlist->pollfd);
+	evlist->mmap = NULL;
+	evlist->pollfd = NULL;
+}
+
+void perf_evlist__delete(struct perf_evlist *evlist)
+{
+	perf_evlist__purge(evlist);
+	perf_evlist__exit(evlist);
 	free(evlist);
 }
 
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 2871206..acbe48e 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -24,6 +24,8 @@ struct perf_evlist {
 struct perf_evsel;
 
 struct perf_evlist *perf_evlist__new(void);
+void perf_evlist__init(struct perf_evlist *evlist);
+void perf_evlist__exit(struct perf_evlist *evlist);
 void perf_evlist__delete(struct perf_evlist *evlist);
 
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index a85ae12..76ab553 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -14,15 +14,20 @@
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
 #define SID(e, x, y) xyarray__entry(e->id, x, y)
 
+void perf_evsel__init(struct perf_evsel *evsel,
+		      struct perf_event_attr *attr, int idx)
+{
+	evsel->idx	   = idx;
+	evsel->attr	   = *attr;
+	INIT_LIST_HEAD(&evsel->node);
+}
+
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
 {
 	struct perf_evsel *evsel = zalloc(sizeof(*evsel));
 
-	if (evsel != NULL) {
-		evsel->idx	   = idx;
-		evsel->attr	   = *attr;
-		INIT_LIST_HEAD(&evsel->node);
-	}
+	if (evsel != NULL)
+		perf_evsel__init(evsel, attr, idx);
 
 	return evsel;
 }
@@ -87,11 +92,16 @@ int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus)
 	return evlist->mmap != NULL ? 0 : -ENOMEM;
 }
 
-void perf_evsel__delete(struct perf_evsel *evsel)
+void perf_evsel__exit(struct perf_evsel *evsel)
 {
 	assert(list_empty(&evsel->node));
 	xyarray__delete(evsel->fd);
 	xyarray__delete(evsel->id);
+}
+
+void perf_evsel__delete(struct perf_evsel *evsel)
+{
+	perf_evsel__exit(evsel);
 	free(evsel);
 }
 
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 667ee4e..7962e75 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -52,6 +52,9 @@ struct thread_map;
 struct perf_evlist;
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
+void perf_evsel__init(struct perf_evsel *evsel,
+		      struct perf_event_attr *attr, int idx);
+void perf_evsel__exit(struct perf_evsel *evsel);
 void perf_evsel__delete(struct perf_evsel *evsel);
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
-- 
cgit v0.10.2


From 9599ec0471deae24044241e2173090d2cbc0e899 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 17 Jan 2011 17:39:15 -0800
Subject: x86-64, mem: Convert memmove() to assembly file and fix return value
 bug

memmove_64.c only implements memmove() function which is completely written in
inline assembly code. Therefore it doesn't make sense to keep the assembly code
in .c file.

Currently memmove() doesn't store return value to rax. This may cause issue if
caller uses the return value. The patch fixes this issue.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1b950d1..9796c2f 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(memmove);
 
 EXPORT_SYMBOL(empty_zero_page);
 #ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 0000000..0ecb843
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,197 @@
+/*
+ * Normally compiler builtins are used, but sometimes the compiler calls out
+ * of line code. Based on asm-i386/string.h.
+ *
+ * This assembly file is re-written from memmove_64.c file.
+ *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
+ */
+#define _STRING_C
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#undef memmove
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+ENTRY(memmove)
+	CFI_STARTPROC
+	/* Handle more 32bytes in loop */
+	mov %rdi, %rax
+	cmp $0x20, %rdx
+	jb	1f
+
+	/* Decide forward/backward copy mode */
+	cmp %rdi, %rsi
+	jb	2f
+
+	/*
+	 * movsq instruction have many startup latency
+	 * so we handle small size by general register.
+	 */
+	cmp  $680, %rdx
+	jb	3f
+	/*
+	 * movsq instruction is only good for aligned case.
+	 */
+
+	cmpb %dil, %sil
+	je 4f
+3:
+	sub $0x20, %rdx
+	/*
+	 * We gobble 32byts forward in each loop.
+	 */
+5:
+	sub $0x20, %rdx
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r8
+	leaq 4*8(%rsi), %rsi
+
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, 2*8(%rdi)
+	movq %r8, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae 5b
+	addq $0x20, %rdx
+	jmp 1f
+	/*
+	 * Handle data forward by movsq.
+	 */
+	.p2align 4
+4:
+	movq %rdx, %rcx
+	movq -8(%rsi, %rdx), %r11
+	lea -8(%rdi, %rdx), %r10
+	shrq $3, %rcx
+	rep movsq
+	movq %r11, (%r10)
+	jmp 13f
+	/*
+	 * Handle data backward by movsq.
+	 */
+	.p2align 4
+7:
+	movq %rdx, %rcx
+	movq (%rsi), %r11
+	movq %rdi, %r10
+	leaq -8(%rsi, %rdx), %rsi
+	leaq -8(%rdi, %rdx), %rdi
+	shrq $3, %rcx
+	std
+	rep movsq
+	cld
+	movq %r11, (%r10)
+	jmp 13f
+
+	/*
+	 * Start to prepare for backward copy.
+	 */
+	.p2align 4
+2:
+	cmp $680, %rdx
+	jb 6f
+	cmp %dil, %sil
+	je 7b
+6:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * We gobble 32byts backward in each loop.
+	 */
+8:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r11
+	movq -2*8(%rsi), %r10
+	movq -3*8(%rsi), %r9
+	movq -4*8(%rsi), %r8
+	leaq -4*8(%rsi), %rsi
+
+	movq %r11, -1*8(%rdi)
+	movq %r10, -2*8(%rdi)
+	movq %r9, -3*8(%rdi)
+	movq %r8, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae 8b
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
+1:
+	cmpq $16, %rdx
+	jb 9f
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq -2*8(%rsi, %rdx), %r9
+	movq -1*8(%rsi, %rdx), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, -2*8(%rdi, %rdx)
+	movq %r8, -1*8(%rdi, %rdx)
+	jmp 13f
+	.p2align 4
+9:
+	cmpq $8, %rdx
+	jb 10f
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq -1*8(%rsi, %rdx), %r10
+	movq %r11, 0*8(%rdi)
+	movq %r10, -1*8(%rdi, %rdx)
+	jmp 13f
+10:
+	cmpq $4, %rdx
+	jb 11f
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %r11d
+	movl -4(%rsi, %rdx), %r10d
+	movl %r11d, (%rdi)
+	movl %r10d, -4(%rdi, %rdx)
+	jmp 13f
+11:
+	cmp $2, %rdx
+	jb 12f
+	/*
+	 * Move data from 2 bytes to 3 bytes.
+	 */
+	movw (%rsi), %r11w
+	movw -2(%rsi, %rdx), %r10w
+	movw %r11w, (%rdi)
+	movw %r10w, -2(%rdi, %rdx)
+	jmp 13f
+12:
+	cmp $1, %rdx
+	jb 13f
+	/*
+	 * Move data for 1 byte.
+	 */
+	movb (%rsi), %r11b
+	movb %r11b, (%rdi)
+13:
+	retq
+	CFI_ENDPROC
+ENDPROC(memmove)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 6d0f0ec..0000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Normally compiler builtins are used, but sometimes the compiler calls out
-   of line code. Based on asm-i386/string.h.
- */
-#define _STRING_C
-#include <linux/string.h>
-#include <linux/module.h>
-
-#undef memmove
-void *memmove(void *dest, const void *src, size_t count)
-{
-	unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
-	char *ret;
-
-	__asm__ __volatile__(
-		/* Handle more 32bytes in loop */
-		"mov %2, %3\n\t"
-		"cmp $0x20, %0\n\t"
-		"jb	1f\n\t"
-
-		/* Decide forward/backward copy mode */
-		"cmp %2, %1\n\t"
-		"jb	2f\n\t"
-
-		/*
-		 * movsq instruction have many startup latency
-		 * so we handle small size by general register.
-		 */
-		"cmp  $680, %0\n\t"
-		"jb 3f\n\t"
-		/*
-		 * movsq instruction is only good for aligned case.
-		 */
-		"cmpb %%dil, %%sil\n\t"
-		"je 4f\n\t"
-		"3:\n\t"
-		"sub $0x20, %0\n\t"
-		/*
-		 * We gobble 32byts forward in each loop.
-		 */
-		"5:\n\t"
-		"sub $0x20, %0\n\t"
-		"movq 0*8(%1), %4\n\t"
-		"movq 1*8(%1), %5\n\t"
-		"movq 2*8(%1), %6\n\t"
-		"movq 3*8(%1), %7\n\t"
-		"leaq 4*8(%1), %1\n\t"
-
-		"movq %4, 0*8(%2)\n\t"
-		"movq %5, 1*8(%2)\n\t"
-		"movq %6, 2*8(%2)\n\t"
-		"movq %7, 3*8(%2)\n\t"
-		"leaq 4*8(%2), %2\n\t"
-		"jae 5b\n\t"
-		"addq $0x20, %0\n\t"
-		"jmp 1f\n\t"
-		/*
-		 * Handle data forward by movsq.
-		 */
-		".p2align 4\n\t"
-		"4:\n\t"
-		"movq %0, %8\n\t"
-		"movq -8(%1, %0), %4\n\t"
-		"lea -8(%2, %0), %5\n\t"
-		"shrq $3, %8\n\t"
-		"rep movsq\n\t"
-		"movq %4, (%5)\n\t"
-		"jmp 13f\n\t"
-		/*
-		 * Handle data backward by movsq.
-		 */
-		".p2align 4\n\t"
-		"7:\n\t"
-		"movq %0, %8\n\t"
-		"movq (%1), %4\n\t"
-		"movq %2, %5\n\t"
-		"leaq -8(%1, %0), %1\n\t"
-		"leaq -8(%2, %0), %2\n\t"
-		"shrq $3, %8\n\t"
-		"std\n\t"
-		"rep movsq\n\t"
-		"cld\n\t"
-		"movq %4, (%5)\n\t"
-		"jmp 13f\n\t"
-
-		/*
-		 * Start to prepare for backward copy.
-		 */
-		".p2align 4\n\t"
-		"2:\n\t"
-		"cmp $680, %0\n\t"
-		"jb 6f \n\t"
-		"cmp %%dil, %%sil\n\t"
-		"je 7b \n\t"
-		"6:\n\t"
-		/*
-		 * Calculate copy position to tail.
-		 */
-		"addq %0, %1\n\t"
-		"addq %0, %2\n\t"
-		"subq $0x20, %0\n\t"
-		/*
-		 * We gobble 32byts backward in each loop.
-		 */
-		"8:\n\t"
-		"subq $0x20, %0\n\t"
-		"movq -1*8(%1), %4\n\t"
-		"movq -2*8(%1), %5\n\t"
-		"movq -3*8(%1), %6\n\t"
-		"movq -4*8(%1), %7\n\t"
-		"leaq -4*8(%1), %1\n\t"
-
-		"movq %4, -1*8(%2)\n\t"
-		"movq %5, -2*8(%2)\n\t"
-		"movq %6, -3*8(%2)\n\t"
-		"movq %7, -4*8(%2)\n\t"
-		"leaq -4*8(%2), %2\n\t"
-		"jae 8b\n\t"
-		/*
-		 * Calculate copy position to head.
-		 */
-		"addq $0x20, %0\n\t"
-		"subq %0, %1\n\t"
-		"subq %0, %2\n\t"
-		"1:\n\t"
-		"cmpq $16, %0\n\t"
-		"jb 9f\n\t"
-		/*
-		 * Move data from 16 bytes to 31 bytes.
-		 */
-		"movq 0*8(%1), %4\n\t"
-		"movq 1*8(%1), %5\n\t"
-		"movq -2*8(%1, %0), %6\n\t"
-		"movq -1*8(%1, %0), %7\n\t"
-		"movq %4, 0*8(%2)\n\t"
-		"movq %5, 1*8(%2)\n\t"
-		"movq %6, -2*8(%2, %0)\n\t"
-		"movq %7, -1*8(%2, %0)\n\t"
-		"jmp 13f\n\t"
-		".p2align 4\n\t"
-		"9:\n\t"
-		"cmpq $8, %0\n\t"
-		"jb 10f\n\t"
-		/*
-		 * Move data from 8 bytes to 15 bytes.
-		 */
-		"movq 0*8(%1), %4\n\t"
-		"movq -1*8(%1, %0), %5\n\t"
-		"movq %4, 0*8(%2)\n\t"
-		"movq %5, -1*8(%2, %0)\n\t"
-		"jmp 13f\n\t"
-		"10:\n\t"
-		"cmpq $4, %0\n\t"
-		"jb 11f\n\t"
-		/*
-		 * Move data from 4 bytes to 7 bytes.
-		 */
-		"movl (%1), %4d\n\t"
-		"movl -4(%1, %0), %5d\n\t"
-		"movl %4d, (%2)\n\t"
-		"movl %5d, -4(%2, %0)\n\t"
-		"jmp 13f\n\t"
-		"11:\n\t"
-		"cmp $2, %0\n\t"
-		"jb 12f\n\t"
-		/*
-		 * Move data from 2 bytes to 3 bytes.
-		 */
-		"movw (%1), %4w\n\t"
-		"movw -2(%1, %0), %5w\n\t"
-		"movw %4w, (%2)\n\t"
-		"movw %5w, -2(%2, %0)\n\t"
-		"jmp 13f\n\t"
-		"12:\n\t"
-		"cmp $1, %0\n\t"
-		"jb 13f\n\t"
-		/*
-		 * Move data for 1 byte.
-		 */
-		"movb (%1), %4b\n\t"
-		"movb %4b, (%2)\n\t"
-		"13:\n\t"
-		: "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
-		  "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
-		:"0" (count),
-		 "1" (src),
-		 "2" (dest)
-		:"memory");
-
-		return ret;
-
-}
-EXPORT_SYMBOL(memmove);
-- 
cgit v0.10.2


From b3d7336db553d318e7ec042eb50a70d307013339 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 21 Jan 2011 15:29:44 -0800
Subject: x86: Move llc_shared_map out of cpu_info

cpu_info is already with per_cpu, We can take llc_shared_map out
of cpu_info, and declare it as per_cpu variable directly.

So later referencing could be simple and directly instead of
diving to find cpu_info at first.

Also could make smp_store_cpu_info() much simple to avoid to do
save and restore trick.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Hans Rosenfeld <hans.rosenfeld@amd.com>
Cc: Alok N Kataria <akataria@vmware.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Hans J. Koch <hjk@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <4D3A16E8.5020608@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 45636ce..4c25ab4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -94,10 +94,6 @@ struct cpuinfo_x86 {
 	int			x86_cache_alignment;	/* In bytes */
 	int			x86_power;
 	unsigned long		loops_per_jiffy;
-#ifdef CONFIG_SMP
-	/* cpus sharing the last level cache: */
-	cpumask_var_t		llc_shared_map;
-#endif
 	/* cpuid returned max cores value: */
 	u16			 x86_max_cores;
 	u16			apicid;
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4c2f63c..3597825 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -23,6 +23,8 @@ extern unsigned int num_processors;
 
 DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
+/* cpus sharing the last level cache: */
+DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
 DECLARE_PER_CPU(u16, cpu_llc_id);
 DECLARE_PER_CPU(int, cpu_number);
 
@@ -36,6 +38,11 @@ static inline struct cpumask *cpu_core_mask(int cpu)
 	return per_cpu(cpu_core_map, cpu);
 }
 
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+	return per_cpu(cpu_llc_shared_map, cpu);
+}
+
 DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index ec2c19a..5419a26 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -732,11 +732,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
 	if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
-		for_each_cpu(i, c->llc_shared_map) {
+		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
 			if (!per_cpu(ici_cpuid4_info, i))
 				continue;
 			this_leaf = CPUID4_INFO_IDX(i, index);
-			for_each_cpu(sibling, c->llc_shared_map) {
+			for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
 				if (!cpu_online(sibling))
 					continue;
 				set_bit(sibling, this_leaf->shared_cpu_map);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 5bf2fac..167f97b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -527,15 +527,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	int i, err = 0;
 	struct threshold_bank *b = NULL;
 	char name[32];
-#ifdef CONFIG_SMP
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-#endif
 
 	sprintf(name, "threshold_bank%i", bank);
 
 #ifdef CONFIG_SMP
 	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */
-		i = cpumask_first(c->llc_shared_map);
+		i = cpumask_first(cpu_llc_shared_mask(cpu));
 
 		/* first core not up yet */
 		if (cpu_data(i).cpu_core_id)
@@ -555,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (err)
 			goto out;
 
-		cpumask_copy(b->cpus, c->llc_shared_map);
+		cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
 		per_cpu(threshold_banks, cpu)[bank] = b;
 
 		goto out;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0cbe8c0..d396155 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -130,6 +130,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
+DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
+
 /* Per CPU bogomips and other parameters */
 DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
@@ -355,23 +357,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	cpu_idle();
 }
 
-#ifdef CONFIG_CPUMASK_OFFSTACK
-/* In this case, llc_shared_map is a pointer to a cpumask. */
-static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
-				    const struct cpuinfo_x86 *src)
-{
-	struct cpumask *llc = dst->llc_shared_map;
-	*dst = *src;
-	dst->llc_shared_map = llc;
-}
-#else
-static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
-				    const struct cpuinfo_x86 *src)
-{
-	*dst = *src;
-}
-#endif /* CONFIG_CPUMASK_OFFSTACK */
-
 /*
  * The bootstrap kernel entry code has set these up. Save them for
  * a given CPU
@@ -381,7 +366,7 @@ void __cpuinit smp_store_cpu_info(int id)
 {
 	struct cpuinfo_x86 *c = &cpu_data(id);
 
-	copy_cpuinfo_x86(c, &boot_cpu_data);
+	*c = boot_cpu_data;
 	c->cpu_index = id;
 	if (id != 0)
 		identify_secondary_cpu(c);
@@ -389,15 +374,12 @@ void __cpuinit smp_store_cpu_info(int id)
 
 static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
 {
-	struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
-	struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
-
 	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
 	cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
 	cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
 	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
-	cpumask_set_cpu(cpu1, c2->llc_shared_map);
-	cpumask_set_cpu(cpu2, c1->llc_shared_map);
+	cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
+	cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
 }
 
 
@@ -425,7 +407,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
 	}
 
-	cpumask_set_cpu(cpu, c->llc_shared_map);
+	cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
 
 	if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
 		cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
@@ -436,8 +418,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
 		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-			cpumask_set_cpu(i, c->llc_shared_map);
-			cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map);
+			cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
+			cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
 		}
 		if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
 			cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -476,7 +458,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	    !(cpu_has(c, X86_FEATURE_AMD_DCM)))
 		return cpu_core_mask(cpu);
 	else
-		return c->llc_shared_map;
+		return cpu_llc_shared_mask(cpu);
 }
 
 static void impress_friends(void)
@@ -1103,7 +1085,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
-		zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
 	}
 	set_cpu_sibling_map(0);
 
-- 
cgit v0.10.2


From 792363d2beceb1c7d865e517fa9939c8b8c1442a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 21 Jan 2011 15:29:54 -0800
Subject: x86: Don't copy per_cpu cpuinfo for BSP two times

smp_store_cpu_info(0) will do that.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
LKML-Reference: <4D3A16F2.5090902@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d396155..a7a3d1a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1071,13 +1071,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
 	preempt_disable();
 	smp_cpu_index_default();
-	memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info));
-	cpumask_copy(cpu_callin_mask, cpumask_of(0));
-	mb();
+
 	/*
 	 * Setup boot CPU information
 	 */
 	smp_store_cpu_info(0); /* Final full version of the data */
+	cpumask_copy(cpu_callin_mask, cpumask_of(0));
+	mb();
 #ifdef CONFIG_X86_32
 	boot_cpu_logical_apicid = logical_smp_processor_id();
 #endif
-- 
cgit v0.10.2


From 6d5ab2932a21ea54406ab95c43ecff90a3eddfda Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:01 -0800
Subject: sched: Simplify update_cfs_shares parameters

Re-visiting this: Since update_cfs_shares will now only ever re-weight an
entity that is a relative parent of the current entity in enqueue_entity; we
can safely issue the account_entity_enqueue relative to that cfs_rq and avoid
the requirement for special handling of the enqueue case in update_cfs_shares.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044851.915214637@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4..e0fa3ff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8510,7 +8510,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/* Propagate contribution to hierarchy */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se), 0);
+			update_cfs_shares(group_cfs_rq(se));
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2d..0c550c8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -540,7 +540,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
  * Update the current task's runtime statistics. Skip current tasks that
@@ -763,16 +763,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	long load_weight, load, shares;
 
-	load = cfs_rq->load.weight + weight_delta;
+	load = cfs_rq->load.weight;
 
 	load_weight = atomic_read(&tg->load_weight);
-	load_weight -= cfs_rq->load_contribution;
 	load_weight += load;
+	load_weight -= cfs_rq->load_contribution;
 
 	shares = (tg->shares * load);
 	if (load_weight)
@@ -790,7 +789,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 }
 # else /* CONFIG_SMP */
@@ -798,8 +797,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	return tg->shares;
 }
@@ -824,7 +822,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		account_entity_enqueue(cfs_rq, se);
 }
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg;
 	struct sched_entity *se;
@@ -838,7 +836,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	if (likely(se->load.weight == tg->shares))
 		return;
 #endif
-	shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
+	shares = calc_cfs_shares(cfs_rq, tg);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
@@ -847,7 +845,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
 
@@ -978,8 +976,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
+	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -1041,7 +1039,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1282,7 +1280,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
@@ -1312,7 +1310,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
@@ -2123,7 +2121,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 	 * We need to update shares after updating tg->load_weight in
 	 * order to adjust the weight of groups with long running tasks.
 	 */
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
-- 
cgit v0.10.2


From f07333bf6ee66d9b49286cec4371cf375e745b7a Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:03 -0800
Subject: sched: Avoid expensive initial update_cfs_load()

Since cfs->{load_stamp,load_last} are zero-initalized the initial load update
will consider the delta to be 'since the beginning of time'.

This results in a lot of pointless divisions to bring this large period to be
within the sysctl_sched_shares_window.

Fix this by initializing load_stamp to be 1 at cfs_rq initialization, this
allows for an initial load_stamp > load_last which then lets standard idle
truncation proceed.

We avoid spinning (and slightly improve consistency) by fixing delta to be
[period - 1] in this path resulting in a slightly more predictable shares ramp.
(Previously the amount of idle time preserved by the overflow would range between
[period/2,period-1].)

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index e0fa3ff..6820b5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7796,6 +7796,8 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c550c8..4cbc912 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -733,6 +733,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	    now - cfs_rq->load_last > 4 * period) {
 		cfs_rq->load_period = 0;
 		cfs_rq->load_avg = 0;
+		delta = period - 1;
 	}
 
 	cfs_rq->load_stamp = now;
-- 
cgit v0.10.2


From 4dd53d891ca46dcc1fde0376a33540d3fd83cb9a Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:00 -0800
Subject: softirqs: Free up pf flag PF_KSOFTIRQD

Cleanup patch, freeing up PF_KSOFTIRQD and use per_cpu ksoftirqd pointer
instead, as suggested by Eric Dumazet.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-2-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d42..a1382b9 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -426,6 +426,13 @@ extern void raise_softirq(unsigned int nr);
  */
 DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
 
+DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
+
+static inline struct task_struct *this_cpu_ksoftirqd(void)
+{
+	return this_cpu_read(ksoftirqd);
+}
+
 /* Try to send a softirq to a remote cpu.  If this cannot be done, the
  * work will be queued to the local cpu.
  */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d747f94..af6e15f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1715,7 +1715,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 /*
  * Per process flags
  */
-#define PF_KSOFTIRQD	0x00000001	/* I am ksoftirqd */
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
diff --git a/kernel/sched.c b/kernel/sched.c
index 6820b5b..8b89b3b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1880,7 +1880,7 @@ void account_system_vtime(struct task_struct *curr)
 	 */
 	if (hardirq_count())
 		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		__this_cpu_add(cpu_softirq_time, delta);
 
 	irq_time_write_end();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5ef..0cee504 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
 
 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
 
-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 char *softirq_to_name[NR_SOFTIRQS] = {
 	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -721,7 +721,6 @@ static int run_ksoftirqd(void * __bind_cpu)
 {
 	set_current_state(TASK_INTERRUPTIBLE);
 
-	current->flags |= PF_KSOFTIRQD;
 	while (!kthread_should_stop()) {
 		preempt_disable();
 		if (!local_softirq_pending()) {
-- 
cgit v0.10.2


From a1dabb6bfffccb897eff3e1d102dacf2a4bedf3b Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:01 -0800
Subject: time: Add nsecs_to_cputime64 interface for asm-generic

Add nsecs_to_cputime64 interface. This is used in following patches that
updates cpu irq stat based on ns granularity info in IRQ_TIME_ACCOUNTING.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-3-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 2bcc5c7..61e03dd 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -30,6 +30,9 @@ typedef u64 cputime64_t;
 #define cputime64_to_jiffies64(__ct)	(__ct)
 #define jiffies64_to_cputime64(__jif)	(__jif)
 #define cputime_to_cputime64(__ct)	((u64) __ct)
+#define cputime64_gt(__a, __b)		((__a) >  (__b))
+
+#define nsecs_to_cputime64(__ct)	nsecs_to_jiffies64(__ct)
 
 
 /*
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 6811f4b..922aa31 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x);
 extern unsigned long clock_t_to_jiffies(unsigned long x);
 extern u64 jiffies_64_to_clock_t(u64 x);
 extern u64 nsec_to_clock_t(u64 x);
+extern u64 nsecs_to_jiffies64(u64 n);
 extern unsigned long nsecs_to_jiffies(u64 n);
 
 #define TIMESTAMP_SIZE	30
diff --git a/kernel/time.c b/kernel/time.c
index 3217435..55337a8 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
 }
 
 /**
- * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
  *
  * @n:	nsecs in u64
  *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
  *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
  *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
  */
-unsigned long nsecs_to_jiffies(u64 n)
+u64 nsecs_to_jiffies64(u64 n)
 {
 #if (NSEC_PER_SEC % HZ) == 0
 	/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,6 +674,25 @@ unsigned long nsecs_to_jiffies(u64 n)
 #endif
 }
 
+
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n:	nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+	return (unsigned long)nsecs_to_jiffies64(n);
+}
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
-- 
cgit v0.10.2


From 70a89a6620f658d47a1488515bada4b8ee6291d8 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:02 -0800
Subject: sched: Refactor account_system_time separating id-update

Refactor account_system_time, to separate out the logic of
identifying the update needed and code that does actual update.

This is used by following patch for IRQ_TIME_ACCOUNTING,
which has different identification logic and same update logic.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-4-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 8b89b3b..e3fa921 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3568,6 +3568,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 }
 
 /*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	cputime64_t tmp = cputime_to_cputime64(cputime);
+
+	/* Add system time to process. */
+	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 = cputime64_add(*target_cputime64, tmp);
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,31 +3604,21 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp;
+	cputime64_t *target_cputime64;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}
 
-	/* Add system time to process. */
-	p->stime = cputime_add(p->stime, cputime);
-	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+		target_cputime64 = &cpustat->irq;
 	else if (in_serving_softirq())
-		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+		target_cputime64 = &cpustat->softirq;
 	else
-		cpustat->system = cputime64_add(cpustat->system, tmp);
+		target_cputime64 = &cpustat->system;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-	/* Account for system time used */
-	acct_update_integrals(p);
+	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
 /*
-- 
cgit v0.10.2


From abb74cefa9c682fb38ba86c17ca3c86fed6cc464 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:03 -0800
Subject: sched: Export ns irqtimes through /proc/stat

CONFIG_IRQ_TIME_ACCOUNTING adds ns granularity irq time on each CPU.
This info is already used in scheduler to do proper task chargeback
(earlier patches). This patch retro-fits this ns granularity
hardirq and softirq information to /proc/stat irq and softirq fields.

The update is still done on timer tick, where we look at accumulated
ns hardirq/softirq time and account the tick to user/system/irq/hardirq/guest
accordingly.

No new interface added.

Earlier versions looked at adding this as new fields in some /proc
files. This one seems to be the best in terms of impact to existing
apps, even though it has somewhat more kernel code than earlier versions.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-5-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index e3fa921..2a3c979 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1920,8 +1920,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 		sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_hardirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_softirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime	(0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	rq->clock_task += delta;
@@ -3621,6 +3653,65 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	if (irqtime_account_hi_update()) {
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	} else if (irqtime_account_si_update()) {
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (user_tick) {
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else if (p == rq->idle) {
+		account_idle_time(cputime_one_jiffy);
+	} else if (p->flags & PF_VCPU) { /* System time or guest time */
+		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else {
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->system);
+	}
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+	int i;
+	struct rq *rq = this_rq();
+
+	for (i = 0; i < ticks; i++)
+		irqtime_account_process_tick(current, 0, rq);
+}
+#else
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq) {}
+#endif
+
 /*
  * Account for involuntary wait time.
  * @steal: the cpu time spent in involuntary wait
@@ -3661,6 +3752,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
+	if (sched_clock_irqtime) {
+		irqtime_account_process_tick(p, user_tick, rq);
+		return;
+	}
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3686,6 +3782,12 @@ void account_steal_ticks(unsigned long ticks)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+
+	if (sched_clock_irqtime) {
+		irqtime_account_idle_ticks(ticks);
+		return;
+	}
+
 	account_idle_time(jiffies_to_cputime(ticks));
 }
 
-- 
cgit v0.10.2


From 414bee9ba613adb3804965e2d84db32d0599f9c6 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:04 -0800
Subject: softirqs: Account ksoftirqd time as cpustat softirq

softirq time in ksoftirqd context is not accounted in ns granularity
per cpu softirq stats, as we want that to be a part of ksoftirqd
exec_runtime.

Accounting them as softirq on /proc/stat separately.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-6-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 2a3c979..8b718b5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3686,6 +3686,14 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	} else if (irqtime_account_si_update()) {
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 * Also, p->stime needs to be updated for ksoftirqd.
+		 */
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->softirq);
 	} else if (user_tick) {
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	} else if (p == rq->idle) {
-- 
cgit v0.10.2


From a8941d7ec81678fb69aea7183338175f112f3e0d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 25 Jan 2011 16:30:03 +0100
Subject: sched: Simplify the idle scheduling class

Since commit 48c5ccae88dcd (sched: Simplify cpu-hot-unplug task
migration) this should no longer happen, so remove the code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f40..41eb62a 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,16 @@ static void set_curr_task_idle(struct rq *rq)
 {
 }
 
-static void switched_to_idle(struct rq *rq, struct task_struct *p,
-			     int running)
+static void
+switched_to_idle(struct rq *rq, struct task_struct *p, int running)
 {
-	/* Can this actually happen?? */
-	if (running)
-		resched_task(rq->curr);
-	else
-		check_preempt_curr(rq, p, 0);
+	BUG();
 }
 
 static void prio_changed_idle(struct rq *rq, struct task_struct *p,
 			      int oldprio, int running)
 {
-	/* This can happen for hot plug CPUS */
-
-	/*
-	 * Reschedule if we are currently running on this runqueue and
-	 * our priority decreased, or if we are not currently running on
-	 * this runqueue and our priority is higher than the current's
-	 */
-	if (running) {
-		if (p->prio > oldprio)
-			resched_task(rq->curr);
-	} else
-		check_preempt_curr(rq, p, 0);
+	BUG();
 }
 
 static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-- 
cgit v0.10.2


From da7a735e51f9622eb3e1672594d4a41da01d7e4f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 17 Jan 2011 17:03:27 +0100
Subject: sched: Fix switch_from_fair()

When a task is taken out of the fair class we must ensure the vruntime
is properly normalized because when we put it back in it will assume
to be normalized.

The case that goes wrong is when changing away from the fair class
while sleeping. Sleeping tasks have non-normalized vruntime in order
to make sleeper-fairness work. So treat the switch away from fair as a
wakeup and preserve the relative vruntime.

Also update sysrq-n to call the ->switch_{to,from} methods.

Reported-by: Onkalo Samu <samu.p.onkalo@nokia.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index af6e15f..0542774 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1084,12 +1084,10 @@ struct sched_class {
 	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
 	void (*task_fork) (struct task_struct *p);
 
-	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
-			       int running);
-	void (*switched_to) (struct rq *this_rq, struct task_struct *task,
-			     int running);
+	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
 	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
-			     int oldprio, int running);
+			     int oldprio);
 
 	unsigned int (*get_rr_interval) (struct rq *rq,
 					 struct task_struct *task);
diff --git a/kernel/sched.c b/kernel/sched.c
index 8b718b5..78fa753 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2057,14 +2057,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
-				       int oldprio, int running)
+				       int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p, running);
-		p->sched_class->switched_to(rq, p, running);
-	} else
-		p->sched_class->prio_changed(rq, p, oldprio, running);
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2598,6 +2598,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
+	p->se.vruntime			= 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -4696,11 +4697,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, &flags);
 }
 
@@ -5028,11 +5028,10 @@ recheck:
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		activate_task(rq, p, 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	__task_rq_unlock(rq);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -8237,6 +8236,8 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+	const struct sched_class *prev_class = p->sched_class;
+	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->se.on_rq;
@@ -8247,6 +8248,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 		activate_task(rq, p, 0);
 		resched_task(rq->curr);
 	}
+
+	check_class_changed(rq, p, prev_class, old_prio);
 }
 
 void normalize_rt_tasks(void)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4cbc912..55040f3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -4078,33 +4078,62 @@ static void task_fork_fair(struct task_struct *p)
  * Priority of the task has changed. Check to see if we preempt
  * the current task.
  */
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * Reschedule if we are currently running on this runqueue and
 	 * our priority decreased, or if we are not currently running on
 	 * this runqueue and our priority is higher than the current's
 	 */
-	if (running) {
+	if (rq->curr == p) {
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
 
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/*
+	 * Ensure the task's vruntime is normalized, so that when its
+	 * switched back to the fair class the enqueue_entity(.flags=0) will
+	 * do the right thing.
+	 *
+	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it was !on_rq, then only when
+	 * the task is sleeping will it still have non-normalized vruntime.
+	 */
+	if (!se->on_rq && p->state != TASK_RUNNING) {
+		/*
+		 * Fix up our vruntime so that the current sleep doesn't
+		 * cause 'unlimited' sleep bonus.
+		 */
+		place_entity(cfs_rq, se, 0);
+		se->vruntime -= cfs_rq->min_vruntime;
+	}
+}
+
 /*
  * We switched to the sched_fair class.
  */
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
-			     int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * We were most likely switched from sched_rt, so
 	 * kick off the schedule if running, otherwise just see
 	 * if we can still preempt the current task.
 	 */
-	if (running)
+	if (rq->curr == p)
 		resched_task(rq->curr);
 	else
 		check_preempt_curr(rq, p, 0);
@@ -4190,6 +4219,7 @@ static const struct sched_class fair_sched_class = {
 	.task_fork		= task_fork_fair,
 
 	.prio_changed		= prio_changed_fair,
+	.switched_from		= switched_from_fair,
 	.switched_to		= switched_to_fair,
 
 	.get_rr_interval	= get_rr_interval_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 41eb62a..c82f26c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,14 +52,13 @@ static void set_curr_task_idle(struct rq *rq)
 {
 }
 
-static void
-switched_to_idle(struct rq *rq, struct task_struct *p, int running)
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
 {
 	BUG();
 }
 
-static void prio_changed_idle(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
 {
 	BUG();
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c914ec7..c381fdc 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1595,8 +1595,7 @@ static void rq_offline_rt(struct rq *rq)
  * When switch from the rt queue, we bring ourselves to a position
  * that we might want to pull RT tasks from other runqueues.
  */
-static void switched_from_rt(struct rq *rq, struct task_struct *p,
-			   int running)
+static void switched_from_rt(struct rq *rq, struct task_struct *p)
 {
 	/*
 	 * If there are other RT tasks then we will reschedule
@@ -1605,7 +1604,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!rq->rt.rt_nr_running)
+	if (p->se.on_rq && !rq->rt.rt_nr_running)
 		pull_rt_task(rq);
 }
 
@@ -1624,8 +1623,7 @@ static inline void init_sched_rt_class(void)
  * with RT tasks. In this case we try to push them off to
  * other runqueues.
  */
-static void switched_to_rt(struct rq *rq, struct task_struct *p,
-			   int running)
+static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
 	int check_resched = 1;
 
@@ -1636,7 +1634,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
 	 * If that current running task is also an RT task
 	 * then see if we can move to another run queue.
 	 */
-	if (!running) {
+	if (p->se.on_rq && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (rq->rt.overloaded && push_rt_task(rq) &&
 		    /* Don't resched if we changed runqueues */
@@ -1652,10 +1650,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
  * Priority of the task has changed. This may cause
  * us to initiate a push or pull.
  */
-static void prio_changed_rt(struct rq *rq, struct task_struct *p,
-			    int oldprio, int running)
+static void
+prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (running) {
+	if (!p->se.on_rq)
+		return;
+
+	if (rq->curr == p) {
 #ifdef CONFIG_SMP
 		/*
 		 * If our priority decreases while running, we
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47..84ec9bc 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
 {
 }
 
-static void switched_to_stop(struct rq *rq, struct task_struct *p,
-			     int running)
+static void switched_to_stop(struct rq *rq, struct task_struct *p)
 {
 	BUG(); /* its impossible to change to this class */
 }
 
-static void prio_changed_stop(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
 {
 	BUG(); /* how!?, what priority? */
 }
-- 
cgit v0.10.2


From ccaa8d657117bb1876d471bd91579d774106778d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 25 Jan 2011 23:17:32 +0100
Subject: rtmutex-tester: Remove BKL tests

The BKL is going away, no need to test it any more.
I left the definitions of the test case numbers
in, so that the other tests do not get renumbered.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1295993854-4971-19-git-send-email-arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89b..d5b54350 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/sysdev.h>
 #include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
 	int			opcode;
 	int			opdata;
 	int			mutexes[MAX_RT_TEST_MUTEXES];
-	int			bkl;
 	int			event;
 	struct sys_device	sysdev;
 };
@@ -46,8 +44,8 @@ enum test_opcodes {
 	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
 	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
 	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
-	RTTEST_LOCKBKL,		/* 9 Lock BKL */
-	RTTEST_UNLOCKBKL,	/* 10 Unlock BKL */
+	RTTEST_LOCKBKL,		/* 9 Was: Lock BKL */
+	RTTEST_UNLOCKBKL,	/* 10 Was: Unlock BKL */
 	RTTEST_SIGNAL,		/* 11 Signal other test thread, data = thread id */
 	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
 	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
@@ -74,13 +72,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
 				td->mutexes[i] = 0;
 			}
 		}
-
-		if (!lockwakeup && td->bkl == 4) {
-#ifdef CONFIG_LOCK_KERNEL
-			unlock_kernel();
-#endif
-			td->bkl = 0;
-		}
 		return 0;
 
 	case RTTEST_RESETEVENT:
@@ -131,25 +122,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
 		td->mutexes[id] = 0;
 		return 0;
 
-	case RTTEST_LOCKBKL:
-		if (td->bkl)
-			return 0;
-		td->bkl = 1;
-#ifdef CONFIG_LOCK_KERNEL
-		lock_kernel();
-#endif
-		td->bkl = 4;
-		return 0;
-
-	case RTTEST_UNLOCKBKL:
-		if (td->bkl != 4)
-			break;
-#ifdef CONFIG_LOCK_KERNEL
-		unlock_kernel();
-#endif
-		td->bkl = 0;
-		return 0;
-
 	default:
 		break;
 	}
@@ -196,7 +168,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
 		td->event = atomic_add_return(1, &rttest_event);
 		break;
 
-	case RTTEST_LOCKBKL:
 	default:
 		break;
 	}
@@ -229,8 +200,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
 		td->event = atomic_add_return(1, &rttest_event);
 		return;
 
-	case RTTEST_LOCKBKL:
-		return;
 	default:
 		return;
 	}
@@ -380,11 +349,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
 	spin_lock(&rttest_lock);
 
 	curr += sprintf(curr,
-		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
 		td->opcode, td->event, tsk->state,
 			(MAX_RT_PRIO - 1) - tsk->prio,
 			(MAX_RT_PRIO - 1) - tsk->normal_prio,
-		tsk->pi_blocked_on, td->bkl);
+		tsk->pi_blocked_on);
 
 	for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
 		curr += sprintf(curr, "%d", td->mutexes[i]);
-- 
cgit v0.10.2


From 61bb46082775acd18c712607615a8b7dbeff7873 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:10:51 +0100
Subject: alpha: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Matt Turner <mattst88@gmail.com>

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 1570c0b..19c9cc7 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -39,7 +39,7 @@ struct rw_semaphore {
 };
 
 #define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
+	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
 	LIST_HEAD_INIT((name).wait_list) }
 
 #define DECLARE_RWSEM(name) \
-- 
cgit v0.10.2


From e41c8ab174f47ed5ed10a365482d0d7b0e352beb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:14:15 +0100
Subject: cris: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>

diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 84fed3b..4c9e3e1 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -26,7 +26,9 @@
 #define FLUSH_ALL (void*)0xffffffff
 
 /* Vector of locks used for various atomic operations */
-spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED};
+spinlock_t cris_atomic_locks[] = {
+	[0 ... LOCK_COUNT - 1] = __SPIN_LOCK_UNLOCKED(cris_atomic_locks)
+};
 
 /* CPU masks */
 cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
-- 
cgit v0.10.2


From 7424cdf77b1b2975d82619084f20f0055f715166 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:16:35 +0100
Subject: mips: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>

diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c
index 6a1fdfe..ab52b7c 100644
--- a/arch/mips/kernel/vpe.c
+++ b/arch/mips/kernel/vpe.c
@@ -148,9 +148,9 @@ struct {
 	spinlock_t tc_list_lock;
 	struct list_head tc_list;	/* Thread contexts */
 } vpecontrol = {
-	.vpe_list_lock	= SPIN_LOCK_UNLOCKED,
+	.vpe_list_lock	= __SPIN_LOCK_UNLOCKED(vpe_list_lock),
 	.vpe_list	= LIST_HEAD_INIT(vpecontrol.vpe_list),
-	.tc_list_lock	= SPIN_LOCK_UNLOCKED,
+	.tc_list_lock	= __SPIN_LOCK_UNLOCKED(tc_list_lock),
 	.tc_list	= LIST_HEAD_INIT(vpecontrol.tc_list)
 };
 
-- 
cgit v0.10.2


From 24774fbdeab8f6ac05a19e81bd645b0f7e5d2bb7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:19:12 +0100
Subject: sparc: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David S. Miller <davem@davemloft.net>

diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index cbddeb3..d3c7a12 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -16,7 +16,7 @@
 #define ATOMIC_HASH(a)	(&__atomic_hash[(((unsigned long)a)>>8) & (ATOMIC_HASH_SIZE-1)])
 
 spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] = {
-	[0 ... (ATOMIC_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
+	[0 ... (ATOMIC_HASH_SIZE-1)] = __SPIN_LOCK_UNLOCKED(__atomic_hash)
 };
 
 #else /* SMP */
-- 
cgit v0.10.2


From 22e650045899011b028e40625bc73df9f3260bac Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:21:25 +0100
Subject: um: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jeff Dike <jdike@addtoit.com>

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index ba4a98b..620f5b7 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -185,7 +185,7 @@ struct ubd {
 	.no_cow =               0, \
 	.shared =		0, \
 	.cow =			DEFAULT_COW, \
-	.lock =			SPIN_LOCK_UNLOCKED,	\
+	.lock =			__SPIN_LOCK_UNLOCKED(ubd_devs.lock), \
 	.request =		NULL, \
 	.start_sg =		0, \
 	.end_sg =		0, \
-- 
cgit v0.10.2


From 235454c99851ba21038061b9acf38d1a636068c5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:23:14 +0100
Subject: xtensa: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Zankel <chris@zankel.net>

diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index e39edf5..9d32f68 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -38,7 +38,7 @@ struct rw_semaphore {
 };
 
 #define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
+	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
 	  LIST_HEAD_INIT((name).wait_list) }
 
 #define DECLARE_RWSEM(name)		\
-- 
cgit v0.10.2


From 92578c0b8078f6919f9b47e7e16a1cf770bd127b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:24:55 +0100
Subject: kthread: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index ce0775a..7ff16f7 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -64,7 +64,7 @@ struct kthread_work {
 };
 
 #define KTHREAD_WORKER_INIT(worker)	{				\
-	.lock = SPIN_LOCK_UNLOCKED,					\
+	.lock = __SPIN_LOCK_UNLOCKED((worker).lock),			\
 	.work_list = LIST_HEAD_INIT((worker).work_list),		\
 	}
 
-- 
cgit v0.10.2


From 10389a15e25fd4784d42de7e0e3fc8c242f2011d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:25:56 +0100
Subject: cred: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa00..b5496e8 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
 static struct thread_group_cred init_tgcred = {
 	.usage	= ATOMIC_INIT(2),
 	.tgid	= 0,
-	.lock	= SPIN_LOCK_UNLOCKED,
+	.lock	= __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
 };
 #endif
 
-- 
cgit v0.10.2


From d04fa5a3ba06c3b7a1c4a6860d0fa4825507a755 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:30:09 +0100
Subject: locking: Remove deprecated lock initializers

Last users are gone. Remove the left overs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt
index 178c831..2e3c64b 100644
--- a/Documentation/spinlocks.txt
+++ b/Documentation/spinlocks.txt
@@ -86,7 +86,7 @@ to change the variables it has to get an exclusive write lock.
 
 The routines look the same as above:
 
-   rwlock_t xxx_lock = RW_LOCK_UNLOCKED;
+   rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
 
 	unsigned long flags;
 
@@ -196,25 +196,3 @@ appropriate:
 
 For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
 __SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
-
-SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED are deprecated.  These interfere
-with lockdep state tracking.
-
-Most of the time, you can simply turn:
-	static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
-into:
-	static DEFINE_SPINLOCK(xxx_lock);
-
-Static structure member variables go from:
-
-	struct foo bar {
-		.lock	=	SPIN_LOCK_UNLOCKED;
-	};
-
-to:
-
-	struct foo bar {
-		.lock	=	__SPIN_LOCK_UNLOCKED(bar.lock);
-	};
-
-Declaration of static rw_locks undergo a similar transformation.
diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
index bd31808..cc0072e 100644
--- a/include/linux/rwlock_types.h
+++ b/include/linux/rwlock_types.h
@@ -43,14 +43,6 @@ typedef struct {
 				RW_DEP_MAP_INIT(lockname) }
 #endif
 
-/*
- * RW_LOCK_UNLOCKED defeat lockdep state tracking and is hence
- * deprecated.
- *
- * Please use DEFINE_RWLOCK() or __RW_LOCK_UNLOCKED() as appropriate.
- */
-#define RW_LOCK_UNLOCKED	__RW_LOCK_UNLOCKED(old_style_rw_init)
-
 #define DEFINE_RWLOCK(x)	rwlock_t x = __RW_LOCK_UNLOCKED(x)
 
 #endif /* __LINUX_RWLOCK_TYPES_H */
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index 851b778..73548eb 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -81,14 +81,6 @@ typedef struct spinlock {
 #define __SPIN_LOCK_UNLOCKED(lockname) \
 	(spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
 
-/*
- * SPIN_LOCK_UNLOCKED defeats lockdep state tracking and is hence
- * deprecated.
- * Please use DEFINE_SPINLOCK() or __SPIN_LOCK_UNLOCKED() as
- * appropriate.
- */
-#define SPIN_LOCK_UNLOCKED	__SPIN_LOCK_UNLOCKED(old_style_spin_init)
-
 #define DEFINE_SPINLOCK(x)	spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
 
 #include <linux/rwlock_types.h>
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4c0383d..58848e3 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2654,11 +2654,6 @@ sub process {
 			WARN("Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr);
 		}
 
-# SPIN_LOCK_UNLOCKED & RW_LOCK_UNLOCKED are deprecated
-		if ($line =~ /\b(SPIN_LOCK_UNLOCKED|RW_LOCK_UNLOCKED)/) {
-			ERROR("Use of $1 is deprecated: see Documentation/spinlocks.txt\n" . $herecurr);
-		}
-
 # warn about #if 0
 		if ($line =~ /^.\s*\#\s*if\s+0\b/) {
 			CHK("if this code is redundant consider removing it\n" .
-- 
cgit v0.10.2


From c16a87ce063f79e0ec7d25ce2950e1bc6db03c72 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:05:50 +0000
Subject: rwsem: Cleanup includes

All rwsem implementations include the same headers. Include them from
include/linux/rwsem.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.483520950@linutronix.de>

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 19c9cc7..839a3fa 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -13,10 +13,6 @@
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-struct rwsem_waiter;
 
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 215d545..9bcf079 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -25,9 +25,6 @@
 #error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
 #endif
 
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
 #include <asm/intrinsics.h>
 
 /*
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index 8447d89..c12abbe 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -13,11 +13,6 @@
  * by Paul Mackerras <paulus@samba.org>.
  */
 
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-
 /*
  * the semaphore definition
  */
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 423fdda..9cc8592 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -43,11 +43,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-struct rwsem_waiter;
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index 06e2251..df6f346 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -11,11 +11,6 @@
 #endif
 
 #ifdef __KERNEL__
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-
 /*
  * the semaphore definition
  */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index a2b4302..902d36b 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -12,12 +12,6 @@
 #endif
 
 #ifdef __KERNEL__
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-struct rwsem_waiter;
-
 struct rw_semaphore {
 	signed long			count;
 #define RWSEM_UNLOCKED_VALUE		0x00000000L
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index d1e41b0..a626cff 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -37,14 +37,8 @@
 #endif
 
 #ifdef __KERNEL__
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/lockdep.h>
 #include <asm/asm.h>
 
-struct rwsem_waiter;
-
 extern asmregparm struct rw_semaphore *
  rwsem_down_read_failed(struct rw_semaphore *sem);
 extern asmregparm struct rw_semaphore *
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index 9d32f68..1be2102 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -16,12 +16,6 @@
 #ifndef _LINUX_RWSEM_H
 #error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
 #endif
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-
 /*
  * the semaphore definition
  */
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index bdfcc25..8c0dc7f 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -12,15 +12,7 @@
 #error "please don't include linux/rwsem-spinlock.h directly, use linux/rwsem.h instead"
 #endif
 
-#include <linux/spinlock.h>
-#include <linux/list.h>
-
 #ifdef __KERNEL__
-
-#include <linux/types.h>
-
-struct rwsem_waiter;
-
 /*
  * the rw-semaphore definition
  * - if activity is 0 then there are no active readers or writers
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index efd348f..496296d 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -11,6 +11,9 @@
 
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
 #include <asm/system.h>
 #include <asm/atomic.h>
 
-- 
cgit v0.10.2


From bde11efbc21ea84c3351464a422b467eaefabb9a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:05:53 +0000
Subject: x86: Cleanup rwsem_count_t typedef

Remove the typedef which has no real reason to be there.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.580335506@linutronix.de>

diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index a626cff..c30206c 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -68,10 +68,8 @@ extern asmregparm struct rw_semaphore *
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-typedef signed long rwsem_count_t;
-
 struct rw_semaphore {
-	rwsem_count_t		count;
+	long			count;
 	spinlock_t		wait_lock;
 	struct list_head	wait_list;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -127,7 +125,7 @@ static inline void __down_read(struct rw_semaphore *sem)
  */
 static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
-	rwsem_count_t result, tmp;
+	long result, tmp;
 	asm volatile("# beginning __down_read_trylock\n\t"
 		     "  mov          %0,%1\n\t"
 		     "1:\n\t"
@@ -149,7 +147,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
  */
 static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 {
-	rwsem_count_t tmp;
+	long tmp;
 	asm volatile("# beginning down_write\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
 		     /* adds 0xffff0001, returns the old value */
@@ -174,9 +172,8 @@ static inline void __down_write(struct rw_semaphore *sem)
  */
 static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
-	rwsem_count_t ret = cmpxchg(&sem->count,
-				    RWSEM_UNLOCKED_VALUE,
-				    RWSEM_ACTIVE_WRITE_BIAS);
+	long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
+			   RWSEM_ACTIVE_WRITE_BIAS);
 	if (ret == RWSEM_UNLOCKED_VALUE)
 		return 1;
 	return 0;
@@ -187,7 +184,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
  */
 static inline void __up_read(struct rw_semaphore *sem)
 {
-	rwsem_count_t tmp;
+	long tmp;
 	asm volatile("# beginning __up_read\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
 		     /* subtracts 1, returns the old value */
@@ -205,7 +202,7 @@ static inline void __up_read(struct rw_semaphore *sem)
  */
 static inline void __up_write(struct rw_semaphore *sem)
 {
-	rwsem_count_t tmp;
+	long tmp;
 	asm volatile("# beginning __up_write\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
 		     /* subtracts 0xffff0001, returns the old value */
@@ -241,8 +238,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 /*
  * implement atomic add functionality
  */
-static inline void rwsem_atomic_add(rwsem_count_t delta,
-				    struct rw_semaphore *sem)
+static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
 {
 	asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
 		     : "+m" (sem->count)
@@ -252,10 +248,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta,
 /*
  * implement exchange and add functionality
  */
-static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
-						struct rw_semaphore *sem)
+static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 {
-	rwsem_count_t tmp = delta;
+	long tmp = delta;
 
 	asm volatile(LOCK_PREFIX "xadd %0,%1"
 		     : "+r" (tmp), "+m" (sem->count)
-- 
cgit v0.10.2


From 1c8ed640d918290ddc1de5ada02ef6686a733c9f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:05:56 +0000
Subject: rwsem: Move duplicate struct rwsem declaration to linux/rwsem.h

The difference between these declarations is the data type of the
count member and the lack of lockdep in some architectures/

long is equivivalent to signed long and the #ifdef guarded dep_map
member does not hurt anyone.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.679641914@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 839a3fa..3e5619e 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -19,20 +19,12 @@ extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	long			count;
 #define RWSEM_UNLOCKED_VALUE		0x0000000000000000L
 #define RWSEM_ACTIVE_BIAS		0x0000000000000001L
 #define RWSEM_ACTIVE_MASK		0x00000000ffffffffL
 #define RWSEM_WAITING_BIAS		(-0x0000000100000000L)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-};
 
 #define __RWSEM_INITIALIZER(name) \
 	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 9bcf079..1fe4658 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -27,15 +27,6 @@
 
 #include <asm/intrinsics.h>
 
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	signed long		count;
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-};
-
 #define RWSEM_UNLOCKED_VALUE		__IA64_UL_CONST(0x0000000000000000)
 #define RWSEM_ACTIVE_BIAS		(1L)
 #define RWSEM_ACTIVE_MASK		(0xffffffffL)
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index c12abbe..bc1acc2 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -28,15 +28,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-struct rw_semaphore {
-	long			count;
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
 #else
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 9cc8592..6e075f1 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -49,18 +49,6 @@ extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *);
 
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	signed long		count;
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
-
 #ifndef __s390x__
 #define RWSEM_UNLOCKED_VALUE	0x00000000
 #define RWSEM_ACTIVE_BIAS	0x00000001
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index df6f346..dffc625 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -11,23 +11,13 @@
 #endif
 
 #ifdef __KERNEL__
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	long		count;
+
 #define RWSEM_UNLOCKED_VALUE		0x00000000
 #define RWSEM_ACTIVE_BIAS		0x00000001
 #define RWSEM_ACTIVE_MASK		0x0000ffff
 #define RWSEM_WAITING_BIAS		(-0x00010000)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index 902d36b..4c16d1d 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -12,20 +12,13 @@
 #endif
 
 #ifdef __KERNEL__
-struct rw_semaphore {
-	signed long			count;
+
 #define RWSEM_UNLOCKED_VALUE		0x00000000L
 #define RWSEM_ACTIVE_BIAS		0x00000001L
 #define RWSEM_ACTIVE_MASK		0xffffffffL
 #define RWSEM_WAITING_BIAS		(-RWSEM_ACTIVE_MASK-1)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t			wait_lock;
-	struct list_head		wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map		dep_map;
-#endif
-};
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index c30206c..995cfe4 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -49,8 +49,6 @@ extern asmregparm struct rw_semaphore *
  rwsem_downgrade_wake(struct rw_semaphore *sem);
 
 /*
- * the semaphore definition
- *
  * The bias values and the counter type limits the number of
  * potential readers/writers to 32767 for 32 bits and 2147483647
  * for 64 bits.
@@ -68,22 +66,12 @@ extern asmregparm struct rw_semaphore *
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-struct rw_semaphore {
-	long			count;
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
-};
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
 #else
 # define __RWSEM_DEP_MAP_INIT(lockname)
 #endif
 
-
 #define __RWSEM_INITIALIZER(name)				\
 {								\
 	RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index 1be2102..585cab9 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -16,20 +16,13 @@
 #ifndef _LINUX_RWSEM_H
 #error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
 #endif
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	signed long		count;
+
 #define RWSEM_UNLOCKED_VALUE		0x00000000
 #define RWSEM_ACTIVE_BIAS		0x00000001
 #define RWSEM_ACTIVE_MASK		0x0000ffff
 #define RWSEM_WAITING_BIAS		(-0x00010000)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-};
 
 #define __RWSEM_INITIALIZER(name) \
 	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 496296d..e8be18e 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -22,7 +22,18 @@ struct rw_semaphore;
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
 #include <linux/rwsem-spinlock.h> /* use a generic implementation */
 #else
-#include <asm/rwsem.h> /* use an arch-specific implementation */
+/* All arch specific implementations share the same struct */
+struct rw_semaphore {
+	long			count;
+	spinlock_t		wait_lock;
+	struct list_head	wait_list;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+};
+
+/* Include the arch specific part */
+#include <asm/rwsem.h>
 #endif
 
 /*
-- 
cgit v0.10.2


From 12249b34414dba7f386aadcf6be7ca36c6878300 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:06:00 +0000
Subject: rwsem: Move duplicate init macros and functions to linux/rwsem.h

The rwsem initializers and related macros and functions are mostly the
same. Some of them lack the lockdep initializer, but having it in
place does not matter for architectures which do not support lockdep.

powerpc, sparc, x86: No functional change

sh, s390: Removes the duplicate init_rwsem (inline and #define)

alpha, ia64, xtensa: Use the lockdep capable init function in
       	     	     lib/rwsem.c which is just uninlining the init
       	     	     function for the LOCKDEP=n case

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.771812729@linutronix.de>

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 3e5619e..c9f5b90 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -26,20 +26,6 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
-	LIST_HEAD_INIT((name).wait_list) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
 static inline void __down_read(struct rw_semaphore *sem)
 {
 	long oldcount;
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 1fe4658..3798546 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -34,26 +34,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
-	  LIST_HEAD_INIT((name).wait_list) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-static inline void
-init_rwsem (struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
 /*
  * lock for reading
  */
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index bc1acc2..f86fdf7 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -28,38 +28,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name)				\
-{								\
-	RWSEM_UNLOCKED_VALUE,					\
-	__SPIN_LOCK_UNLOCKED((name).wait_lock),			\
-	LIST_HEAD_INIT((name).wait_list)			\
-	__RWSEM_DEP_MAP_INIT(name)				\
-}
-
-#define DECLARE_RWSEM(name)		\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)					\
-	do {						\
-		static struct lock_class_key __key;	\
-							\
-		__init_rwsem((sem), #sem, &__key);	\
-	} while (0)
-
 /*
  * lock for reading
  */
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 6e075f1..f0f5277 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -64,41 +64,6 @@ extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *);
 #define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
 /*
- * initialisation
- */
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name) \
- { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait.lock), \
-   LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)				\
-do {						\
-	static struct lock_class_key __key;	\
-						\
-	__init_rwsem((sem), #sem, &__key);	\
-} while (0)
-
-
-/*
  * lock for reading
  */
 static inline void __down_read(struct rw_semaphore *sem)
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index dffc625..798699d 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -19,42 +19,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
-	  LIST_HEAD_INIT((name).wait_list) \
-	  __RWSEM_DEP_MAP_INIT(name) }
-
-#define DECLARE_RWSEM(name)		\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)				\
-do {						\
-	static struct lock_class_key __key;	\
-						\
-	__init_rwsem((sem), #sem, &__key);	\
-} while (0)
-
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
 /*
  * lock for reading
  */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index 4c16d1d..79f7c1c 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -20,34 +20,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name) \
-{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
-  LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)						\
-do {								\
-	static struct lock_class_key __key;			\
-								\
-	__init_rwsem((sem), #sem, &__key);			\
-} while (0)
-
 /*
  * lock for reading
  */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 995cfe4..c0c91b4 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -66,31 +66,6 @@ extern asmregparm struct rw_semaphore *
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name)				\
-{								\
-	RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
-	LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
-}
-
-#define DECLARE_RWSEM(name)					\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)						\
-do {								\
-	static struct lock_class_key __key;			\
-								\
-	__init_rwsem((sem), #sem, &__key);			\
-} while (0)
-
 /*
  * lock for reading
  */
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index 585cab9..2fa4202 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -24,25 +24,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
-	  LIST_HEAD_INIT((name).wait_list) }
-
-#define DECLARE_RWSEM(name)		\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
 /*
  * lock for reading
  */
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index 8c0dc7f..3470124 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -29,28 +29,7 @@ struct rw_semaphore {
 #endif
 };
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name) \
-{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \
-  __RWSEM_DEP_MAP_INIT(name) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)						\
-do {								\
-	static struct lock_class_key __key;			\
-								\
-	__init_rwsem((sem), #sem, &__key);			\
-} while (0)
+#define RWSEM_UNLOCKED_VALUE		0x00000000
 
 extern void __down_read(struct rw_semaphore *sem);
 extern int __down_read_trylock(struct rw_semaphore *sem);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index e8be18e..8970c3e 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -36,6 +36,31 @@ struct rw_semaphore {
 #include <asm/rwsem.h>
 #endif
 
+/* Common initializer macros and functions */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
+#else
+# define __RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
+#define __RWSEM_INITIALIZER(name) \
+	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED(name.wait_lock),	\
+	  LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
+
+#define DECLARE_RWSEM(name) \
+	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+
+extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
+			 struct lock_class_key *key);
+
+#define init_rwsem(sem)						\
+do {								\
+	static struct lock_class_key __key;			\
+								\
+	__init_rwsem((sem), #sem, &__key);			\
+} while (0)
+
 /*
  * lock for reading
  */
-- 
cgit v0.10.2


From 41e5887fa39ab272d9266a09cbefdef270e28b93 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:06:03 +0000
Subject: rwsem: Unify the duplicate rwsem_is_locked() inlines

Instead of having the same implementation in each architecture, move
it to linux/rwsem.h and remove the duplicates. It's unlikely that an
arch will ever implement something different, but we can deal with
that when it happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.876773757@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index c9f5b90..fc7f543 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -224,10 +224,5 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
 #endif
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _ALPHA_RWSEM_H */
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 3798546..148b61a 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -147,9 +147,4 @@ __downgrade_write (struct rw_semaphore *sem)
 #define rwsem_atomic_add(delta, sem)	atomic64_add(delta, (atomic64_t *)(&(sem)->count))
 #define rwsem_atomic_update(delta, sem)	atomic64_add_return(delta, (atomic64_t *)(&(sem)->count))
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* _ASM_IA64_RWSEM_H */
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index f86fdf7..38e8e5c 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -133,10 +133,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return atomic_long_add_return(delta, (atomic_long_t *)&sem->count);
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return sem->count != 0;
-}
-
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_RWSEM_H */
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index f0f5277..80522dc 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -325,10 +325,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return new;
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _S390_RWSEM_H */
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index 798699d..2f8cf97 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -133,10 +133,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_SH_RWSEM_H */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index 79f7c1c..4f4391f 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -124,11 +124,6 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return atomic64_add_return(delta, (atomic64_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 
 #endif /* _SPARC64_RWSEM_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index c0c91b4..776d76f 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -222,10 +222,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return tmp + delta;
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index 2fa4202..da61633 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -133,9 +133,4 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif	/* _XTENSA_RWSEM_H */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8970c3e..8b9c7e9 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -34,6 +34,13 @@ struct rw_semaphore {
 
 /* Include the arch specific part */
 #include <asm/rwsem.h>
+
+/* In all implementations count != 0 means locked */
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return sem->count != 0;
+}
+
 #endif
 
 /* Common initializer macros and functions */
-- 
cgit v0.10.2


From aac72277fda6ef788bb8d5deaa502ce9b9b6e472 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:06:06 +0000
Subject: rwsem: Move duplicate function prototypes to linux/rwsem.h

All architecture specific rwsem headers carry the same function
prototypes. Just x86 adds asmregparm, which is an empty define on all
other architectures. S390 has a stale rwsem_downgrade_write()
prototype.

Remove the duplicates and add the prototypes to linux/rwsem.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Henderson <rth@twiddle.net>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.970840140@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index fc7f543..a83bbea6 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -14,11 +14,6 @@
 
 #include <linux/compiler.h>
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 #define RWSEM_UNLOCKED_VALUE		0x0000000000000000L
 #define RWSEM_ACTIVE_BIAS		0x0000000000000001L
 #define RWSEM_ACTIVE_MASK		0x00000000ffffffffL
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 148b61a..3027e75 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -34,11 +34,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index 38e8e5c..bb1e2cd 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -28,11 +28,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 80522dc..d0eb465 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -43,12 +43,6 @@
 
 #ifdef __KERNEL__
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *);
-
 #ifndef __s390x__
 #define RWSEM_UNLOCKED_VALUE	0x00000000
 #define RWSEM_ACTIVE_BIAS	0x00000001
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index 2f8cf97..edab572 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -19,11 +19,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index 4f4391f..069bf4d 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -20,11 +20,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 776d76f..df4cd32 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -39,15 +39,6 @@
 #ifdef __KERNEL__
 #include <asm/asm.h>
 
-extern asmregparm struct rw_semaphore *
- rwsem_down_read_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *
- rwsem_down_write_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *
- rwsem_wake(struct rw_semaphore *);
-extern asmregparm struct rw_semaphore *
- rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * The bias values and the counter type limits the number of
  * potential readers/writers to 32767 for 32 bits and 2147483647
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index da61633..249619e 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -24,11 +24,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8b9c7e9..f7c957f 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -32,6 +32,11 @@ struct rw_semaphore {
 #endif
 };
 
+extern asmregparm struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
+extern asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+
 /* Include the arch specific part */
 #include <asm/rwsem.h>
 
-- 
cgit v0.10.2


From d123375425d7df4b6081a631fc1203fceafa59b2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 21:32:01 +0100
Subject: rwsem: Remove redundant asmregparm annotation

Peter Zijlstra pointed out, that the only user of asmregparm (x86) is
compiling the kernel already with -mregparm=3. So the annotation of
the rwsem functions is redundant. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <alpine.LFD.2.00.1101262130450.31804@localhost6.localdomain6>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index f7c957f..a8afe9c 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -32,10 +32,10 @@ struct rw_semaphore {
 #endif
 };
 
-extern asmregparm struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
-extern asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
+extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
 /* Include the arch specific part */
 #include <asm/rwsem.h>
diff --git a/lib/rwsem.c b/lib/rwsem.c
index f236d7c..aa7c305 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -222,8 +222,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 /*
  * wait for the read lock to be granted
  */
-asmregparm struct rw_semaphore __sched *
-rwsem_down_read_failed(struct rw_semaphore *sem)
+struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 {
 	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ,
 					-RWSEM_ACTIVE_READ_BIAS);
@@ -232,8 +231,7 @@ rwsem_down_read_failed(struct rw_semaphore *sem)
 /*
  * wait for the write lock to be granted
  */
-asmregparm struct rw_semaphore __sched *
-rwsem_down_write_failed(struct rw_semaphore *sem)
+struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 {
 	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE,
 					-RWSEM_ACTIVE_WRITE_BIAS);
@@ -243,7 +241,7 @@ rwsem_down_write_failed(struct rw_semaphore *sem)
  * handle waking up a waiter on the semaphore
  * - up_read/up_write has decremented the active part of count if we come here
  */
-asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
+struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
 	unsigned long flags;
 
@@ -263,7 +261,7 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
  * - caller incremented waiting part of count and discovered it still negative
  * - just wake up any readers at the front of the queue
  */
-asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
+struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 {
 	unsigned long flags;
 
-- 
cgit v0.10.2


From 6ea72f12069306b235151c5b05ac0cca7e1dedfa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 26 Jan 2011 13:36:03 +0100
Subject: sched: Avoid expensive initial update_cfs_load(), on UP too

Fix the build on UP.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 78fa753..477e1bc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7922,8 +7922,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
 	/* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
 	cfs_rq->load_stamp = 1;
 #endif
+#endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
 
-- 
cgit v0.10.2


From dda99116969142cc41e945a1047a419b937536af Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 21 Jan 2011 15:30:01 -0800
Subject: x86, perf: Change two init functions to static

init_hw_perf_events() is called via early_initcall now.
x86_pmu_event_init is x86_pmu member function.

So we can change them to static.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
LKML-Reference: <4D3A16F9.109@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d977a2..4d98789 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1389,7 +1389,7 @@ static void __init pmu_check_apic(void)
 	pr_info("no hardware sampling interrupt available.\n");
 }
 
-int __init init_hw_perf_events(void)
+static int __init init_hw_perf_events(void)
 {
 	struct event_constraint *c;
 	int err;
@@ -1608,7 +1608,7 @@ out:
 	return ret;
 }
 
-int x86_pmu_event_init(struct perf_event *event)
+static int x86_pmu_event_init(struct perf_event *event)
 {
 	struct pmu *tmp;
 	int err;
-- 
cgit v0.10.2


From 8161239a8bcce9ad6b537c04a1fa3b5c68bae693 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 14 Jan 2011 17:09:41 +0800
Subject: rtmutex: Simplify PI algorithm and make highest prio task get lock

In current rtmutex, the pending owner may be boosted by the tasks
in the rtmutex's waitlist when the pending owner is deboosted
or a task in the waitlist is boosted. This boosting is unrelated,
because the pending owner does not really take the rtmutex.
It is not reasonable.

Example.

time1:
A(high prio) onwers the rtmutex.
B(mid prio) and C (low prio) in the waitlist.

time2
A release the lock, B becomes the pending owner
A(or other high prio task) continues to run. B's prio is lower
than A, so B is just queued at the runqueue.

time3
A or other high prio task sleeps, but we have passed some time
The B and C's prio are changed in the period (time2 ~ time3)
due to boosting or deboosting. Now C has the priority higher
than B. ***Is it reasonable that C has to boost B and help B to
get the rtmutex?

NO!! I think, it is unrelated/unneed boosting before B really
owns the rtmutex. We should give C a chance to beat B and
win the rtmutex.

This is the motivation of this patch. This patch *ensures*
only the top waiter or higher priority task can take the lock.

How?
1) we don't dequeue the top waiter when unlock, if the top waiter
   is changed, the old top waiter will fail and go to sleep again.
2) when requiring lock, it will get the lock when the lock is not taken and:
   there is no waiter OR higher priority than waiters OR it is top waiter.
3) In any time, the top waiter is changed, the top waiter will be woken up.

The algorithm is much simpler than before, no pending owner, no
boosting for pending owner.

Other advantage of this patch:
1) The states of a rtmutex are reduced a half, easier to read the code.
2) the codes become shorter.
3) top waiter is not dequeued until it really take the lock:
   they will retain FIFO when it is stolen.

Not advantage nor disadvantage
1) Even we may wakeup multiple waiters(any time when top waiter changed),
   we hardly cause "thundering herd",
   the number of wokenup task is likely 1 or very little.
2) two APIs are changed.
   rt_mutex_owner() will not return pending owner, it will return NULL when
                    the top waiter is going to take the lock.
   rt_mutex_next_owner() always return the top waiter.
	                 will not return NULL if we have waiters
                         because the top waiter is not dequeued.

   I have fixed the code that use these APIs.

need updated after this patch is accepted
1) Document/*
2) the testcase scripts/rt-tester/t4-l2-pi-deboost.tst

Signed-off-by:  Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4D3012D5.4060709@cn.fujitsu.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28..64c3811 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1556,10 +1556,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 
 	/*
 	 * We are here either because we stole the rtmutex from the
-	 * pending owner or we are the pending owner which failed to
-	 * get the rtmutex. We have to replace the pending owner TID
-	 * in the user space variable. This must be atomic as we have
-	 * to preserve the owner died bit here.
+	 * previous highest priority waiter or we are the highest priority
+	 * waiter but failed to get the rtmutex the first time.
+	 * We have to replace the newowner TID in the user space variable.
+	 * This must be atomic as we have to preserve the owner died bit here.
 	 *
 	 * Note: We write the user space value _before_ changing the pi_state
 	 * because we can fault here. Imagine swapped out pages or a fork
@@ -1608,8 +1608,8 @@ retry:
 
 	/*
 	 * To handle the page fault we need to drop the hash bucket
-	 * lock here. That gives the other task (either the pending
-	 * owner itself or the task which stole the rtmutex) the
+	 * lock here. That gives the other task (either the highest priority
+	 * waiter itself or the task which stole the rtmutex) the
 	 * chance to try the fixup of the pi_state. So once we are
 	 * back from handling the fault we need to check the pi_state
 	 * after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1685,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		/*
 		 * pi_state is incorrect, some other task did a lock steal and
 		 * we returned due to timeout or signal without taking the
-		 * rt_mutex. Too late. We can access the rt_mutex_owner without
-		 * locking, as the other task is now blocked on the hash bucket
-		 * lock. Fix the state up.
+		 * rt_mutex. Too late.
 		 */
+		raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
 		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+		if (!owner)
+			owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+		raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
 		ret = fixup_pi_state_owner(uaddr, q, owner);
 		goto out;
 	}
 
 	/*
 	 * Paranoia check. If we did not take the lock, then we should not be
-	 * the owner, nor the pending owner, of the rt_mutex.
+	 * the owner of the rt_mutex.
 	 */
 	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
 		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54..3c7cbc2 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
 	put_pid(waiter->deadlock_task_pid);
 	TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
 	TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
-	TRACE_WARN_ON(waiter->task);
 	memset(waiter, 0x22, sizeof(*waiter));
 }
 
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a960481..ab44911 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
 /*
  * lock->owner state tracking:
  *
- * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
- * are used to keep track of the "owner is pending" and "lock has
- * waiters" state.
+ * lock->owner holds the task_struct pointer of the owner. Bit 0
+ * is used to keep track of the "lock has waiters" state.
  *
- * owner	bit1	bit0
- * NULL		0	0	lock is free (fast acquire possible)
- * NULL		0	1	invalid state
- * NULL		1	0	Transitional State*
- * NULL		1	1	invalid state
- * taskpointer	0	0	lock is held (fast release possible)
- * taskpointer	0	1	task is pending owner
- * taskpointer	1	0	lock is held and has waiters
- * taskpointer	1	1	task is pending owner and lock has more waiters
- *
- * Pending ownership is assigned to the top (highest priority)
- * waiter of the lock, when the lock is released. The thread is woken
- * up and can now take the lock. Until the lock is taken (bit 0
- * cleared) a competing higher priority thread can steal the lock
- * which puts the woken up thread back on the waiters list.
+ * owner	bit0
+ * NULL		0	lock is free (fast acquire possible)
+ * NULL		1	lock is free and has waiters and the top waiter
+ *				is going to take the lock*
+ * taskpointer	0	lock is held (fast release possible)
+ * taskpointer	1	lock is held and has waiters**
  *
  * The fast atomic compare exchange based acquire and release is only
- * possible when bit 0 and 1 of lock->owner are 0.
+ * possible when bit 0 of lock->owner is 0.
+ *
+ * (*) It also can be a transitional state when grabbing the lock
+ * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+ * we need to set the bit0 before looking at the lock, and the owner may be
+ * NULL in this small time, hence this can be a transitional state.
  *
- * (*) There's a small time where the owner can be NULL and the
- * "lock has waiters" bit is set.  This can happen when grabbing the lock.
- * To prevent a cmpxchg of the owner releasing the lock, we need to set this
- * bit before looking at the lock, hence the reason this is a transitional
- * state.
+ * (**) There is a small time when bit 0 is set but there are no
+ * waiters. This can happen when grabbing the lock in the slow path.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to
+ * set this bit before looking at the lock.
  */
 
 static void
-rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
-		   unsigned long mask)
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
 {
-	unsigned long val = (unsigned long)owner | mask;
+	unsigned long val = (unsigned long)owner;
 
 	if (rt_mutex_has_waiters(lock))
 		val |= RT_MUTEX_HAS_WAITERS;
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	 * reached or the state of the chain has changed while we
 	 * dropped the locks.
 	 */
-	if (!waiter || !waiter->task)
+	if (!waiter)
 		goto out_unlock_pi;
 
 	/*
 	 * Check the orig_waiter state. After we dropped the locks,
-	 * the previous owner of the lock might have released the lock
-	 * and made us the pending owner:
+	 * the previous owner of the lock might have released the lock.
 	 */
-	if (orig_waiter && !orig_waiter->task)
+	if (orig_waiter && !rt_mutex_owner(orig_lock))
 		goto out_unlock_pi;
 
 	/*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 
 	/* Release the task */
 	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+	if (!rt_mutex_owner(lock)) {
+		/*
+		 * If the requeue above changed the top waiter, then we need
+		 * to wake the new top waiter up to try to get the lock.
+		 */
+
+		if (top_waiter != rt_mutex_top_waiter(lock))
+			wake_up_process(rt_mutex_top_waiter(lock)->task);
+		raw_spin_unlock(&lock->wait_lock);
+		goto out_put_task;
+	}
 	put_task_struct(task);
 
 	/* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 }
 
 /*
- * Optimization: check if we can steal the lock from the
- * assigned pending owner [which might not have taken the
- * lock yet]:
- */
-static inline int try_to_steal_lock(struct rt_mutex *lock,
-				    struct task_struct *task)
-{
-	struct task_struct *pendowner = rt_mutex_owner(lock);
-	struct rt_mutex_waiter *next;
-	unsigned long flags;
-
-	if (!rt_mutex_owner_pending(lock))
-		return 0;
-
-	if (pendowner == task)
-		return 1;
-
-	raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
-	if (task->prio >= pendowner->prio) {
-		raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
-		return 0;
-	}
-
-	/*
-	 * Check if a waiter is enqueued on the pending owners
-	 * pi_waiters list. Remove it and readjust pending owners
-	 * priority.
-	 */
-	if (likely(!rt_mutex_has_waiters(lock))) {
-		raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
-		return 1;
-	}
-
-	/* No chain handling, pending owner is not blocked on anything: */
-	next = rt_mutex_top_waiter(lock);
-	plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
-	__rt_mutex_adjust_prio(pendowner);
-	raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
-
-	/*
-	 * We are going to steal the lock and a waiter was
-	 * enqueued on the pending owners pi_waiters queue. So
-	 * we have to enqueue this waiter into
-	 * task->pi_waiters list. This covers the case,
-	 * where task is boosted because it holds another
-	 * lock and gets unboosted because the booster is
-	 * interrupted, so we would delay a waiter with higher
-	 * priority as task->normal_prio.
-	 *
-	 * Note: in the rare case of a SCHED_OTHER task changing
-	 * its priority and thus stealing the lock, next->task
-	 * might be task:
-	 */
-	if (likely(next->task != task)) {
-		raw_spin_lock_irqsave(&task->pi_lock, flags);
-		plist_add(&next->pi_list_entry, &task->pi_waiters);
-		__rt_mutex_adjust_prio(task);
-		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-	}
-	return 1;
-}
-
-/*
  * Try to take an rt-mutex
  *
- * This fails
- * - when the lock has a real owner
- * - when a different pending owner exists and has higher priority than current
- *
  * Must be called with lock->wait_lock held.
+ *
+ * @lock:   the lock to be acquired.
+ * @task:   the task which wants to acquire the lock
+ * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
  */
-static int try_to_take_rt_mutex(struct rt_mutex *lock)
+static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+		struct rt_mutex_waiter *waiter)
 {
 	/*
 	 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
 	 */
 	mark_rt_mutex_waiters(lock);
 
-	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
+	if (rt_mutex_owner(lock))
 		return 0;
 
+	/*
+	 * It will get the lock because of one of these conditions:
+	 * 1) there is no waiter
+	 * 2) higher priority than waiters
+	 * 3) it is top waiter
+	 */
+	if (rt_mutex_has_waiters(lock)) {
+		if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
+			if (!waiter || waiter != rt_mutex_top_waiter(lock))
+				return 0;
+		}
+	}
+
+	if (waiter || rt_mutex_has_waiters(lock)) {
+		unsigned long flags;
+		struct rt_mutex_waiter *top;
+
+		raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+		/* remove the queued waiter. */
+		if (waiter) {
+			plist_del(&waiter->list_entry, &lock->wait_list);
+			task->pi_blocked_on = NULL;
+		}
+
+		/*
+		 * We have to enqueue the top waiter(if it exists) into
+		 * task->pi_waiters list.
+		 */
+		if (rt_mutex_has_waiters(lock)) {
+			top = rt_mutex_top_waiter(lock);
+			top->pi_list_entry.prio = top->list_entry.prio;
+			plist_add(&top->pi_list_entry, &task->pi_waiters);
+		}
+		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+	}
+
 	/* We got the lock. */
 	debug_rt_mutex_lock(lock);
 
-	rt_mutex_set_owner(lock, current, 0);
+	rt_mutex_set_owner(lock, task);
 
-	rt_mutex_deadlock_account_lock(lock, current);
+	rt_mutex_deadlock_account_lock(lock, task);
 
 	return 1;
 }
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 
 	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 
+	if (!owner)
+		return 0;
+
 	if (waiter == rt_mutex_top_waiter(lock)) {
 		raw_spin_lock_irqsave(&owner->pi_lock, flags);
 		plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 /*
  * Wake up the next waiter on the lock.
  *
- * Remove the top waiter from the current tasks waiter list and from
- * the lock waiter list. Set it as pending owner. Then wake it up.
+ * Remove the top waiter from the current tasks waiter list and wake it up.
  *
  * Called with lock->wait_lock held.
  */
 static void wakeup_next_waiter(struct rt_mutex *lock)
 {
 	struct rt_mutex_waiter *waiter;
-	struct task_struct *pendowner;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&current->pi_lock, flags);
 
 	waiter = rt_mutex_top_waiter(lock);
-	plist_del(&waiter->list_entry, &lock->wait_list);
 
 	/*
 	 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
 	 * lock->wait_lock.
 	 */
 	plist_del(&waiter->pi_list_entry, &current->pi_waiters);
-	pendowner = waiter->task;
-	waiter->task = NULL;
 
-	rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+	rt_mutex_set_owner(lock, NULL);
 
 	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
 
-	/*
-	 * Clear the pi_blocked_on variable and enqueue a possible
-	 * waiter into the pi_waiters list of the pending owner. This
-	 * prevents that in case the pending owner gets unboosted a
-	 * waiter with higher priority than pending-owner->normal_prio
-	 * is blocked on the unboosted (pending) owner.
-	 */
-	raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
-
-	WARN_ON(!pendowner->pi_blocked_on);
-	WARN_ON(pendowner->pi_blocked_on != waiter);
-	WARN_ON(pendowner->pi_blocked_on->lock != lock);
-
-	pendowner->pi_blocked_on = NULL;
-
-	if (rt_mutex_has_waiters(lock)) {
-		struct rt_mutex_waiter *next;
-
-		next = rt_mutex_top_waiter(lock);
-		plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
-	}
-	raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
-
-	wake_up_process(pendowner);
+	wake_up_process(waiter->task);
 }
 
 /*
- * Remove a waiter from a lock
+ * Remove a waiter from a lock and give up
  *
- * Must be called with lock->wait_lock held
+ * Must be called with lock->wait_lock held and
+ * have just failed to try_to_take_rt_mutex().
  */
 static void remove_waiter(struct rt_mutex *lock,
 			  struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
 
 	raw_spin_lock_irqsave(&current->pi_lock, flags);
 	plist_del(&waiter->list_entry, &lock->wait_list);
-	waiter->task = NULL;
 	current->pi_blocked_on = NULL;
 	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
 
-	if (first && owner != current) {
+	if (!owner)
+		return;
+
+	if (first) {
 
 		raw_spin_lock_irqsave(&owner->pi_lock, flags);
 
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
  * 			 or TASK_UNINTERRUPTIBLE)
  * @timeout:		 the pre-initialized and started timer, or NULL for none
  * @waiter:		 the pre-initialized rt_mutex_waiter
- * @detect_deadlock:	 passed to task_blocks_on_rt_mutex
  *
  * lock->wait_lock must be held by the caller.
  */
 static int __sched
 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
 		    struct hrtimer_sleeper *timeout,
-		    struct rt_mutex_waiter *waiter,
-		    int detect_deadlock)
+		    struct rt_mutex_waiter *waiter)
 {
 	int ret = 0;
 
 	for (;;) {
 		/* Try to acquire the lock: */
-		if (try_to_take_rt_mutex(lock))
+		if (try_to_take_rt_mutex(lock, current, waiter))
 			break;
 
 		/*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
 				break;
 		}
 
-		/*
-		 * waiter->task is NULL the first time we come here and
-		 * when we have been woken up by the previous owner
-		 * but the lock got stolen by a higher prio task.
-		 */
-		if (!waiter->task) {
-			ret = task_blocks_on_rt_mutex(lock, waiter, current,
-						      detect_deadlock);
-			/*
-			 * If we got woken up by the owner then start loop
-			 * all over without going into schedule to try
-			 * to get the lock now:
-			 */
-			if (unlikely(!waiter->task)) {
-				/*
-				 * Reset the return value. We might
-				 * have returned with -EDEADLK and the
-				 * owner released the lock while we
-				 * were walking the pi chain.
-				 */
-				ret = 0;
-				continue;
-			}
-			if (unlikely(ret))
-				break;
-		}
-
 		raw_spin_unlock(&lock->wait_lock);
 
 		debug_rt_mutex_print_deadlock(waiter);
 
-		if (waiter->task)
-			schedule_rt_mutex(lock);
+		schedule_rt_mutex(lock);
 
 		raw_spin_lock(&lock->wait_lock);
 		set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	int ret = 0;
 
 	debug_rt_mutex_init_waiter(&waiter);
-	waiter.task = NULL;
 
 	raw_spin_lock(&lock->wait_lock);
 
 	/* Try to acquire the lock again: */
-	if (try_to_take_rt_mutex(lock)) {
+	if (try_to_take_rt_mutex(lock, current, NULL)) {
 		raw_spin_unlock(&lock->wait_lock);
 		return 0;
 	}
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 			timeout->task = NULL;
 	}
 
-	ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
-				  detect_deadlock);
+	ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
+
+	if (likely(!ret))
+		ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
 
 	set_current_state(TASK_RUNNING);
 
-	if (unlikely(waiter.task))
+	if (unlikely(ret))
 		remove_waiter(lock, &waiter);
 
 	/*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	if (unlikely(timeout))
 		hrtimer_cancel(&timeout->timer);
 
-	/*
-	 * Readjust priority, when we did not get the lock. We might
-	 * have been the pending owner and boosted. Since we did not
-	 * take the lock, the PI boost has to go.
-	 */
-	if (unlikely(ret))
-		rt_mutex_adjust_prio(current);
-
 	debug_rt_mutex_free_waiter(&waiter);
 
 	return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
 
 	if (likely(rt_mutex_owner(lock) != current)) {
 
-		ret = try_to_take_rt_mutex(lock);
+		ret = try_to_take_rt_mutex(lock, current, NULL);
 		/*
 		 * try_to_take_rt_mutex() sets the lock waiters
 		 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 {
 	__rt_mutex_init(lock, NULL);
 	debug_rt_mutex_proxy_lock(lock, proxy_owner);
-	rt_mutex_set_owner(lock, proxy_owner, 0);
+	rt_mutex_set_owner(lock, proxy_owner);
 	rt_mutex_deadlock_account_lock(lock, proxy_owner);
 }
 
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 			   struct task_struct *proxy_owner)
 {
 	debug_rt_mutex_proxy_unlock(lock);
-	rt_mutex_set_owner(lock, NULL, 0);
+	rt_mutex_set_owner(lock, NULL);
 	rt_mutex_deadlock_account_unlock(proxy_owner);
 }
 
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 
 	raw_spin_lock(&lock->wait_lock);
 
-	mark_rt_mutex_waiters(lock);
-
-	if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
-		/* We got the lock for task. */
-		debug_rt_mutex_lock(lock);
-		rt_mutex_set_owner(lock, task, 0);
+	if (try_to_take_rt_mutex(lock, task, NULL)) {
 		raw_spin_unlock(&lock->wait_lock);
-		rt_mutex_deadlock_account_lock(lock, task);
 		return 1;
 	}
 
 	ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
 
-	if (ret && !waiter->task) {
+	if (ret && !rt_mutex_owner(lock)) {
 		/*
 		 * Reset the return value. We might have
 		 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 		 */
 		ret = 0;
 	}
+
+	if (unlikely(ret))
+		remove_waiter(lock, waiter);
+
 	raw_spin_unlock(&lock->wait_lock);
 
 	debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 
 	set_current_state(TASK_INTERRUPTIBLE);
 
-	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
-				  detect_deadlock);
+	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
 
 	set_current_state(TASK_RUNNING);
 
-	if (unlikely(waiter->task))
+	if (unlikely(ret))
 		remove_waiter(lock, waiter);
 
 	/*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 
 	raw_spin_unlock(&lock->wait_lock);
 
-	/*
-	 * Readjust priority, when we did not get the lock. We might have been
-	 * the pending owner and boosted. Since we did not take the lock, the
-	 * PI boost has to go.
-	 */
-	if (unlikely(ret))
-		rt_mutex_adjust_prio(current);
-
 	return ret;
 }
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81..53a66c8 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
 /*
  * lock->owner state tracking:
  */
-#define RT_MUTEX_OWNER_PENDING	1UL
-#define RT_MUTEX_HAS_WAITERS	2UL
-#define RT_MUTEX_OWNER_MASKALL	3UL
+#define RT_MUTEX_HAS_WAITERS	1UL
+#define RT_MUTEX_OWNER_MASKALL	1UL
 
 static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
 {
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
 		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
 }
 
-static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
-{
-	return (struct task_struct *)
-		((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
-}
-
-static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
-{
-	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
-}
-
 /*
  * PI-futex support (proxy locking functions, etc.):
  */
-- 
cgit v0.10.2


From 68baa431ec2f14ba7510d4e79bceb6ceaf0d3b74 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 20 Jan 2011 23:15:30 +0900
Subject: perf tools: Add strfilter for general purpose string filter

Add strfilter for general purpose string filter.

Every filter rules are descrived by glob matching pattern and '!' prefix
which means Logical NOT.

A strfilter consists of those filter rules connected with '&' and '|'.

A set of rules can be folded by using '(' and ')'.

It also accepts spaces around rules and those operators.

Format:
<rule> ::= <glob-exp> | "!" <rule> | <rule> <op> <rule> | "(" <rule> ")"
<op> ::= "&" | "|"

e.g.:

 "(add* | del*) & *timer" filter rules pass strings which start with add
 or del and end with timer.

This will be used by perf probe --filter.

Changes in V2:
 - Fix to check result of strdup() and strfilter__alloc().
 - Encapsulate and simplify interfaces as like regex(3).

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110120141530.25915.12673.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 638e8e1..eedcf95 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -417,6 +417,7 @@ LIB_H += util/help.h
 LIB_H += util/session.h
 LIB_H += util/strbuf.h
 LIB_H += util/strlist.h
+LIB_H += util/strfilter.h
 LIB_H += util/svghelper.h
 LIB_H += util/run-command.h
 LIB_H += util/sigchain.h
@@ -458,6 +459,7 @@ LIB_OBJS += $(OUTPUT)util/quote.o
 LIB_OBJS += $(OUTPUT)util/strbuf.o
 LIB_OBJS += $(OUTPUT)util/string.o
 LIB_OBJS += $(OUTPUT)util/strlist.o
+LIB_OBJS += $(OUTPUT)util/strfilter.o
 LIB_OBJS += $(OUTPUT)util/usage.o
 LIB_OBJS += $(OUTPUT)util/wrapper.o
 LIB_OBJS += $(OUTPUT)util/sigchain.o
diff --git a/tools/perf/util/strfilter.c b/tools/perf/util/strfilter.c
new file mode 100644
index 0000000..4064b7d
--- /dev/null
+++ b/tools/perf/util/strfilter.c
@@ -0,0 +1,200 @@
+#include <ctype.h>
+#include "util.h"
+#include "string.h"
+#include "strfilter.h"
+
+/* Operators */
+static const char *OP_and	= "&";	/* Logical AND */
+static const char *OP_or	= "|";	/* Logical OR */
+static const char *OP_not	= "!";	/* Logical NOT */
+
+#define is_operator(c)	((c) == '|' || (c) == '&' || (c) == '!')
+#define is_separator(c)	(is_operator(c) || (c) == '(' || (c) == ')')
+
+static void strfilter_node__delete(struct strfilter_node *self)
+{
+	if (self) {
+		if (self->p && !is_operator(*self->p))
+			free((char *)self->p);
+		strfilter_node__delete(self->l);
+		strfilter_node__delete(self->r);
+		free(self);
+	}
+}
+
+void strfilter__delete(struct strfilter *self)
+{
+	if (self) {
+		strfilter_node__delete(self->root);
+		free(self);
+	}
+}
+
+static const char *get_token(const char *s, const char **e)
+{
+	const char *p;
+
+	while (isspace(*s))	/* Skip spaces */
+		s++;
+
+	if (*s == '\0') {
+		p = s;
+		goto end;
+	}
+
+	p = s + 1;
+	if (!is_separator(*s)) {
+		/* End search */
+retry:
+		while (*p && !is_separator(*p) && !isspace(*p))
+			p++;
+		/* Escape and special case: '!' is also used in glob pattern */
+		if (*(p - 1) == '\\' || (*p == '!' && *(p - 1) == '[')) {
+			p++;
+			goto retry;
+		}
+	}
+end:
+	*e = p;
+	return s;
+}
+
+static struct strfilter_node *strfilter_node__alloc(const char *op,
+						    struct strfilter_node *l,
+						    struct strfilter_node *r)
+{
+	struct strfilter_node *ret = zalloc(sizeof(struct strfilter_node));
+
+	if (ret) {
+		ret->p = op;
+		ret->l = l;
+		ret->r = r;
+	}
+
+	return ret;
+}
+
+static struct strfilter_node *strfilter_node__new(const char *s,
+						  const char **ep)
+{
+	struct strfilter_node root, *cur, *last_op;
+	const char *e;
+
+	if (!s)
+		return NULL;
+
+	memset(&root, 0, sizeof(root));
+	last_op = cur = &root;
+
+	s = get_token(s, &e);
+	while (*s != '\0' && *s != ')') {
+		switch (*s) {
+		case '&':	/* Exchg last OP->r with AND */
+			if (!cur->r || !last_op->r)
+				goto error;
+			cur = strfilter_node__alloc(OP_and, last_op->r, NULL);
+			if (!cur)
+				goto nomem;
+			last_op->r = cur;
+			last_op = cur;
+			break;
+		case '|':	/* Exchg the root with OR */
+			if (!cur->r || !root.r)
+				goto error;
+			cur = strfilter_node__alloc(OP_or, root.r, NULL);
+			if (!cur)
+				goto nomem;
+			root.r = cur;
+			last_op = cur;
+			break;
+		case '!':	/* Add NOT as a leaf node */
+			if (cur->r)
+				goto error;
+			cur->r = strfilter_node__alloc(OP_not, NULL, NULL);
+			if (!cur->r)
+				goto nomem;
+			cur = cur->r;
+			break;
+		case '(':	/* Recursively parses inside the parenthesis */
+			if (cur->r)
+				goto error;
+			cur->r = strfilter_node__new(s + 1, &s);
+			if (!s)
+				goto nomem;
+			if (!cur->r || *s != ')')
+				goto error;
+			e = s + 1;
+			break;
+		default:
+			if (cur->r)
+				goto error;
+			cur->r = strfilter_node__alloc(NULL, NULL, NULL);
+			if (!cur->r)
+				goto nomem;
+			cur->r->p = strndup(s, e - s);
+			if (!cur->r->p)
+				goto nomem;
+		}
+		s = get_token(e, &e);
+	}
+	if (!cur->r)
+		goto error;
+	*ep = s;
+	return root.r;
+nomem:
+	s = NULL;
+error:
+	*ep = s;
+	strfilter_node__delete(root.r);
+	return NULL;
+}
+
+/*
+ * Parse filter rule and return new strfilter.
+ * Return NULL if fail, and *ep == NULL if memory allocation failed.
+ */
+struct strfilter *strfilter__new(const char *rules, const char **err)
+{
+	struct strfilter *ret = zalloc(sizeof(struct strfilter));
+	const char *ep = NULL;
+
+	if (ret)
+		ret->root = strfilter_node__new(rules, &ep);
+
+	if (!ret || !ret->root || *ep != '\0') {
+		if (err)
+			*err = ep;
+		strfilter__delete(ret);
+		ret = NULL;
+	}
+
+	return ret;
+}
+
+static bool strfilter_node__compare(struct strfilter_node *self,
+				    const char *str)
+{
+	if (!self || !self->p)
+		return false;
+
+	switch (*self->p) {
+	case '|':	/* OR */
+		return strfilter_node__compare(self->l, str) ||
+			strfilter_node__compare(self->r, str);
+	case '&':	/* AND */
+		return strfilter_node__compare(self->l, str) &&
+			strfilter_node__compare(self->r, str);
+	case '!':	/* NOT */
+		return !strfilter_node__compare(self->r, str);
+	default:
+		return strglobmatch(str, self->p);
+	}
+}
+
+/* Return true if STR matches the filter rules */
+bool strfilter__compare(struct strfilter *self, const char *str)
+{
+	if (!self)
+		return false;
+	return strfilter_node__compare(self->root, str);
+}
diff --git a/tools/perf/util/strfilter.h b/tools/perf/util/strfilter.h
new file mode 100644
index 0000000..00f58a7
--- /dev/null
+++ b/tools/perf/util/strfilter.h
@@ -0,0 +1,48 @@
+#ifndef __PERF_STRFILTER_H
+#define __PERF_STRFILTER_H
+/* General purpose glob matching filter */
+
+#include <linux/list.h>
+#include <stdbool.h>
+
+/* A node of string filter */
+struct strfilter_node {
+	struct strfilter_node *l;	/* Tree left branche (for &,|) */
+	struct strfilter_node *r;	/* Tree right branche (for !,&,|) */
+	const char *p;		/* Operator or rule */
+};
+
+/* String filter */
+struct strfilter {
+	struct strfilter_node *root;
+};
+
+/**
+ * strfilter__new - Create a new string filter
+ * @rules: Filter rule, which is a combination of glob expressions.
+ * @err: Pointer which points an error detected on @rules
+ *
+ * Parse @rules and return new strfilter. Return NULL if an error detected.
+ * In that case, *@err will indicate where it is detected, and *@err is NULL
+ * if a memory allocation is failed.
+ */
+struct strfilter *strfilter__new(const char *rules, const char **err);
+
+/**
+ * strfilter__compare - compare given string and a string filter
+ * @self: String filter
+ * @str: target string
+ *
+ * Compare @str and @self. Return true if the str match the rule
+ */
+bool strfilter__compare(struct strfilter *self, const char *str);
+
+/**
+ * strfilter__delete - delete a string filter
+ * @self: String filter to delete
+ *
+ * Delete @self.
+ */
+void strfilter__delete(struct strfilter *self);
+
+#endif
-- 
cgit v0.10.2


From bd09d7b5efeb13965b6725b4a3e9944908bca9d2 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 20 Jan 2011 23:15:39 +0900
Subject: perf probe: Add variable filter support

Add filters support for available variable list.

Default filter is "!__k???tab_*&!__crc_*" for filtering out
automatically generated symbols.

The format of filter rule is "[!]GLOBPATTERN", so you can use wild
cards. If the filter rule starts with '!', matched variables are filter
out.

e.g.:
 # perf probe -V schedule --externs --filter=cpu*
Available variables at schedule
        @<schedule+0>
                cpumask_var_t   cpu_callout_mask
                cpumask_var_t   cpu_core_map
                cpumask_var_t   cpu_isolated_map
                cpumask_var_t   cpu_sibling_map
                int     cpu_number
                long unsigned int*      cpu_bit_bitmap
		...

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Chase Douglas <chase.douglas@canonical.com>
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110120141539.25915.43401.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ committer note: Removed the elf.h include as it was fixed up in e80711c]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index fcc51fe..32fb18f 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -77,6 +77,12 @@ OPTIONS
 --funcs::
 	Show available functions in given module or kernel.
 
+--filter=FILTER::
+	(Only for --vars) Set filter for variables. FILTER is a combination of
+	glob pattern, see FILTER PATTERN for details.
+	Default FILTER is "!__k???tab_* & !__crc_*".
+	If several filters are specified, only the last filter is valid.
+
 -f::
 --force::
 	Forcibly add events with existing name.
@@ -139,6 +145,14 @@ e.g.
 
 This provides some sort of flexibility and robustness to probe point definitions against minor code changes. For example, actual 10th line of schedule() can be moved easily by modifying schedule(), but the same line matching 'rq=cpu_rq*' may still exist in the function.)
 
+FILTER PATTERN
+--------------
+ The filter pattern is a glob matching pattern(s) to filter variables.
+ In addition, you can use "!" for specifying filter-out rule. You also can give several rules combined with "&" or "|", and fold those rules as one rule by using "(" ")".
+
+e.g.
+ With --filter "foo* | bar*", perf probe -V shows variables which start with "foo" or "bar".
+ With --filter "!foo* & *bar", perf probe -V shows variables which don't start with "foo" and end with "bar", like "fizzbar". But "foobar" is filtered out.
 
 EXAMPLES
 --------
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 6cf708a..abb423e 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -36,6 +36,7 @@
 #include "builtin.h"
 #include "util/util.h"
 #include "util/strlist.h"
+#include "util/strfilter.h"
 #include "util/symbol.h"
 #include "util/debug.h"
 #include "util/debugfs.h"
@@ -43,6 +44,7 @@
 #include "util/probe-finder.h"
 #include "util/probe-event.h"
 
+#define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*"
 #define MAX_PATH_LEN 256
 
 /* Session management structure */
@@ -60,6 +62,7 @@ static struct {
 	struct line_range line_range;
 	const char *target_module;
 	int max_probe_points;
+	struct strfilter *filter;
 } params;
 
 /* Parse an event definition. Note that any error must die. */
@@ -156,6 +159,27 @@ static int opt_show_vars(const struct option *opt __used,
 
 	return ret;
 }
+
+static int opt_set_filter(const struct option *opt __used,
+			  const char *str, int unset __used)
+{
+	const char *err;
+
+	if (str) {
+		pr_debug2("Set filter: %s\n", str);
+		if (params.filter)
+			strfilter__delete(params.filter);
+		params.filter = strfilter__new(str, &err);
+		if (!params.filter) {
+			pr_err("Filter parse error at %ld.\n", err - str + 1);
+			pr_err("Source: \"%s\"\n", str);
+			pr_err("         %*c\n", (int)(err - str + 1), '^');
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
 #endif
 
 static const char * const probe_usage[] = {
@@ -212,6 +236,10 @@ static const struct option options[] = {
 		     "Show accessible variables on PROBEDEF", opt_show_vars),
 	OPT_BOOLEAN('\0', "externs", &params.show_ext_vars,
 		    "Show external variables too (with --vars only)"),
+	OPT_CALLBACK('\0', "filter", NULL,
+		     "[!]FILTER", "Set a variable filter (with --vars only)\n"
+		     "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\")",
+		     opt_set_filter),
 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
 		   "file", "vmlinux pathname"),
 	OPT_STRING('s', "source", &symbol_conf.source_prefix,
@@ -324,10 +352,16 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 			       " --add/--del.\n");
 			usage_with_options(probe_usage, options);
 		}
+		if (!params.filter)
+			params.filter = strfilter__new(DEFAULT_VAR_FILTER,
+						       NULL);
+
 		ret = show_available_vars(params.events, params.nevents,
 					  params.max_probe_points,
 					  params.target_module,
+					  params.filter,
 					  params.show_ext_vars);
+		strfilter__delete(params.filter);
 		if (ret < 0)
 			pr_err("  Error: Failed to show vars. (%d)\n", ret);
 		return ret;
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 859d377..077e051 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -451,12 +451,14 @@ end:
 }
 
 static int show_available_vars_at(int fd, struct perf_probe_event *pev,
-				  int max_vls, bool externs)
+				  int max_vls, struct strfilter *_filter,
+				  bool externs)
 {
 	char *buf;
-	int ret, i;
+	int ret, i, nvars;
 	struct str_node *node;
 	struct variable_list *vls = NULL, *vl;
+	const char *var;
 
 	buf = synthesize_perf_probe_point(&pev->point);
 	if (!buf)
@@ -464,36 +466,45 @@ static int show_available_vars_at(int fd, struct perf_probe_event *pev,
 	pr_debug("Searching variables at %s\n", buf);
 
 	ret = find_available_vars_at(fd, pev, &vls, max_vls, externs);
-	if (ret > 0) {
-		/* Some variables were found */
-		fprintf(stdout, "Available variables at %s\n", buf);
-		for (i = 0; i < ret; i++) {
-			vl = &vls[i];
-			/*
-			 * A probe point might be converted to
-			 * several trace points.
-			 */
-			fprintf(stdout, "\t@<%s+%lu>\n", vl->point.symbol,
-				vl->point.offset);
-			free(vl->point.symbol);
-			if (vl->vars) {
-				strlist__for_each(node, vl->vars)
+	if (ret <= 0) {
+		pr_err("Failed to find variables at %s (%d)\n", buf, ret);
+		goto end;
+	}
+	/* Some variables are found */
+	fprintf(stdout, "Available variables at %s\n", buf);
+	for (i = 0; i < ret; i++) {
+		vl = &vls[i];
+		/*
+		 * A probe point might be converted to
+		 * several trace points.
+		 */
+		fprintf(stdout, "\t@<%s+%lu>\n", vl->point.symbol,
+			vl->point.offset);
+		free(vl->point.symbol);
+		nvars = 0;
+		if (vl->vars) {
+			strlist__for_each(node, vl->vars) {
+				var = strchr(node->s, '\t') + 1;
+				if (strfilter__compare(_filter, var)) {
 					fprintf(stdout, "\t\t%s\n", node->s);
-				strlist__delete(vl->vars);
-			} else
-				fprintf(stdout, "(No variables)\n");
+					nvars++;
+				}
+			}
+			strlist__delete(vl->vars);
 		}
-		free(vls);
-	} else
-		pr_err("Failed to find variables at %s (%d)\n", buf, ret);
-
+		if (nvars == 0)
+			fprintf(stdout, "\t\t(No matched variables)\n");
+	}
+	free(vls);
+end:
 	free(buf);
 	return ret;
 }
 
 /* Show available variables on given probe point */
 int show_available_vars(struct perf_probe_event *pevs, int npevs,
-			int max_vls, const char *module, bool externs)
+			int max_vls, const char *module,
+			struct strfilter *_filter, bool externs)
 {
 	int i, fd, ret = 0;
 
@@ -510,7 +521,8 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
 	setup_pager();
 
 	for (i = 0; i < npevs && ret >= 0; i++)
-		ret = show_available_vars_at(fd, &pevs[i], max_vls, externs);
+		ret = show_available_vars_at(fd, &pevs[i], max_vls, _filter,
+					     externs);
 
 	close(fd);
 	return ret;
@@ -556,7 +568,9 @@ int show_line_range(struct line_range *lr __unused, const char *module __unused)
 
 int show_available_vars(struct perf_probe_event *pevs __unused,
 			int npevs __unused, int max_vls __unused,
-			const char *module __unused, bool externs __unused)
+			const char *module __unused,
+			struct strfilter *filter __unused,
+			bool externs __unused)
 {
 	pr_warning("Debuginfo-analysis is not supported.\n");
 	return -ENOSYS;
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 1fb4f18..4e80b2b 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -3,6 +3,7 @@
 
 #include <stdbool.h>
 #include "strlist.h"
+#include "strfilter.h"
 
 extern bool probe_event_dry_run;
 
@@ -126,7 +127,7 @@ extern int show_perf_probe_events(void);
 extern int show_line_range(struct line_range *lr, const char *module);
 extern int show_available_vars(struct perf_probe_event *pevs, int npevs,
 			       int max_probe_points, const char *module,
-			       bool externs);
+			       struct strfilter *filter, bool externs);
 extern int show_available_funcs(const char *module);
 
 
-- 
cgit v0.10.2


From 3c42258c9a4db70133fa6946a275b62a16792bb5 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 20 Jan 2011 23:15:45 +0900
Subject: perf probe: Add filters support for available functions

Add filters support for available function list.

Default filter is "!_*" for filtering out local-purpose symbols.

e.g.:
 # perf probe --filter="add*" -F
add_disk
add_disk_randomness
add_input_randomness
add_interrupt_randomness
add_memory
add_page_to_unevictable_list
add_page_wait_queue
...

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Chase Douglas <chase.douglas@canonical.com>
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110120141545.25915.85930.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 32fb18f..81c3220 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -78,10 +78,11 @@ OPTIONS
 	Show available functions in given module or kernel.
 
 --filter=FILTER::
-	(Only for --vars) Set filter for variables. FILTER is a combination of
-	glob pattern, see FILTER PATTERN for details.
-	Default FILTER is "!__k???tab_* & !__crc_*".
-	If several filters are specified, only the last filter is valid.
+	(Only for --vars and --funcs) Set filter. FILTER is a combination of glob
+	pattern, see FILTER PATTERN for detail.
+	Default FILTER is "!__k???tab_* & !__crc_*" for --vars, and "!_*"
+	for --funcs.
+	If several filters are specified, only the last filter is used.
 
 -f::
 --force::
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index abb423e..fcde003 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -45,6 +45,7 @@
 #include "util/probe-event.h"
 
 #define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*"
+#define DEFAULT_FUNC_FILTER "!_*"
 #define MAX_PATH_LEN 256
 
 /* Session management structure */
@@ -159,6 +160,7 @@ static int opt_show_vars(const struct option *opt __used,
 
 	return ret;
 }
+#endif
 
 static int opt_set_filter(const struct option *opt __used,
 			  const char *str, int unset __used)
@@ -180,7 +182,6 @@ static int opt_set_filter(const struct option *opt __used,
 
 	return 0;
 }
-#endif
 
 static const char * const probe_usage[] = {
 	"perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
@@ -236,10 +237,6 @@ static const struct option options[] = {
 		     "Show accessible variables on PROBEDEF", opt_show_vars),
 	OPT_BOOLEAN('\0', "externs", &params.show_ext_vars,
 		    "Show external variables too (with --vars only)"),
-	OPT_CALLBACK('\0', "filter", NULL,
-		     "[!]FILTER", "Set a variable filter (with --vars only)\n"
-		     "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\")",
-		     opt_set_filter),
 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
 		   "file", "vmlinux pathname"),
 	OPT_STRING('s', "source", &symbol_conf.source_prefix,
@@ -252,6 +249,11 @@ static const struct option options[] = {
 		 "Set how many probe points can be found for a probe."),
 	OPT_BOOLEAN('F', "funcs", &params.show_funcs,
 		    "Show potential probe-able functions."),
+	OPT_CALLBACK('\0', "filter", NULL,
+		     "[!]FILTER", "Set a filter (with --vars/funcs only)\n"
+		     "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\" for --vars,\n"
+		     "\t\t\t \"" DEFAULT_FUNC_FILTER "\" for --funcs)",
+		     opt_set_filter),
 	OPT_END()
 };
 
@@ -322,7 +324,12 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 			pr_err("  Error: Don't use --funcs with --vars.\n");
 			usage_with_options(probe_usage, options);
 		}
-		ret = show_available_funcs(params.target_module);
+		if (!params.filter)
+			params.filter = strfilter__new(DEFAULT_FUNC_FILTER,
+						       NULL);
+		ret = show_available_funcs(params.target_module,
+					   params.filter);
+		strfilter__delete(params.filter);
 		if (ret < 0)
 			pr_err("  Error: Failed to show functions."
 			       " (%d)\n", ret);
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 077e051..9d237e3 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1951,21 +1951,23 @@ int del_perf_probe_events(struct strlist *dellist)
 
 	return ret;
 }
+/* TODO: don't use a global variable for filter ... */
+static struct strfilter *available_func_filter;
 
 /*
- * If a symbol corresponds to a function with global binding return 0.
- * For all others return 1.
+ * If a symbol corresponds to a function with global binding and
+ * matches filter return 0. For all others return 1.
  */
-static int filter_non_global_functions(struct map *map __unused,
-					struct symbol *sym)
+static int filter_available_functions(struct map *map __unused,
+				      struct symbol *sym)
 {
-	if (sym->binding != STB_GLOBAL)
-		return 1;
-
-	return 0;
+	if (sym->binding == STB_GLOBAL &&
+	    strfilter__compare(available_func_filter, sym->name))
+		return 0;
+	return 1;
 }
 
-int show_available_funcs(const char *module)
+int show_available_funcs(const char *module, struct strfilter *_filter)
 {
 	struct map *map;
 	int ret;
@@ -1981,7 +1983,8 @@ int show_available_funcs(const char *module)
 		pr_err("Failed to find %s map.\n", (module) ? : "kernel");
 		return -EINVAL;
 	}
-	if (map__load(map, filter_non_global_functions)) {
+	available_func_filter = _filter;
+	if (map__load(map, filter_available_functions)) {
 		pr_err("Failed to load map.\n");
 		return -EINVAL;
 	}
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 4e80b2b..3434fc9 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -128,7 +128,7 @@ extern int show_line_range(struct line_range *lr, const char *module);
 extern int show_available_vars(struct perf_probe_event *pevs, int npevs,
 			       int max_probe_points, const char *module,
 			       struct strfilter *filter, bool externs);
-extern int show_available_funcs(const char *module);
+extern int show_available_funcs(const char *module, struct strfilter *filter);
 
 
 /* Maximum index number of event-name postfix */
-- 
cgit v0.10.2


From 54489c189b1a0c10eaf21c6d2c5916b50442c871 Mon Sep 17 00:00:00 2001
From: Han Pingtian <phan@redhat.com>
Date: Tue, 25 Jan 2011 07:39:00 +0800
Subject: perf test: Fix return values checking

Fixing some cut'n'paste mistakes.

LKML-Reference: <20110124233900.GA3443@epc900.nay.redhat.com>
Signed-off-by: Han Pingtian <phan@redhat.com>
[ committer note: I had already removed the CPU_ALLOC calls ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 231e3e2..738830d 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -495,8 +495,8 @@ static int test__basic_mmap(void)
 	}
 
 	cpus = cpu_map__new(NULL);
-	if (threads == NULL) {
-		pr_debug("thread_map__new\n");
+	if (cpus == NULL) {
+		pr_debug("cpu_map__new\n");
 		goto out_free_threads;
 	}
 
@@ -510,7 +510,7 @@ static int test__basic_mmap(void)
 	}
 
 	evlist = perf_evlist__new();
-	if (threads == NULL) {
+	if (evlist == NULL) {
 		pr_debug("perf_evlist__new\n");
 		goto out_free_cpus;
 	}
-- 
cgit v0.10.2


From bd22a2f1982fa3e90ce7d5d011c37d88aa67e73c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:27 +0100
Subject: x86: Kill unused static boot_cpu_logical_apicid in smpboot.c

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-2-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0cbe8c0..ee9203a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -165,8 +165,6 @@ static void unmap_cpu_to_node(int cpu)
 #endif
 
 #ifdef CONFIG_X86_32
-static int boot_cpu_logical_apicid;
-
 u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
 					{ [0 ... NR_CPUS-1] = BAD_APICID };
 
@@ -1096,9 +1094,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	 * Setup boot CPU information
 	 */
 	smp_store_cpu_info(0); /* Final full version of the data */
-#ifdef CONFIG_X86_32
-	boot_cpu_logical_apicid = logical_smp_processor_id();
-#endif
+
 	current_thread_info()->cpu = 0;  /* needed? */
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
-- 
cgit v0.10.2


From b78aa66b1fe4179d28e3f6502dc179773519a1bb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:28 +0100
Subject: x86: Drop x86_32 MAX_APICID

Commit 56d91f13 (x86, acpi: Add MAX_LOCAL_APIC for 32bit) added
MAX_LOCAL_APIC for x86_32 but didn't replace MAX_APICID users
with it. Convert MAX_APICID users to MAX_LOCAL_APIC and drop
MAX_APICID.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-3-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 0c90dd9..edc2a45 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -33,8 +33,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES];
 extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
 #endif
 
-#define MAX_APICID		256
-
 #else /* CONFIG_X86_64: */
 
 #define MAX_MP_BUSSES		256
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ee9203a..53a85ba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -72,7 +72,7 @@
 #include <asm/i8259.h>
 
 #ifdef CONFIG_X86_32
-u8 apicid_2_node[MAX_APICID];
+u8 apicid_2_node[MAX_LOCAL_APIC];
 #endif
 
 /* State of each CPU */
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index ae96e7b8..6027a48 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -57,7 +57,7 @@ struct node_memory_chunk_s {
 static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
 
 static int __initdata num_memory_chunks; /* total number of memory chunks */
-static u8 __initdata apicid_to_pxm[MAX_APICID];
+static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
 
 int acpi_numa __initdata;
 
@@ -254,7 +254,7 @@ int __init get_memcfg_from_srat(void)
 	printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
 			 num_memory_chunks);
 
-	for (i = 0; i < MAX_APICID; i++)
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
 
 	for (j = 0; j < num_memory_chunks; j++){
-- 
cgit v0.10.2


From 1245e1668c6e52bee76a423f8fab3bfcdd6226ae Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:29 +0100
Subject: x86: Make default_send_IPI_mask_sequence/allbutself_logical() 32bit
 only

Both functions are used only in 32bit.  Put them inside
CONFIG_X86_32. This is to prepare for logical apicid handling
update.

- Cyrill Gorcunov spotted that I forgot to move declarations in
ipi.h   under CONFIG_X86_32.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: brgerst@gmail.com
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-4-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 0b72282..615fa90 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
 						 int vector);
 extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
 							 int vector);
-extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
-							 int vector);
-extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
-							 int vector);
 
 /* Avoid include hell */
 #define NMI_VECTOR 0x02
@@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector)
 }
 
 #ifdef CONFIG_X86_32
+extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
+							 int vector);
+extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+							 int vector);
 extern void default_send_IPI_mask_logical(const struct cpumask *mask,
 						 int vector);
 extern void default_send_IPI_allbutself(int vector);
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 08385e0..5037736 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_X86_32
+
 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
 						 int vector)
 {
@@ -96,8 +98,6 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
 	local_irq_restore(flags);
 }
 
-#ifdef CONFIG_X86_32
-
 /*
  * This is only used on smaller machines.
  */
-- 
cgit v0.10.2


From 4c321ff8a01a95badf5d5403d80ca4e0ab07fce7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:30 +0100
Subject: x86: Replace cpu_2_logical_apicid[] with early percpu variable

Unlike x86_64, on x86_32, the mapping from cpu to logical apicid
may vary depending on apic in use.  cpu_2_logical_apicid[] array
is used for this mapping.  Replace it with early percpu variable
x86_cpu_to_logical_apicid to make it better aligned with other
mappings.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-5-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 5e3969c..eb139ec 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -595,8 +595,4 @@ extern int default_check_phys_apicid_present(int phys_apicid);
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
-#ifdef CONFIG_X86_32
-extern u8 cpu_2_logical_apicid[NR_CPUS];
-#endif
-
 #endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4c2f63c..dc7c46a 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -38,6 +38,9 @@ static inline struct cpumask *cpu_core_mask(int cpu)
 
 DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_32)
+DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
+#endif
 
 /* Static state in head.S used to set up a CPU */
 extern struct {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 06c196d..126d5a3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -78,6 +78,17 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 
 #ifdef CONFIG_X86_32
+
+#ifdef CONFIG_SMP
+/*
+ * On x86_32, the mapping between cpu and logical apicid may vary
+ * depending on apic in use.  The following early percpu variable is
+ * used for the mapping.  This is where the behaviors of x86_64 and 32
+ * actually diverge.  Let's keep it ugly for now.
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
+#endif
+
 /*
  * Knob to control our willingness to enable the local APIC.
  *
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8593582..7cb73e1 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -534,7 +534,7 @@ static int es7000_cpu_to_logical_apicid(int cpu)
 #ifdef CONFIG_SMP
 	if (cpu >= nr_cpu_ids)
 		return BAD_APICID;
-	return cpu_2_logical_apicid[cpu];
+	return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 #else
 	return logical_smp_processor_id();
 #endif
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 960f26a..4ed90c4 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -377,7 +377,7 @@ static inline int numaq_cpu_to_logical_apicid(int cpu)
 {
 	if (cpu >= nr_cpu_ids)
 		return BAD_APICID;
-	return cpu_2_logical_apicid[cpu];
+	return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 }
 
 /*
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9b41926..82cfc3e 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -206,7 +206,7 @@ static void summit_init_apic_ldr(void)
 
 	/* Create logical APIC IDs by counting CPUs already in cluster. */
 	for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
-		lid = cpu_2_logical_apicid[i];
+		lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
 		if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
 			++count;
 	}
@@ -247,7 +247,7 @@ static inline int summit_cpu_to_logical_apicid(int cpu)
 #ifdef CONFIG_SMP
 	if (cpu >= nr_cpu_ids)
 		return BAD_APICID;
-	return cpu_2_logical_apicid[cpu];
+	return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 #else
 	return logical_smp_processor_id();
 #endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 002b796..b5147f0 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -225,6 +225,10 @@ void __init setup_per_cpu_areas(void)
 		per_cpu(x86_bios_cpu_apicid, cpu) =
 			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
 #endif
+#ifdef CONFIG_X86_32
+		per_cpu(x86_cpu_to_logical_apicid, cpu) =
+			early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
+#endif
 #ifdef CONFIG_X86_64
 		per_cpu(irq_stack_ptr, cpu) =
 			per_cpu(irq_stack_union.irq_stack, cpu) +
@@ -256,6 +260,9 @@ void __init setup_per_cpu_areas(void)
 	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
 	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
 #endif
+#ifdef CONFIG_X86_32
+	early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
+#endif
 #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
 	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
 #endif
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 53a85ba..df934e4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -165,9 +165,6 @@ static void unmap_cpu_to_node(int cpu)
 #endif
 
 #ifdef CONFIG_X86_32
-u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
-					{ [0 ... NR_CPUS-1] = BAD_APICID };
-
 static void map_cpu_to_logical_apicid(void)
 {
 	int cpu = smp_processor_id();
@@ -177,13 +174,13 @@ static void map_cpu_to_logical_apicid(void)
 	if (!node_online(node))
 		node = first_online_node;
 
-	cpu_2_logical_apicid[cpu] = apicid;
+	early_per_cpu(x86_cpu_to_logical_apicid, cpu) = apicid;
 	map_cpu_to_node(cpu, node);
 }
 
 void numa_remove_cpu(int cpu)
 {
-	cpu_2_logical_apicid[cpu] = BAD_APICID;
+	early_per_cpu(x86_cpu_to_logical_apicid, cpu) = BAD_APICID;
 	unmap_cpu_to_node(cpu);
 }
 #else
-- 
cgit v0.10.2


From 6f802c4bfa2acf1bffa8341fe9084da0205d581d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:31 +0100
Subject: x86: Always use x86_cpu_to_logical_apicid for cpu -> logical apic id

Currently, cpu -> logical apic id translation is done by
apic->cpu_to_logical_apicid() callback which may or may not use
x86_cpu_to_logical_apicid.  This is unnecessary as it should
always equal logical_smp_processor_id() which is known early
during CPU bring up.

Initialize x86_cpu_to_logical_apicid after apic->init_apic_ldr()
in setup_local_APIC() and always use x86_cpu_to_logical_apicid
for cpu -> logical apic id mapping.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-6-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 126d5a3..ae08246 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1248,6 +1248,14 @@ void __cpuinit setup_local_APIC(void)
 	 */
 	apic->init_apic_ldr();
 
+#ifdef CONFIG_X86_32
+	/*
+	 * APIC LDR is initialized.  Fetch and store logical_apic_id.
+	 */
+	early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+		logical_smp_processor_id();
+#endif
+
 	/*
 	 * Set Task Priority to 'accept all'. We never change this
 	 * later on.
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 5037736..cce91bf 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -73,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask)
 		__default_send_IPI_dest_field(
-			apic->cpu_to_logical_apicid(query_cpu), vector,
-			apic->dest_logical);
+			early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+			vector, apic->dest_logical);
 	local_irq_restore(flags);
 }
 
@@ -92,8 +92,8 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
 		if (query_cpu == this_cpu)
 			continue;
 		__default_send_IPI_dest_field(
-			apic->cpu_to_logical_apicid(query_cpu), vector,
-			apic->dest_logical);
+			early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+			vector, apic->dest_logical);
 		}
 	local_irq_restore(flags);
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index df934e4..ca20f6b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -168,19 +168,18 @@ static void unmap_cpu_to_node(int cpu)
 static void map_cpu_to_logical_apicid(void)
 {
 	int cpu = smp_processor_id();
-	int apicid = logical_smp_processor_id();
-	int node = apic->apicid_to_node(apicid);
+	int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+	int node;
 
+	node = apic->apicid_to_node(logical_apicid);
 	if (!node_online(node))
 		node = first_online_node;
 
-	early_per_cpu(x86_cpu_to_logical_apicid, cpu) = apicid;
 	map_cpu_to_node(cpu, node);
 }
 
 void numa_remove_cpu(int cpu)
 {
-	early_per_cpu(x86_cpu_to_logical_apicid, cpu) = BAD_APICID;
 	unmap_cpu_to_node(cpu);
 }
 #else
-- 
cgit v0.10.2


From 7632611f534340182c832d2b139cb19676f24e1a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:32 +0100
Subject: x86: Kill apic->cpu_to_logical_apicid()

After the previous patch, apic->cpu_to_logical_apicid() is no
longer used.  Kill it.

For apic types with custom cpu_to_logical_apicid() which is also
used for other purposes, remove the function and modify its
users to do the mapping directly.

#ifdef's on CONFIG_SMP in es7000_32 and summit_32 are ignored
during conversion as they are not used for UP kernels.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-7-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index eb139ec..d1aa0c3 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -307,7 +307,6 @@ struct apic {
 	void (*setup_apic_routing)(void);
 	int (*multi_timer_check)(int apic, int irq);
 	int (*apicid_to_node)(int logical_apicid);
-	int (*cpu_to_logical_apicid)(int cpu);
 	int (*cpu_present_to_apicid)(int mps_cpu);
 	void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
 	void (*setup_portio_remap)(void);
@@ -557,12 +556,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma
 	*retmap = *phys_map;
 }
 
-/* Mapping from cpu number to logical apicid */
-static inline int default_cpu_to_logical_apicid(int cpu)
-{
-	return 1 << cpu;
-}
-
 static inline int __default_cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17..5a9d11a 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -186,7 +186,6 @@ struct apic apic_flat =  {
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= NULL,
-	.cpu_to_logical_apicid		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
@@ -338,7 +337,6 @@ struct apic apic_physflat =  {
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= NULL,
-	.cpu_to_logical_apicid		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ff..f3d19b2 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void)
 	return 0;
 }
 
-static int noop_cpu_to_logical_apicid(int cpu)
-{
-	return 0;
-}
-
 static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
 {
 	return 0;
@@ -155,7 +150,6 @@ struct apic apic_noop = {
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= noop_apicid_to_node,
 
-	.cpu_to_logical_apicid		= noop_cpu_to_logical_apicid,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= physid_set_mask_of_physid,
 
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cb804c5..4c62592 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -93,14 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
 	return BAD_APICID;
 }
 
-/* Mapping from cpu number to logical apicid */
-static inline int bigsmp_cpu_to_logical_apicid(int cpu)
-{
-	if (cpu >= nr_cpu_ids)
-		return BAD_APICID;
-	return cpu_physical_id(cpu);
-}
-
 static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 {
 	/* For clustered we don't have a good way to do this yet - hack */
@@ -115,7 +107,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
 /* As we are using single CPU as destination, pick only one CPU here */
 static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-	return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask));
+	int cpu = cpumask_first(cpumask);
+
+	if (cpu < nr_cpu_ids)
+		return cpu_physical_id(cpu);
+	return BAD_APICID;
 }
 
 static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
@@ -129,9 +125,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 	 */
 	for_each_cpu_and(cpu, cpumask, andmask) {
 		if (cpumask_test_cpu(cpu, cpu_online_mask))
-			break;
+			return cpu_physical_id(cpu);
 	}
-	return bigsmp_cpu_to_logical_apicid(cpu);
+	return BAD_APICID;
 }
 
 static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -220,7 +216,6 @@ struct apic apic_bigsmp = {
 	.setup_apic_routing		= bigsmp_setup_apic_routing,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= bigsmp_apicid_to_node,
-	.cpu_to_logical_apicid		= bigsmp_cpu_to_logical_apicid,
 	.cpu_present_to_apicid		= bigsmp_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= physid_set_mask_of_physid,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 7cb73e1..6840681 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -528,18 +528,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
 	++cpu_id;
 }
 
-/* Mapping from cpu number to logical apicid */
-static int es7000_cpu_to_logical_apicid(int cpu)
-{
-#ifdef CONFIG_SMP
-	if (cpu >= nr_cpu_ids)
-		return BAD_APICID;
-	return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-#else
-	return logical_smp_processor_id();
-#endif
-}
-
 static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 {
 	/* For clustered we don't have a good way to do this yet - hack */
@@ -561,7 +549,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
 	 * The cpus in the mask must all be on the apic cluster.
 	 */
 	for_each_cpu(cpu, cpumask) {
-		int new_apicid = es7000_cpu_to_logical_apicid(cpu);
+		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 
 		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
 			WARN(1, "Not a valid mask!");
@@ -578,7 +566,7 @@ static unsigned int
 es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
 			      const struct cpumask *andmask)
 {
-	int apicid = es7000_cpu_to_logical_apicid(0);
+	int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
 	cpumask_var_t cpumask;
 
 	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -656,7 +644,6 @@ struct apic __refdata apic_es7000_cluster = {
 	.setup_apic_routing		= es7000_setup_apic_routing,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= es7000_apicid_to_node,
-	.cpu_to_logical_apicid		= es7000_cpu_to_logical_apicid,
 	.cpu_present_to_apicid		= es7000_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= es7000_apicid_to_cpu_present,
 	.setup_portio_remap		= NULL,
@@ -721,7 +708,6 @@ struct apic __refdata apic_es7000 = {
 	.setup_apic_routing		= es7000_setup_apic_routing,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= es7000_apicid_to_node,
-	.cpu_to_logical_apicid		= es7000_cpu_to_logical_apicid,
 	.cpu_present_to_apicid		= es7000_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= es7000_apicid_to_cpu_present,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 4ed90c4..2b434d5 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -373,13 +373,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask
 	return physids_promote(0xFUL, retmap);
 }
 
-static inline int numaq_cpu_to_logical_apicid(int cpu)
-{
-	if (cpu >= nr_cpu_ids)
-		return BAD_APICID;
-	return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-}
-
 /*
  * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
  * cpu to APIC ID relation to properly interact with the intelligent
@@ -509,7 +502,6 @@ struct apic __refdata apic_numaq = {
 	.setup_apic_routing		= numaq_setup_apic_routing,
 	.multi_timer_check		= numaq_multi_timer_check,
 	.apicid_to_node			= numaq_apicid_to_node,
-	.cpu_to_logical_apicid		= numaq_cpu_to_logical_apicid,
 	.cpu_present_to_apicid		= numaq_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= numaq_apicid_to_cpu_present,
 	.setup_portio_remap		= numaq_setup_portio_remap,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 99d2fe0..24a6828 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -131,7 +131,6 @@ struct apic apic_default = {
 	.setup_apic_routing		= setup_apic_flat_routing,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= default_apicid_to_node,
-	.cpu_to_logical_apicid		= default_cpu_to_logical_apicid,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= physid_set_mask_of_physid,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 82cfc3e..1ef4c14 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -241,18 +241,6 @@ static int summit_apicid_to_node(int logical_apicid)
 #endif
 }
 
-/* Mapping from cpu number to logical apicid */
-static inline int summit_cpu_to_logical_apicid(int cpu)
-{
-#ifdef CONFIG_SMP
-	if (cpu >= nr_cpu_ids)
-		return BAD_APICID;
-	return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-#else
-	return logical_smp_processor_id();
-#endif
-}
-
 static int summit_cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < nr_cpu_ids)
@@ -286,7 +274,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
 	 * The cpus in the mask must all be on the apic cluster.
 	 */
 	for_each_cpu(cpu, cpumask) {
-		int new_apicid = summit_cpu_to_logical_apicid(cpu);
+		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 
 		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
 			printk("%s: Not a valid mask!\n", __func__);
@@ -301,7 +289,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
 static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
 			      const struct cpumask *andmask)
 {
-	int apicid = summit_cpu_to_logical_apicid(0);
+	int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
 	cpumask_var_t cpumask;
 
 	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -529,7 +517,6 @@ struct apic apic_summit = {
 	.setup_apic_routing		= summit_setup_apic_routing,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= summit_apicid_to_node,
-	.cpu_to_logical_apicid		= summit_cpu_to_logical_apicid,
 	.cpu_present_to_apicid		= summit_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= summit_apicid_to_cpu_present,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cf69c59..badc1fd 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -207,7 +207,6 @@ struct apic apic_x2apic_cluster = {
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= NULL,
-	.cpu_to_logical_apicid		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8972f38..f28bf4c 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -196,7 +196,6 @@ struct apic apic_x2apic_phys = {
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= NULL,
-	.cpu_to_logical_apicid		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index bd16b58..6027620 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -339,7 +339,6 @@ struct apic __refdata apic_x2apic_uv_x = {
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
 	.apicid_to_node			= NULL,
-	.cpu_to_logical_apicid		= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
-- 
cgit v0.10.2


From acb8bc09c6185e4d3d582d0076aaa6a89f19d8c5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:33 +0100
Subject: x86: Add apic->x86_32_early_logical_apicid()

On x86_32, the mapping between cpu and logical apic ID differs
depending on the specific apic implementation in use.  The
mapping is initialized while bringing up CPUs; however, this
makes early inits ignore memory topology.

Add a x86_32 specific apic->x86_32_early_logical_apicid() which
is called early during boot to query the mapping.  The mapping
is later verified against the result of init_apic_ldr().  The
method is allowed to return BAD_APICID if it can't be determined
early.

noop variant which always returns BAD_APICID is implemented and
added to all x86_32 apic implementations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-8-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index d1aa0c3..efb073b 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -354,6 +354,20 @@ struct apic {
 	void (*icr_write)(u32 low, u32 high);
 	void (*wait_icr_idle)(void);
 	u32 (*safe_wait_icr_idle)(void);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Called very early during boot from get_smp_config().  It should
+	 * return the logical apicid.  x86_[bios]_cpu_to_apicid is
+	 * initialized before this function is called.
+	 *
+	 * If logical apicid can't be determined that early, the function
+	 * may return BAD_APICID.  Logical apicid will be configured after
+	 * init_apic_ldr() while bringing up CPUs.  Note that NUMA affinity
+	 * won't be applied properly during early boot in this case.
+	 */
+	int (*x86_32_early_logical_apicid)(int cpu);
+#endif
 };
 
 /*
@@ -501,6 +515,11 @@ extern struct apic apic_noop;
 
 extern struct apic apic_default;
 
+static inline int noop_x86_32_early_logical_apicid(int cpu)
+{
+	return BAD_APICID;
+}
+
 /*
  * Set up the logical destination ID.
  *
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ae08246..3127079 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1250,8 +1250,13 @@ void __cpuinit setup_local_APIC(void)
 
 #ifdef CONFIG_X86_32
 	/*
-	 * APIC LDR is initialized.  Fetch and store logical_apic_id.
+	 * APIC LDR is initialized.  If logical_apicid mapping was
+	 * initialized during get_smp_config(), make sure it matches the
+	 * actual value.
 	 */
+	i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+	WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
+	/* always use the value from LDR */
 	early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
 		logical_smp_processor_id();
 #endif
@@ -1991,7 +1996,10 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
 	early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
 #endif
-
+#ifdef CONFIG_X86_32
+	early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+		apic->x86_32_early_logical_apicid(cpu);
+#endif
 	set_cpu_possible(cpu, true);
 	set_cpu_present(cpu, true);
 }
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index f3d19b2..0309c58 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -191,4 +191,8 @@ struct apic apic_noop = {
 	.icr_write			= noop_apic_icr_write,
 	.wait_icr_idle			= noop_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= noop_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
+#endif
 };
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 4c62592..dd32a9b 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -251,4 +251,6 @@ struct apic apic_bigsmp = {
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
+
+	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
 };
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 6840681..0ffc1ec 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -682,6 +682,8 @@ struct apic __refdata apic_es7000_cluster = {
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
+
+	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
 };
 
 struct apic __refdata apic_es7000 = {
@@ -744,4 +746,6 @@ struct apic __refdata apic_es7000 = {
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
+
+	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
 };
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 2b434d5..f1a8b12 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -539,4 +539,6 @@ struct apic __refdata apic_numaq = {
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
+
+	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
 };
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 24a6828..40be7c3 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -166,6 +166,8 @@ struct apic apic_default = {
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
+
+	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
 };
 
 extern struct apic apic_numaq;
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 1ef4c14..172c498 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -552,4 +552,6 @@ struct apic apic_summit = {
 	.icr_write			= native_apic_icr_write,
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
+
+	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
 };
-- 
cgit v0.10.2


From 3f6f6798889d50ec7ca8eef1d100cda37dc658ea Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:34 +0100
Subject: x86: Implement the default x86_32_early_logical_apicid()

Implement x86_32_early_logical_apicid() for the default apic
flat routing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-9-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 40be7c3..0f9a9ab 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -77,6 +77,11 @@ void __init default_setup_apic_routing(void)
 		apic->setup_apic_routing();
 }
 
+static int default_x86_32_early_logical_apicid(int cpu)
+{
+	return 1 << cpu;
+}
+
 static void setup_apic_flat_routing(void)
 {
 #ifdef CONFIG_X86_IO_APIC
@@ -167,7 +172,7 @@ struct apic apic_default = {
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
-	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
+	.x86_32_early_logical_apicid	= default_x86_32_early_logical_apicid,
 };
 
 extern struct apic apic_numaq;
-- 
cgit v0.10.2


From 12bf24a47c1a095233cc8a8b863b509a0d8e0f2c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:35 +0100
Subject: x86: Implement x86_32_early_logical_apicid() for bigsmp_32

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-10-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index dd32a9b..bc7ed04 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit)
 	return 1;
 }
 
+static int bigsmp_early_logical_apicid(int cpu)
+{
+	/* on bigsmp, logical apicid is the same as physical */
+	return early_per_cpu(x86_cpu_to_apicid, cpu);
+}
+
 static inline unsigned long calculate_ldr(int cpu)
 {
 	unsigned long val, id;
@@ -252,5 +258,5 @@ struct apic apic_bigsmp = {
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
-	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
+	.x86_32_early_logical_apicid	= bigsmp_early_logical_apicid,
 };
-- 
cgit v0.10.2


From 3b39d937843e071c59b3aeecbf7de4750f095b12 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:36 +0100
Subject: x86: Implement x86_32_early_logical_apicid() for summit_32

Factor out logical apic id calculation from
summit_init_apic_ldr() and use it for the
x86_32_early_logical_apicid() callback.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-11-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 172c498..8c91473 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit)
 	return 1;
 }
 
-static void summit_init_apic_ldr(void)
+static int summit_early_logical_apicid(int cpu)
 {
-	unsigned long val, id;
 	int count = 0;
-	u8 my_id = (u8)hard_smp_processor_id();
+	u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
 	u8 my_cluster = APIC_CLUSTER(my_id);
 #ifdef CONFIG_SMP
 	u8 lid;
@@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void)
 	/* We only have a 4 wide bitmap in cluster mode.  If a deranged
 	 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
 	BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
-	id = my_cluster | (1UL << count);
+	return my_cluster | (1UL << count);
+}
+
+static void summit_init_apic_ldr(void)
+{
+	int cpu = smp_processor_id();
+	unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+	unsigned long val;
+
 	apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
 	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
 	val |= SET_APIC_LOGICAL_ID(id);
@@ -553,5 +560,5 @@ struct apic apic_summit = {
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
-	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
+	.x86_32_early_logical_apicid	= summit_early_logical_apicid,
 };
-- 
cgit v0.10.2


From df04cf011b0657ddc782b48d455f7e232b9be41c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:37 +0100
Subject: x86: Implement x86_32_early_logical_apicid() for numaq_32

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-12-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 0ffc1ec..5c53d05 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit)
 	return physid_isset(bit, phys_cpu_present_map);
 }
 
+static int es7000_early_logical_apicid(int cpu)
+{
+	/* on es7000, logical apicid is the same as physical */
+	return early_per_cpu(x86_bios_cpu_apicid, cpu);
+}
+
 static unsigned long calculate_ldr(int cpu)
 {
 	unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
@@ -683,7 +689,7 @@ struct apic __refdata apic_es7000_cluster = {
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
-	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
+	.x86_32_early_logical_apicid	= es7000_early_logical_apicid,
 };
 
 struct apic __refdata apic_es7000 = {
@@ -747,5 +753,5 @@ struct apic __refdata apic_es7000 = {
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
-	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
+	.x86_32_early_logical_apicid	= es7000_early_logical_apicid,
 };
-- 
cgit v0.10.2


From 89e5dc218e084e13a3996db6693b01478912f4ee Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:38 +0100
Subject: x86: Replace apic->apicid_to_node() with ->x86_32_numa_cpu_node()

apic->apicid_to_node() is 32bit specific apic operation which
determines NUMA node for a CPU.  Depending on the APIC
implementation, it can be easier to determine NUMA node from
either physical or logical apicid.  Currently,
->apicid_to_node() takes @logical_apicid and calls
hard_smp_processor_id() if the physical apicid is needed.

This prevents NUMA mapping from being queried from a different
CPU, which in turn makes it impossible to initialize NUMA
mapping before SMP bringup.

This patch replaces apic->apicid_to_node() with
->x86_32_numa_cpu_node() which takes @cpu, from which both
logical and physical apicids can easily be determined.  While at
it, drop duplicate implementations from bigsmp_32 and summit_32,
and use the default one.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-13-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efb073b..ad30ca4 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -306,7 +306,6 @@ struct apic {
 
 	void (*setup_apic_routing)(void);
 	int (*multi_timer_check)(int apic, int irq);
-	int (*apicid_to_node)(int logical_apicid);
 	int (*cpu_present_to_apicid)(int mps_cpu);
 	void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
 	void (*setup_portio_remap)(void);
@@ -367,6 +366,9 @@ struct apic {
 	 * won't be applied properly during early boot in this case.
 	 */
 	int (*x86_32_early_logical_apicid)(int cpu);
+
+	/* determine CPU -> NUMA node mapping */
+	int (*x86_32_numa_cpu_node)(int cpu);
 #endif
 };
 
@@ -539,7 +541,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
 	return cpuid_apic >> index_msb;
 }
 
-extern int default_apicid_to_node(int logical_apicid);
+extern int default_x86_32_numa_cpu_node(int cpu);
 
 #endif
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 3127079..0f4f3c1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2020,10 +2020,14 @@ void default_init_apic_ldr(void)
 }
 
 #ifdef CONFIG_X86_32
-int default_apicid_to_node(int logical_apicid)
+int default_x86_32_numa_cpu_node(int cpu)
 {
-#ifdef CONFIG_SMP
-	return apicid_2_node[hard_smp_processor_id()];
+#ifdef CONFIG_NUMA
+	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+	if (apicid != BAD_APICID)
+		return apicid_2_node[apicid];
+	return NUMA_NO_NODE;
 #else
 	return 0;
 #endif
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 5a9d11a..5652d31 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -185,7 +185,6 @@ struct apic apic_flat =  {
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
@@ -336,7 +335,6 @@ struct apic apic_physflat =  {
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 0309c58..f1baa2d 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -108,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
 	cpumask_set_cpu(cpu, retmask);
 }
 
-int noop_apicid_to_node(int logical_apicid)
-{
-	/* we're always on node 0 */
-	return 0;
-}
-
 static u32 noop_apic_read(u32 reg)
 {
 	WARN_ON_ONCE((cpu_has_apic && !disable_apic));
@@ -125,6 +119,14 @@ static void noop_apic_write(u32 reg, u32 v)
 	WARN_ON_ONCE(cpu_has_apic && !disable_apic);
 }
 
+#ifdef CONFIG_X86_32
+static int noop_x86_32_numa_cpu_node(int cpu)
+{
+	/* we're always on node 0 */
+	return 0;
+}
+#endif
+
 struct apic apic_noop = {
 	.name				= "noop",
 	.probe				= noop_probe,
@@ -148,7 +150,6 @@ struct apic apic_noop = {
 	.ioapic_phys_id_map		= default_ioapic_phys_id_map,
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= noop_apicid_to_node,
 
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= physid_set_mask_of_physid,
@@ -194,5 +195,6 @@ struct apic apic_noop = {
 
 #ifdef CONFIG_X86_32
 	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
+	.x86_32_numa_cpu_node		= noop_x86_32_numa_cpu_node,
 #endif
 };
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index bc7ed04..541a2e4 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -86,11 +86,6 @@ static void bigsmp_setup_apic_routing(void)
 		nr_ioapics);
 }
 
-static int bigsmp_apicid_to_node(int logical_apicid)
-{
-	return apicid_2_node[hard_smp_processor_id()];
-}
-
 static int bigsmp_cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < nr_cpu_ids)
@@ -221,7 +216,6 @@ struct apic apic_bigsmp = {
 	.ioapic_phys_id_map		= bigsmp_ioapic_phys_id_map,
 	.setup_apic_routing		= bigsmp_setup_apic_routing,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= bigsmp_apicid_to_node,
 	.cpu_present_to_apicid		= bigsmp_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= physid_set_mask_of_physid,
 	.setup_portio_remap		= NULL,
@@ -259,4 +253,5 @@ struct apic apic_bigsmp = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= bigsmp_early_logical_apicid,
+	.x86_32_numa_cpu_node		= default_x86_32_numa_cpu_node,
 };
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 5c53d05..3e9de48 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -510,12 +510,11 @@ static void es7000_setup_apic_routing(void)
 		nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
 }
 
-static int es7000_apicid_to_node(int logical_apicid)
+static int es7000_numa_cpu_node(int cpu)
 {
 	return 0;
 }
 
-
 static int es7000_cpu_present_to_apicid(int mps_cpu)
 {
 	if (!mps_cpu)
@@ -649,7 +648,6 @@ struct apic __refdata apic_es7000_cluster = {
 	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map,
 	.setup_apic_routing		= es7000_setup_apic_routing,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= es7000_apicid_to_node,
 	.cpu_present_to_apicid		= es7000_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= es7000_apicid_to_cpu_present,
 	.setup_portio_remap		= NULL,
@@ -690,6 +688,7 @@ struct apic __refdata apic_es7000_cluster = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= es7000_early_logical_apicid,
+	.x86_32_numa_cpu_node		= es7000_numa_cpu_node,
 };
 
 struct apic __refdata apic_es7000 = {
@@ -715,7 +714,6 @@ struct apic __refdata apic_es7000 = {
 	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map,
 	.setup_apic_routing		= es7000_setup_apic_routing,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= es7000_apicid_to_node,
 	.cpu_present_to_apicid		= es7000_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= es7000_apicid_to_cpu_present,
 	.setup_portio_remap		= NULL,
@@ -754,4 +752,5 @@ struct apic __refdata apic_es7000 = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= es7000_early_logical_apicid,
+	.x86_32_numa_cpu_node		= es7000_numa_cpu_node,
 };
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index f1a8b12..6273eee 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -391,6 +391,15 @@ static inline int numaq_apicid_to_node(int logical_apicid)
 	return logical_apicid >> 4;
 }
 
+static int numaq_numa_cpu_node(int cpu)
+{
+	int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+	if (logical_apicid != BAD_APICID)
+		return numaq_apicid_to_node(logical_apicid);
+	return NUMA_NO_NODE;
+}
+
 static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
 {
 	int node = numaq_apicid_to_node(logical_apicid);
@@ -501,7 +510,6 @@ struct apic __refdata apic_numaq = {
 	.ioapic_phys_id_map		= numaq_ioapic_phys_id_map,
 	.setup_apic_routing		= numaq_setup_apic_routing,
 	.multi_timer_check		= numaq_multi_timer_check,
-	.apicid_to_node			= numaq_apicid_to_node,
 	.cpu_present_to_apicid		= numaq_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= numaq_apicid_to_cpu_present,
 	.setup_portio_remap		= numaq_setup_portio_remap,
@@ -541,4 +549,5 @@ struct apic __refdata apic_numaq = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
+	.x86_32_numa_cpu_node		= numaq_numa_cpu_node,
 };
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0f9a9ab..fc84c7b 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -135,7 +135,6 @@ struct apic apic_default = {
 	.ioapic_phys_id_map		= default_ioapic_phys_id_map,
 	.setup_apic_routing		= setup_apic_flat_routing,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= default_apicid_to_node,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= physid_set_mask_of_physid,
 	.setup_portio_remap		= NULL,
@@ -173,6 +172,7 @@ struct apic apic_default = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= default_x86_32_early_logical_apicid,
+	.x86_32_numa_cpu_node		= default_x86_32_numa_cpu_node,
 };
 
 extern struct apic apic_numaq;
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 8c91473..e4b8059 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -239,15 +239,6 @@ static void summit_setup_apic_routing(void)
 						nr_ioapics);
 }
 
-static int summit_apicid_to_node(int logical_apicid)
-{
-#ifdef CONFIG_SMP
-	return apicid_2_node[hard_smp_processor_id()];
-#else
-	return 0;
-#endif
-}
-
 static int summit_cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < nr_cpu_ids)
@@ -523,7 +514,6 @@ struct apic apic_summit = {
 	.ioapic_phys_id_map		= summit_ioapic_phys_id_map,
 	.setup_apic_routing		= summit_setup_apic_routing,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= summit_apicid_to_node,
 	.cpu_present_to_apicid		= summit_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= summit_apicid_to_cpu_present,
 	.setup_portio_remap		= NULL,
@@ -561,4 +551,5 @@ struct apic apic_summit = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= summit_early_logical_apicid,
+	.x86_32_numa_cpu_node		= default_x86_32_numa_cpu_node,
 };
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index badc1fd..90949bb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -206,7 +206,6 @@ struct apic apic_x2apic_cluster = {
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index f28bf4c..c7e6d66 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -195,7 +195,6 @@ struct apic apic_x2apic_phys = {
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 6027620..3c28928 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -338,7 +338,6 @@ struct apic __refdata apic_x2apic_uv_x = {
 	.ioapic_phys_id_map		= NULL,
 	.setup_apic_routing		= NULL,
 	.multi_timer_check		= NULL,
-	.apicid_to_node			= NULL,
 	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
 	.apicid_to_cpu_present		= NULL,
 	.setup_portio_remap		= NULL,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ca20f6b..5319cdd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -168,10 +168,9 @@ static void unmap_cpu_to_node(int cpu)
 static void map_cpu_to_logical_apicid(void)
 {
 	int cpu = smp_processor_id();
-	int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 	int node;
 
-	node = apic->apicid_to_node(logical_apicid);
+	node = apic->x86_32_numa_cpu_node(cpu);
 	if (!node_online(node))
 		node = first_online_node;
 
-- 
cgit v0.10.2


From bbc9e2f452d9c4b166d1f9a78d941d80173312fe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:39 +0100
Subject: x86: Unify cpu/apicid <-> NUMA node mapping between 32 and 64bit

The mapping between cpu/apicid and node is done via
apicid_to_node[] on 64bit and apicid_2_node[] +
apic->x86_32_numa_cpu_node() on 32bit. This difference makes it
difficult to further unify 32 and 64bit NUMA handling.

This patch unifies it by replacing both apicid_to_node[] and
apicid_2_node[] with __apicid_to_node[] array, which is accessed
by two accessors - set_apicid_to_node() and numa_cpu_node().  On
64bit, numa_cpu_node() always consults __apicid_to_node[]
directly while 32bit goes through apic->numa_cpu_node() method
to allow apic implementations to override it.

srat_detect_node() for amd cpus contains workaround for broken
NUMA configuration which assumes relationship between APIC ID,
HT node ID and NUMA topology.  Leave it to access
__apicid_to_node[] directly as mapping through CPU might result
in undesirable behavior change.  The comment is reformatted and
updated to note the ugliness.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-14-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: David Rientjes <rientjes@google.com>

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index edc2a45..9c7d95f 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -25,7 +25,6 @@ extern int pic_mode;
 #define MAX_IRQ_SOURCES		256
 
 extern unsigned int def_to_bigsmp;
-extern u8 apicid_2_node[];
 
 #ifdef CONFIG_X86_NUMAQ
 extern int mp_bus_id_to_node[MAX_MP_BUSSES];
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 27da400..5e01c76 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,5 +1,33 @@
+#ifndef _ASM_X86_NUMA_H
+#define _ASM_X86_NUMA_H
+
+#include <asm/apicdef.h>
+
+#ifdef CONFIG_NUMA
+/*
+ * __apicid_to_node[] stores the raw mapping between physical apicid and
+ * node and is used to initialize cpu_to_node mapping.
+ *
+ * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
+ * should be accessed by the accessors - set_apicid_to_node() and
+ * numa_cpu_node().
+ */
+extern s16 __apicid_to_node[MAX_LOCAL_APIC];
+
+static inline void set_apicid_to_node(int apicid, s16 node)
+{
+	__apicid_to_node[apicid] = node;
+}
+#else	/* CONFIG_NUMA */
+static inline void set_apicid_to_node(int apicid, s16 node)
+{
+}
+#endif	/* CONFIG_NUMA */
+
 #ifdef CONFIG_X86_32
 # include "numa_32.h"
 #else
 # include "numa_64.h"
 #endif
+
+#endif	/* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index b0ef2b4..cdf8043 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -6,6 +6,12 @@ extern int numa_off;
 extern int pxm_to_nid(int pxm);
 extern void numa_remove_cpu(int cpu);
 
+#ifdef CONFIG_NUMA
+extern int __cpuinit numa_cpu_node(int apicid);
+#else	/* CONFIG_NUMA */
+static inline int numa_cpu_node(int cpu)		{ return NUMA_NO_NODE; }
+#endif	/* CONFIG_NUMA */
+
 #ifdef CONFIG_HIGHMEM
 extern void set_highmem_pages_init(void);
 #else
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 0493be3..4982a9c 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -2,7 +2,6 @@
 #define _ASM_X86_NUMA_64_H
 
 #include <linux/nodemask.h>
-#include <asm/apicdef.h>
 
 struct bootnode {
 	u64 start;
@@ -17,8 +16,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
 extern void numa_init_array(void);
 extern int numa_off;
 
-extern s16 apicid_to_node[MAX_LOCAL_APIC];
-
 extern unsigned long numa_free_all_bootmem(void);
 extern void setup_node_bootmem(int nodeid, unsigned long start,
 			       unsigned long end);
@@ -32,6 +29,7 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 #define NODE_MIN_SIZE (4*1024*1024)
 
 extern void __init init_cpu_to_node(void);
+extern int __cpuinit numa_cpu_node(int cpu);
 extern void __cpuinit numa_set_node(int cpu, int node);
 extern void __cpuinit numa_clear_node(int cpu);
 extern void __cpuinit numa_add_cpu(int cpu);
@@ -44,6 +42,7 @@ void numa_emu_cmdline(char *);
 #endif /* CONFIG_NUMA_EMU */
 #else
 static inline void init_cpu_to_node(void)		{ }
+static inline int numa_cpu_node(int cpu)		{ return NUMA_NO_NODE; }
 static inline void numa_set_node(int cpu, int node)	{ }
 static inline void numa_clear_node(int cpu)		{ }
 static inline void numa_add_cpu(int cpu, int node)	{ }
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b3a7113..a7bca59 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -589,11 +589,10 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 	nid = acpi_get_node(handle);
 	if (nid == -1 || !node_online(nid))
 		return;
+	set_apicid_to_node(physid, nid);
 #ifdef CONFIG_X86_64
-	apicid_to_node[physid] = nid;
 	numa_set_node(cpu, nid);
 #else /* CONFIG_X86_32 */
-	apicid_2_node[physid] = nid;
 	cpu_to_node_map[cpu] = nid;
 #endif
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0f4f3c1..4686ea5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2026,7 +2026,7 @@ int default_x86_32_numa_cpu_node(int cpu)
 	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
 
 	if (apicid != BAD_APICID)
-		return apicid_2_node[apicid];
+		return __apicid_to_node[apicid];
 	return NUMA_NO_NODE;
 #else
 	return 0;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7c7bedb..3cce8f2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -234,17 +234,21 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 #endif
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+/*
+ * To workaround broken NUMA config.  Read the comment in
+ * srat_detect_node().
+ */
 static int __cpuinit nearby_node(int apicid)
 {
 	int i, node;
 
 	for (i = apicid - 1; i >= 0; i--) {
-		node = apicid_to_node[i];
+		node = __apicid_to_node[i];
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
 	}
 	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
-		node = apicid_to_node[i];
+		node = __apicid_to_node[i];
 		if (node != NUMA_NO_NODE && node_online(node))
 			return node;
 	}
@@ -339,26 +343,35 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 	int node;
 	unsigned apicid = c->apicid;
 
-	node = per_cpu(cpu_llc_id, cpu);
+	node = numa_cpu_node(cpu);
+	if (node == NUMA_NO_NODE)
+		node = per_cpu(cpu_llc_id, cpu);
 
-	if (apicid_to_node[apicid] != NUMA_NO_NODE)
-		node = apicid_to_node[apicid];
 	if (!node_online(node)) {
-		/* Two possibilities here:
-		   - The CPU is missing memory and no node was created.
-		   In that case try picking one from a nearby CPU
-		   - The APIC IDs differ from the HyperTransport node IDs
-		   which the K8 northbridge parsing fills in.
-		   Assume they are all increased by a constant offset,
-		   but in the same order as the HT nodeids.
-		   If that doesn't result in a usable node fall back to the
-		   path for the previous case.  */
-
+		/*
+		 * Two possibilities here:
+		 *
+		 * - The CPU is missing memory and no node was created.  In
+		 *   that case try picking one from a nearby CPU.
+		 *
+		 * - The APIC IDs differ from the HyperTransport node IDs
+		 *   which the K8 northbridge parsing fills in.  Assume
+		 *   they are all increased by a constant offset, but in
+		 *   the same order as the HT nodeids.  If that doesn't
+		 *   result in a usable node fall back to the path for the
+		 *   previous case.
+		 *
+		 * This workaround operates directly on the mapping between
+		 * APIC ID and NUMA node, assuming certain relationship
+		 * between APIC ID, HT node ID and NUMA topology.  As going
+		 * through CPU mapping may alter the outcome, directly
+		 * access __apicid_to_node[].
+		 */
 		int ht_nodeid = c->initial_apicid;
 
 		if (ht_nodeid >= 0 &&
-		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-			node = apicid_to_node[ht_nodeid];
+		    __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+			node = __apicid_to_node[ht_nodeid];
 		/* Pick a nearby node */
 		if (!node_online(node))
 			node = nearby_node(apicid);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index d16c2c5..6052004 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -279,11 +279,10 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
 	unsigned node;
 	int cpu = smp_processor_id();
-	int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
 
 	/* Don't do the funky fallback heuristics the AMD version employs
 	   for now. */
-	node = apicid_to_node[apicid];
+	node = numa_cpu_node(cpu);
 	if (node == NUMA_NO_NODE || !node_online(node)) {
 		/* reuse the value from init_cpu_to_node() */
 		node = cpu_to_node(cpu);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5319cdd..b7cfce5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -71,10 +71,6 @@
 #include <asm/smpboot_hooks.h>
 #include <asm/i8259.h>
 
-#ifdef CONFIG_X86_32
-u8 apicid_2_node[MAX_LOCAL_APIC];
-#endif
-
 /* State of each CPU */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
@@ -170,7 +166,7 @@ static void map_cpu_to_logical_apicid(void)
 	int cpu = smp_processor_id();
 	int node;
 
-	node = apic->x86_32_numa_cpu_node(cpu);
+	node = numa_cpu_node(cpu);
 	if (!node_online(node))
 		node = first_online_node;
 
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f21962c..c7fae38 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -247,7 +247,7 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
 		__acpi_map_pxm_to_node(nid, i);
 #endif
 	}
-	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+	memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node));
 }
 #endif /* CONFIG_NUMA_EMU */
 
@@ -285,7 +285,7 @@ int __init amd_scan_nodes(void)
 				nodes[i].start >> PAGE_SHIFT,
 				nodes[i].end >> PAGE_SHIFT);
 		for (j = apicid_base; j < cores + apicid_base; j++)
-			apicid_to_node[(i << bits) + j] = i;
+			set_apicid_to_node((i << bits) + j, i);
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ebf6d78..480b357 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,8 +26,12 @@ static __init int numa_setup(char *opt)
 early_param("numa", numa_setup);
 
 /*
- * Which logical CPUs are on which nodes
+ * apicid, cpu, node mappings
  */
+s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 EXPORT_SYMBOL(node_to_cpumask_map);
 
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 84a3e4c..8d91d22 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
 static unsigned long kva_start_pfn;
 static unsigned long kva_pages;
+
+int __cpuinit numa_cpu_node(int cpu)
+{
+	return apic->x86_32_numa_cpu_node(cpu);
+}
+
 /*
  * FLAT - support for basic PC memory model with discontig enabled, essentially
  *        a single node with all available processors in it with a flat
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 95ea155..1e1026f 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -26,10 +26,6 @@ EXPORT_SYMBOL(node_data);
 
 struct memnode memnode;
 
-s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
-	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
@@ -716,12 +712,8 @@ void __init init_cpu_to_node(void)
 	BUG_ON(cpu_to_apicid == NULL);
 
 	for_each_possible_cpu(cpu) {
-		int node;
-		u16 apicid = cpu_to_apicid[cpu];
+		int node = numa_cpu_node(cpu);
 
-		if (apicid == BAD_APICID)
-			continue;
-		node = apicid_to_node[apicid];
 		if (node == NUMA_NO_NODE)
 			continue;
 		if (!node_online(node))
@@ -731,6 +723,14 @@ void __init init_cpu_to_node(void)
 }
 #endif
 
+int __cpuinit numa_cpu_node(int cpu)
+{
+	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+	if (apicid != BAD_APICID)
+		return __apicid_to_node[apicid];
+	return NUMA_NO_NODE;
+}
 
 void __cpuinit numa_set_node(int cpu, int node)
 {
@@ -776,13 +776,9 @@ void __cpuinit numa_remove_cpu(int cpu)
 void __cpuinit numa_add_cpu(int cpu)
 {
 	unsigned long addr;
-	u16 apicid;
-	int physnid;
-	int nid = NUMA_NO_NODE;
+	int physnid, nid;
 
-	apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
-	if (apicid != BAD_APICID)
-		nid = apicid_to_node[apicid];
+	nid = numa_cpu_node(cpu);
 	if (nid == NUMA_NO_NODE)
 		nid = early_cpu_to_node(cpu);
 	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 6027a48..48651c6 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -255,7 +255,7 @@ int __init get_memcfg_from_srat(void)
 			 num_memory_chunks);
 
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
-		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
+		set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
 
 	for (j = 0; j < num_memory_chunks; j++){
 		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 603d285..9a97261 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -79,7 +79,7 @@ static __init void bad_srat(void)
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
-		apicid_to_node[i] = NUMA_NO_NODE;
+		set_apicid_to_node(i, NUMA_NO_NODE);
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		nodes[i].start = nodes[i].end = 0;
 		nodes_add[i].start = nodes_add[i].end = 0;
@@ -138,7 +138,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
 		return;
 	}
-	apicid_to_node[apic_id] = node;
+	set_apicid_to_node(apic_id, node);
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
 	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
@@ -178,7 +178,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 		return;
 	}
 
-	apicid_to_node[apic_id] = node;
+	set_apicid_to_node(apic_id, node);
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
 	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
@@ -521,7 +521,7 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 		 * node, it must now point to the fake node ID.
 		 */
 		for (j = 0; j < MAX_LOCAL_APIC; j++)
-			if (apicid_to_node[j] == nid &&
+			if (__apicid_to_node[j] == nid &&
 			    fake_apicid_to_node[j] == NUMA_NO_NODE)
 				fake_apicid_to_node[j] = i;
 	}
@@ -532,13 +532,13 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 	 * value.
 	 */
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
-		if (apicid_to_node[i] != NUMA_NO_NODE &&
+		if (__apicid_to_node[i] != NUMA_NO_NODE &&
 		    fake_apicid_to_node[i] == NUMA_NO_NODE)
 			fake_apicid_to_node[i] = 0;
 
 	for (i = 0; i < num_nodes; i++)
 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
-	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+	memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node));
 
 	nodes_clear(nodes_parsed);
 	for (i = 0; i < num_nodes; i++)
-- 
cgit v0.10.2


From 645a79195f66eb68ef3ab2b21d9829ac3aa085a9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:40 +0100
Subject: x86: Unify CPU -> NUMA node mapping between 32 and 64bit

Unlike 64bit, 32bit has been using its own cpu_to_node_map[] for
CPU -> NUMA node mapping.  Replace it with early_percpu variable
x86_cpu_to_node_map and share the mapping code with 64bit.

* USE_PERCPU_NUMA_NODE_ID is now enabled for 32bit too.

* x86_cpu_to_node_map and numa_set/clear_node() are moved from
  numa_64 to numa.  For now, on 32bit, x86_cpu_to_node_map is initialized
  with 0 instead of NUMA_NO_NODE.  This is to avoid introducing unexpected
  behavior change and will be updated once init path is unified.

* srat_detect_node() is now enabled for x86_32 too.  It calls
  numa_set_node() and initializes the mapping making explicit
  cpu_to_node_map[] updates from map/unmap_cpu_to_node() unnecessary.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-15-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: David Rientjes <rientjes@google.com>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d..95c36c4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1705,7 +1705,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
 	depends on NUMA
 
 config USE_PERCPU_NUMA_NODE_ID
-	def_bool X86_64
+	def_bool y
 	depends on NUMA
 
 menu "Power management and ACPI options"
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 5e01c76..2b21fff 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -30,4 +30,12 @@ static inline void set_apicid_to_node(int apicid, s16 node)
 # include "numa_64.h"
 #endif
 
+#ifdef CONFIG_NUMA
+extern void __cpuinit numa_set_node(int cpu, int node);
+extern void __cpuinit numa_clear_node(int cpu);
+#else	/* CONFIG_NUMA */
+static inline void numa_set_node(int cpu, int node)	{ }
+static inline void numa_clear_node(int cpu)		{ }
+#endif	/* CONFIG_NUMA */
+
 #endif	/* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 4982a9c..6ead9f3 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -30,8 +30,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 
 extern void __init init_cpu_to_node(void);
 extern int __cpuinit numa_cpu_node(int cpu);
-extern void __cpuinit numa_set_node(int cpu, int node);
-extern void __cpuinit numa_clear_node(int cpu);
 extern void __cpuinit numa_add_cpu(int cpu);
 extern void __cpuinit numa_remove_cpu(int cpu);
 
@@ -43,8 +41,6 @@ void numa_emu_cmdline(char *);
 #else
 static inline void init_cpu_to_node(void)		{ }
 static inline int numa_cpu_node(int cpu)		{ return NUMA_NO_NODE; }
-static inline void numa_set_node(int cpu, int node)	{ }
-static inline void numa_clear_node(int cpu)		{ }
 static inline void numa_add_cpu(int cpu, int node)	{ }
 static inline void numa_remove_cpu(int cpu)		{ }
 #endif
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21899cc3..b101c17 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -47,21 +47,6 @@
 
 #include <asm/mpspec.h>
 
-#ifdef CONFIG_X86_32
-
-/* Mappings between logical cpu number and node number */
-extern int cpu_to_node_map[];
-
-/* Returns the number of the node containing CPU 'cpu' */
-static inline int __cpu_to_node(int cpu)
-{
-	return cpu_to_node_map[cpu];
-}
-#define early_cpu_to_node __cpu_to_node
-#define cpu_to_node __cpu_to_node
-
-#else /* CONFIG_X86_64 */
-
 /* Mappings between logical cpu number and node number */
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 
@@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu)
 
 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
 
-#endif /* CONFIG_X86_64 */
-
 /* Mappings between node number and cpus on that node. */
 extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a7bca59..a2c5121 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -590,12 +590,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 	if (nid == -1 || !node_online(nid))
 		return;
 	set_apicid_to_node(physid, nid);
-#ifdef CONFIG_X86_64
 	numa_set_node(cpu, nid);
-#else /* CONFIG_X86_32 */
-	cpu_to_node_map[cpu] = nid;
-#endif
-
 #endif
 }
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3cce8f2..77858fd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -233,7 +233,7 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 }
 #endif
 
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#ifdef CONFIG_NUMA
 /*
  * To workaround broken NUMA config.  Read the comment in
  * srat_detect_node().
@@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id);
 
 static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 {
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#ifdef CONFIG_NUMA
 	int cpu = smp_processor_id();
 	int node;
 	unsigned apicid = c->apicid;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 6052004..df86bc8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -276,7 +276,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 
 static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 {
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#ifdef CONFIG_NUMA
 	unsigned node;
 	int cpu = smp_processor_id();
 
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index b5147f0..71f4727 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -233,6 +233,7 @@ void __init setup_per_cpu_areas(void)
 		per_cpu(irq_stack_ptr, cpu) =
 			per_cpu(irq_stack_union.irq_stack, cpu) +
 			IRQ_STACK_SIZE - 64;
+#endif
 #ifdef CONFIG_NUMA
 		per_cpu(x86_cpu_to_node_map, cpu) =
 			early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -246,7 +247,6 @@ void __init setup_per_cpu_areas(void)
 		 */
 		set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
 #endif
-#endif
 		/*
 		 * Up to this point, the boot CPU has been using .init.data
 		 * area.  Reload any changed state for the boot CPU.
@@ -263,7 +263,7 @@ void __init setup_per_cpu_areas(void)
 #ifdef CONFIG_X86_32
 	early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
 #endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+#ifdef CONFIG_NUMA
 	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
 #endif
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b7cfce5..2c20382 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -133,16 +133,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 atomic_t init_deasserted;
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
-/* which node each logical CPU is on */
-int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
-EXPORT_SYMBOL(cpu_to_node_map);
-
 /* set up a mapping between cpu and node. */
 static void map_cpu_to_node(int cpu, int node)
 {
 	printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
 	cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
-	cpu_to_node_map[cpu] = node;
 }
 
 /* undo a mapping between cpu and node. */
@@ -153,7 +148,6 @@ static void unmap_cpu_to_node(int cpu)
 	printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
 	for (node = 0; node < MAX_NUMNODES; node++)
 		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
-	cpu_to_node_map[cpu] = 0;
 }
 #else /* !(CONFIG_NUMA && CONFIG_X86_32) */
 #define map_cpu_to_node(cpu, node)	({})
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 480b357..187810b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -36,6 +36,44 @@ cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 EXPORT_SYMBOL(node_to_cpumask_map);
 
 /*
+ * Map cpu index to node index
+ */
+#ifdef CONFIG_X86_32
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, 0);
+#else
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+#endif
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	/* early setting, no percpu area yet */
+	if (cpu_to_node_map) {
+		cpu_to_node_map[cpu] = node;
+		return;
+	}
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
+		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+		dump_stack();
+		return;
+	}
+#endif
+	per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	if (node != NUMA_NO_NODE)
+		set_cpu_numa_node(cpu, node);
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+/*
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
@@ -62,6 +100,37 @@ void __init setup_node_to_cpumask_map(void)
 }
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
+
+int __cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(__cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!cpu_possible(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+		dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
 /*
  * Returns a pointer to the bitmask of CPUs on Node 'node'.
  */
@@ -84,4 +153,5 @@ const struct cpumask *cpumask_of_node(int node)
 	return node_to_cpumask_map[node];
 }
 EXPORT_SYMBOL(cpumask_of_node);
-#endif
+
+#endif	/* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1e1026f..f545940 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -30,12 +30,6 @@ static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
 /*
- * Map cpu index to node index
- */
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-
-/*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
  * 1 if OK
@@ -732,34 +726,6 @@ int __cpuinit numa_cpu_node(int cpu)
 	return NUMA_NO_NODE;
 }
 
-void __cpuinit numa_set_node(int cpu, int node)
-{
-	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
-
-	/* early setting, no percpu area yet */
-	if (cpu_to_node_map) {
-		cpu_to_node_map[cpu] = node;
-		return;
-	}
-
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
-		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
-		dump_stack();
-		return;
-	}
-#endif
-	per_cpu(x86_cpu_to_node_map, cpu) = node;
-
-	if (node != NUMA_NO_NODE)
-		set_cpu_numa_node(cpu, node);
-}
-
-void __cpuinit numa_clear_node(int cpu)
-{
-	numa_set_node(cpu, NUMA_NO_NODE);
-}
-
 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
 
 #ifndef CONFIG_NUMA_EMU
@@ -887,37 +853,6 @@ void __cpuinit numa_remove_cpu(int cpu)
 {
 	numa_set_cpumask(cpu, 0);
 }
-
-int __cpu_to_node(int cpu)
-{
-	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
-		printk(KERN_WARNING
-			"cpu_to_node(%d): usage too early!\n", cpu);
-		dump_stack();
-		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-	}
-	return per_cpu(x86_cpu_to_node_map, cpu);
-}
-EXPORT_SYMBOL(__cpu_to_node);
-
-/*
- * Same function as cpu_to_node() but used if called before the
- * per_cpu areas are setup.
- */
-int early_cpu_to_node(int cpu)
-{
-	if (early_per_cpu_ptr(x86_cpu_to_node_map))
-		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
-	if (!cpu_possible(cpu)) {
-		printk(KERN_WARNING
-			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
-		dump_stack();
-		return NUMA_NO_NODE;
-	}
-	return per_cpu(x86_cpu_to_node_map, cpu);
-}
-
 /*
  * --------- end of debug versions of the numa functions ---------
  */
-- 
cgit v0.10.2


From de2d9445f1627830ed2ebd00ee9d851986c940b5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:41 +0100
Subject: x86: Unify node_to_cpumask_map handling between 32 and 64bit

x86_32 has been managing node_to_cpumask_map explicitly from
map_cpu_to_node() and friends in a rather ugly way.  With
previous changes, it's now possible to share the code with
64bit.

* When CONFIG_NUMA_EMU is disabled, numa_add/remove_cpu() are
  implemented in numa.c and shared by 32 and 64bit.  CONFIG_NUMA_EMU
  versions still live in numa_64.c.

  NUMA_EMU's dependency on 64bit is planned to be removed and the
  above should go away together.

* identify_cpu() now calls numa_add_cpu() for 32bit too.  This
  makes the explicit mask management from map_cpu_to_node() unnecessary.

* The whole x86_32 specific map_cpu_to_node() chunk is no longer
  necessary.  Dropped.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-16-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: David Rientjes <rientjes@google.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 2b21fff..d3964b2 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_NUMA_H
 #define _ASM_X86_NUMA_H
 
+#include <asm/topology.h>
 #include <asm/apicdef.h>
 
 #ifdef CONFIG_NUMA
@@ -33,9 +34,17 @@ static inline void set_apicid_to_node(int apicid, s16 node)
 #ifdef CONFIG_NUMA
 extern void __cpuinit numa_set_node(int cpu, int node);
 extern void __cpuinit numa_clear_node(int cpu);
+extern void __cpuinit numa_add_cpu(int cpu);
+extern void __cpuinit numa_remove_cpu(int cpu);
 #else	/* CONFIG_NUMA */
 static inline void numa_set_node(int cpu, int node)	{ }
 static inline void numa_clear_node(int cpu)		{ }
+static inline void numa_add_cpu(int cpu)		{ }
+static inline void numa_remove_cpu(int cpu)		{ }
 #endif	/* CONFIG_NUMA */
 
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable);
+#endif
+
 #endif	/* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index cdf8043..bc66031 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -4,7 +4,6 @@
 extern int numa_off;
 
 extern int pxm_to_nid(int pxm);
-extern void numa_remove_cpu(int cpu);
 
 #ifdef CONFIG_NUMA
 extern int __cpuinit numa_cpu_node(int apicid);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 6ead9f3..123f185 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -30,8 +30,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 
 extern void __init init_cpu_to_node(void);
 extern int __cpuinit numa_cpu_node(int cpu);
-extern void __cpuinit numa_add_cpu(int cpu);
-extern void __cpuinit numa_remove_cpu(int cpu);
 
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
@@ -41,8 +39,6 @@ void numa_emu_cmdline(char *);
 #else
 static inline void init_cpu_to_node(void)		{ }
 static inline int numa_cpu_node(int cpu)		{ return NUMA_NO_NODE; }
-static inline void numa_add_cpu(int cpu, int node)	{ }
-static inline void numa_remove_cpu(int cpu)		{ }
 #endif
 
 #endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1d59834..a2559c3 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -869,7 +869,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 
 	select_idle_routine(c);
 
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#ifdef CONFIG_NUMA
 	numa_add_cpu(smp_processor_id());
 #endif
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2c20382..522b217 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -132,49 +132,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 
 atomic_t init_deasserted;
 
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
-/* set up a mapping between cpu and node. */
-static void map_cpu_to_node(int cpu, int node)
-{
-	printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
-	cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
-}
-
-/* undo a mapping between cpu and node. */
-static void unmap_cpu_to_node(int cpu)
-{
-	int node;
-
-	printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
-	for (node = 0; node < MAX_NUMNODES; node++)
-		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
-}
-#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
-#define map_cpu_to_node(cpu, node)	({})
-#define unmap_cpu_to_node(cpu)	({})
-#endif
-
-#ifdef CONFIG_X86_32
-static void map_cpu_to_logical_apicid(void)
-{
-	int cpu = smp_processor_id();
-	int node;
-
-	node = numa_cpu_node(cpu);
-	if (!node_online(node))
-		node = first_online_node;
-
-	map_cpu_to_node(cpu, node);
-}
-
-void numa_remove_cpu(int cpu)
-{
-	unmap_cpu_to_node(cpu);
-}
-#else
-#define map_cpu_to_logical_apicid()  do {} while (0)
-#endif
-
 /*
  * Report back to the Boot Processor.
  * Running on AP.
@@ -242,7 +199,6 @@ static void __cpuinit smp_callin(void)
 		apic->smp_callin_clear_local_apic();
 	setup_local_APIC();
 	end_local_APIC_setup();
-	map_cpu_to_logical_apicid();
 
 	/*
 	 * Need to setup vector mappings before we enable interrupts.
@@ -943,7 +899,6 @@ static __init void disable_smp(void)
 		physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
 	else
 		physid_set_mask_of_physid(0, &phys_cpu_present_map);
-	map_cpu_to_logical_apicid();
 	cpumask_set_cpu(0, cpu_sibling_mask(0));
 	cpumask_set_cpu(0, cpu_core_mask(0));
 }
@@ -1120,8 +1075,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
 	end_local_APIC_setup();
 
-	map_cpu_to_logical_apicid();
-
 	if (apic->setup_portio_remap)
 		apic->setup_portio_remap();
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 187810b..75abecb 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -99,7 +99,21 @@ void __init setup_node_to_cpumask_map(void)
 	pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
 }
 
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+# ifndef CONFIG_NUMA_EMU
+void __cpuinit numa_add_cpu(int cpu)
+{
+	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+# endif	/* !CONFIG_NUMA_EMU */
+
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
 
 int __cpu_to_node(int cpu)
 {
@@ -131,6 +145,52 @@ int early_cpu_to_node(int cpu)
 	return per_cpu(x86_cpu_to_node_map, cpu);
 }
 
+struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	struct cpumask *mask;
+	char buf[64];
+
+	mask = node_to_cpumask_map[node];
+	if (!mask) {
+		pr_err("node_to_cpumask_map[%i] NULL\n", node);
+		dump_stack();
+		return NULL;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu",
+		cpu, node, buf);
+	return mask;
+}
+
+# ifndef CONFIG_NUMA_EMU
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	struct cpumask *mask;
+
+	mask = debug_cpumask_set_cpu(cpu, enable);
+	if (!mask)
+		return;
+
+	if (enable)
+		cpumask_set_cpu(cpu, mask);
+	else
+		cpumask_clear_cpu(cpu, mask);
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+# endif	/* !CONFIG_NUMA_EMU */
+
 /*
  * Returns a pointer to the bitmask of CPUs on Node 'node'.
  */
@@ -154,4 +214,4 @@ const struct cpumask *cpumask_of_node(int node)
 }
 EXPORT_SYMBOL(cpumask_of_node);
 
-#endif	/* CONFIG_DEBUG_PER_CPU_MAPS */
+#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index f545940..14664f5 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -726,19 +726,18 @@ int __cpuinit numa_cpu_node(int cpu)
 	return NUMA_NO_NODE;
 }
 
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
-
-#ifndef CONFIG_NUMA_EMU
-void __cpuinit numa_add_cpu(int cpu)
-{
-	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
-	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-#else
+/*
+ * UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use
+ * of 64bit specific data structures.  The distinction is artificial and
+ * should be removed.  numa_{add|remove}_cpu() are implemented in numa.c
+ * for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when
+ * enabled.
+ *
+ * NUMA emulation is planned to be made generic and the following and other
+ * related code should be moved to numa.c.
+ */
+#ifdef CONFIG_NUMA_EMU
+# ifndef CONFIG_DEBUG_PER_CPU_MAPS
 void __cpuinit numa_add_cpu(int cpu)
 {
 	unsigned long addr;
@@ -778,47 +777,7 @@ void __cpuinit numa_remove_cpu(int cpu)
 	for_each_online_node(i)
 		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
 }
-#endif /* !CONFIG_NUMA_EMU */
-
-#else /* CONFIG_DEBUG_PER_CPU_MAPS */
-static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
-{
-	int node = early_cpu_to_node(cpu);
-	struct cpumask *mask;
-	char buf[64];
-
-	mask = node_to_cpumask_map[node];
-	if (!mask) {
-		pr_err("node_to_cpumask_map[%i] NULL\n", node);
-		dump_stack();
-		return NULL;
-	}
-
-	cpulist_scnprintf(buf, sizeof(buf), mask);
-	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-		enable ? "numa_add_cpu" : "numa_remove_cpu",
-		cpu, node, buf);
-	return mask;
-}
-
-/*
- * --------- debug versions of the numa functions ---------
- */
-#ifndef CONFIG_NUMA_EMU
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
-{
-	struct cpumask *mask;
-
-	mask = debug_cpumask_set_cpu(cpu, enable);
-	if (!mask)
-		return;
-
-	if (enable)
-		cpumask_set_cpu(cpu, mask);
-	else
-		cpumask_clear_cpu(cpu, mask);
-}
-#else
+# else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
 static void __cpuinit numa_set_cpumask(int cpu, int enable)
 {
 	int node = early_cpu_to_node(cpu);
@@ -842,7 +801,6 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
 			cpumask_clear_cpu(cpu, mask);
 	}
 }
-#endif /* CONFIG_NUMA_EMU */
 
 void __cpuinit numa_add_cpu(int cpu)
 {
@@ -853,8 +811,5 @@ void __cpuinit numa_remove_cpu(int cpu)
 {
 	numa_set_cpumask(cpu, 0);
 }
-/*
- * --------- end of debug versions of the numa functions ---------
- */
-
-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+# endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+#endif	/* CONFIG_NUMA_EMU */
-- 
cgit v0.10.2


From 8db78cc4b4048e3add40bca1bc3e55057c319256 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 23 Jan 2011 14:37:42 +0100
Subject: x86: Unify NUMA initialization between 32 and 64bit

Now that everything else is unified, NUMA initialization can be
unified too.

* numa_init_array() and init_cpu_to_node() are moved from
  numa_64 to numa.

* numa_32::initmem_init() is updated to call numa_init_array()
  and setup_arch() to call init_cpu_to_node() on 32bit too.

* x86_cpu_to_node_map is now initialized to NUMA_NO_NODE on
  32bit too. This is safe now as numa_init_array() will initialize
  it early during boot.

This makes NUMA mapping fully initialized before
setup_per_cpu_areas() on 32bit too and thus makes the first
percpu chunk which contains all the static variables and some of
dynamic area allocated with NUMA affinity correctly considered.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <1295789862-25482-17-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Reviewed-by: Pekka Enberg <penberg@kernel.org>

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index d3964b2..26fc6e2 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -34,11 +34,15 @@ static inline void set_apicid_to_node(int apicid, s16 node)
 #ifdef CONFIG_NUMA
 extern void __cpuinit numa_set_node(int cpu, int node);
 extern void __cpuinit numa_clear_node(int cpu);
+extern void __init numa_init_array(void);
+extern void __init init_cpu_to_node(void);
 extern void __cpuinit numa_add_cpu(int cpu);
 extern void __cpuinit numa_remove_cpu(int cpu);
 #else	/* CONFIG_NUMA */
 static inline void numa_set_node(int cpu, int node)	{ }
 static inline void numa_clear_node(int cpu)		{ }
+static inline void numa_init_array(void)		{ }
+static inline void init_cpu_to_node(void)		{ }
 static inline void numa_add_cpu(int cpu)		{ }
 static inline void numa_remove_cpu(int cpu)		{ }
 #endif	/* CONFIG_NUMA */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 123f185..2819afa 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -13,7 +13,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
-extern void numa_init_array(void);
 extern int numa_off;
 
 extern unsigned long numa_free_all_bootmem(void);
@@ -28,7 +27,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
  */
 #define NODE_MIN_SIZE (4*1024*1024)
 
-extern void __init init_cpu_to_node(void);
 extern int __cpuinit numa_cpu_node(int cpu);
 
 #ifdef CONFIG_NUMA_EMU
@@ -37,7 +35,6 @@ extern int __cpuinit numa_cpu_node(int cpu);
 void numa_emu_cmdline(char *);
 #endif /* CONFIG_NUMA_EMU */
 #else
-static inline void init_cpu_to_node(void)		{ }
 static inline int numa_cpu_node(int cpu)		{ return NUMA_NO_NODE; }
 #endif
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d3cfe26c..1202341 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1040,9 +1040,7 @@ void __init setup_arch(char **cmdline_p)
 
 	prefill_possible_map();
 
-#ifdef CONFIG_X86_64
 	init_cpu_to_node();
-#endif
 
 	init_apic_mappings();
 	ioapic_and_gsi_init();
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 75abecb..bf60715 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -38,11 +38,7 @@ EXPORT_SYMBOL(node_to_cpumask_map);
 /*
  * Map cpu index to node index
  */
-#ifdef CONFIG_X86_32
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, 0);
-#else
 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
-#endif
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
 
 void __cpuinit numa_set_node(int cpu, int node)
@@ -99,6 +95,78 @@ void __init setup_node_to_cpumask_map(void)
 	pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
 }
 
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
+void __init numa_init_array(void)
+{
+	int rr, i;
+
+	rr = first_node(node_online_map);
+	for (i = 0; i < nr_cpu_ids; i++) {
+		if (early_cpu_to_node(i) != NUMA_NO_NODE)
+			continue;
+		numa_set_node(i, rr);
+		rr = next_node(rr, node_online_map);
+		if (rr == MAX_NUMNODES)
+			rr = first_node(node_online_map);
+	}
+}
+
+static __init int find_near_online_node(int node)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for_each_online_node(n) {
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	return best_node;
+}
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
+ */
+void __init init_cpu_to_node(void)
+{
+	int cpu;
+	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+
+	BUG_ON(cpu_to_apicid == NULL);
+
+	for_each_possible_cpu(cpu) {
+		int node = numa_cpu_node(cpu);
+
+		if (node == NUMA_NO_NODE)
+			continue;
+		if (!node_online(node))
+			node = find_near_online_node(node);
+		numa_set_node(cpu, node);
+	}
+}
+
 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
 
 # ifndef CONFIG_NUMA_EMU
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 8d91d22..505bb04 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -367,6 +367,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 	 */
 
 	get_memcfg_numa();
+	numa_init_array();
 
 	kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 14664f5..f548fbf 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -224,28 +224,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	node_set_online(nodeid);
 }
 
-/*
- * There are unfortunately some poorly designed mainboards around that
- * only connect memory to a single CPU. This breaks the 1:1 cpu->node
- * mapping. To avoid this fill in the mapping for all possible CPUs,
- * as the number of CPUs is not known yet. We round robin the existing
- * nodes.
- */
-void __init numa_init_array(void)
-{
-	int rr, i;
-
-	rr = first_node(node_online_map);
-	for (i = 0; i < nr_cpu_ids; i++) {
-		if (early_cpu_to_node(i) != NUMA_NO_NODE)
-			continue;
-		numa_set_node(i, rr);
-		rr = next_node(rr, node_online_map);
-		if (rr == MAX_NUMNODES)
-			rr = first_node(node_online_map);
-	}
-}
-
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
@@ -664,59 +642,6 @@ unsigned long __init numa_free_all_bootmem(void)
 	return pages;
 }
 
-#ifdef CONFIG_NUMA
-
-static __init int find_near_online_node(int node)
-{
-	int n, val;
-	int min_val = INT_MAX;
-	int best_node = -1;
-
-	for_each_online_node(n) {
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	return best_node;
-}
-
-/*
- * Setup early cpu_to_node.
- *
- * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
- * and apicid_to_node[] tables have valid entries for a CPU.
- * This means we skip cpu_to_node[] initialisation for NUMA
- * emulation and faking node case (when running a kernel compiled
- * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
- * is already initialized in a round robin manner at numa_init_array,
- * prior to this call, and this initialization is good enough
- * for the fake NUMA cases.
- *
- * Called before the per_cpu areas are setup.
- */
-void __init init_cpu_to_node(void)
-{
-	int cpu;
-	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
-
-	BUG_ON(cpu_to_apicid == NULL);
-
-	for_each_possible_cpu(cpu) {
-		int node = numa_cpu_node(cpu);
-
-		if (node == NUMA_NO_NODE)
-			continue;
-		if (!node_online(node))
-			node = find_near_online_node(node);
-		numa_set_node(cpu, node);
-	}
-}
-#endif
-
 int __cpuinit numa_cpu_node(int cpu)
 {
 	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
-- 
cgit v0.10.2


From 4e62445b90ac4ef708bd11c7ae052b1d5ef765b5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Jan 2011 17:22:48 +0100
Subject: x86: Fix build failure on X86_UP_APIC

Commit 4c321ff8 (x86: Replace cpu_2_logical_apicid[] with early
percpu variable) and following changes introduced and used
x86_cpu_to_logical_apicid percpu variable.  It was declared and
defined inside CONFIG_SMP && CONFIG_X86_32 but if
CONFIG_X86_UP_APIC is set UP configuration makes use of it and
build fails.

Fix it by declaring and defining it inside CONFIG_X86_LOCAL_APIC
&& CONFIG_X86_32.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: eric.dumazet@gmail.com
Cc: yinghai@kernel.org
Cc: brgerst@gmail.com
Cc: gorcunov@gmail.com
Cc: penberg@kernel.org
Cc: shaohui.zheng@intel.com
Cc: rientjes@google.com
LKML-Reference: <20110128162248.GA25746@htj.dyndns.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index dc7c46a..7592782 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -38,7 +38,7 @@ static inline struct cpumask *cpu_core_mask(int cpu)
 
 DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_32)
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
 #endif
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4686ea5..1390cf9 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -79,7 +79,6 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 
 #ifdef CONFIG_X86_32
 
-#ifdef CONFIG_SMP
 /*
  * On x86_32, the mapping between cpu and logical apicid may vary
  * depending on apic in use.  The following early percpu variable is
@@ -87,7 +86,6 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
  * actually diverge.  Let's keep it ugly for now.
  */
 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
-#endif
 
 /*
  * Knob to control our willingness to enable the local APIC.
-- 
cgit v0.10.2


From dc82009aac6ee6e423b48de43a251745c62ab012 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 28 Jan 2011 14:49:19 -0200
Subject: perf record: No need to check for overwrites

As we open the mmap with (PROT_READ | PROT_WRITE), signalling the kernel
with perf_mmap__write_tail() when consuming data, so the kernel will not
overwrite.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index d788630..caf9279 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -115,27 +115,11 @@ static void mmap_read(struct perf_mmap *md)
 	unsigned char *data = md->base + page_size;
 	unsigned long size;
 	void *buf;
-	int diff;
 
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and mess up the samples under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff < 0) {
-		fprintf(stderr, "WARNING: failed to keep up with mmap data\n");
-		/*
-		 * head points to a known good entry, start there.
-		 */
-		old = head;
-	}
+	if (old == head)
+		return;
 
-	if (old != head)
-		samples++;
+	samples++;
 
 	size = head - old;
 
-- 
cgit v0.10.2


From ef2bf6d043ac9bd4a6f38d862af407154a4754d9 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 29 Jan 2011 09:04:40 -0200
Subject: perf events: Account PERF_RECORD_LOST events in event__process

Right now this function is only used by perf top, that uses PROT_READ
only, i.e. overwrite mode, so no PERF_RECORD_LOST events are generated,
but don't forget those events.

The patch that moved this out of perf top was made so that this routine
could be used by 'perf probe' in the uprobes patchset, so perhaps there
they need to check for LOST events and warn the user, as will be done in
the following patches that will switch 'perf top' to non overwrite mode
(mmap with PROT_READ|PROT_WRITE).

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index e4db8b8..b9d4ac8 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -646,6 +646,8 @@ int event__process(event_t *event, struct sample_data *sample,
 	case PERF_RECORD_EXIT:
 		event__process_task(event, sample, session);
 		break;
+	case PERF_RECORD_LOST:
+		event__process_lost(event, sample, session);
 	default:
 		break;
 	}
-- 
cgit v0.10.2


From 7bb41152b9be7e31f10d8919bce5034135525d9d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 29 Jan 2011 09:08:13 -0200
Subject: perf evlist: Support non overwrite mode in perf_evlist__read_on_cpu

I.e. stash the overwrite mode in struct perf_evlist and act accordingly
in perf_evlist__read_on_cpu, not checking for overwrites and touching
the tail after consuming one event, like perf record does, for instance.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index df0610e..b498eec 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -116,24 +116,25 @@ event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
 	event_t *event = NULL;
-	int diff;
-
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and mess up the samples under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff > md->mask / 2 || diff < 0) {
-		fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
 
+	if (evlist->overwrite) {
 		/*
-		 * head points to a known good entry, start there.
+		 * If we're further behind than half the buffer, there's a chance
+		 * the writer will bite our tail and mess up the samples under us.
+		 *
+		 * If we somehow ended up ahead of the head, we got messed up.
+		 *
+		 * In either case, truncate and restart at head.
 		 */
-		old = head;
+		int diff = head - old;
+		if (diff > md->mask / 2 || diff < 0) {
+			fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
+
+			/*
+			 * head points to a known good entry, start there.
+			 */
+			old = head;
+		}
 	}
 
 	if (old != head) {
@@ -166,5 +167,9 @@ event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
 	}
 
 	md->prev = old;
+
+	if (!evlist->overwrite)
+		perf_mmap__write_tail(md, old);
+
 	return event;
 }
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index acbe48e..2706ae4 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -16,6 +16,7 @@ struct perf_evlist {
 	int		 nr_entries;
 	int		 nr_fds;
 	int		 mmap_len;
+	bool		 overwrite;
 	event_t		 event_copy;
 	struct perf_mmap *mmap;
 	struct pollfd	 *pollfd;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 76ab553..f98b3e5 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -327,6 +327,7 @@ int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
 	    perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0)
 		return -ENOMEM;
 
+	evlist->overwrite = overwrite;
 	evlist->mmap_len = (pages + 1) * page_size;
 	first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
 
-- 
cgit v0.10.2


From 93fc64f14472ae24fd640bf3834a178f59142842 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 29 Jan 2011 12:08:00 -0200
Subject: perf top: Switch to non overwrite mode

Just like 'perf record'. Warn the user when PERF_RECORD_LOST events
happen.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index ce2e50c..7f92ab7 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -464,7 +464,7 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
 	rb_insert_color(&se->rb_node, tree);
 }
 
-static void print_sym_table(void)
+static void print_sym_table(struct perf_session *session)
 {
 	int printed = 0, j;
 	struct perf_evsel *counter;
@@ -513,7 +513,6 @@ static void print_sym_table(void)
 
 	puts(CONSOLE_CLEAR);
 
-	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
 	if (!perf_guest) {
 		printf("   PerfTop:%8.0f irqs/sec  kernel:%4.1f%%"
 			"  exact: %4.1f%% [",
@@ -578,6 +577,12 @@ static void print_sym_table(void)
 
 	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
 
+	if (session->hists.stats.total_lost != 0) {
+		color_fprintf(stdout, PERF_COLOR_RED, "WARNING:");
+		printf(" LOST %" PRIu64 " events, Check IO/CPU overload\n",
+		       session->hists.stats.total_lost);
+	}
+
 	if (sym_filter_entry) {
 		show_details(sym_filter_entry);
 		return;
@@ -919,7 +924,7 @@ repeat:
 	getc(stdin);
 
 	do {
-		print_sym_table();
+		print_sym_table(session);
 	} while (!poll(&stdin_poll, 1, delay_msecs) == 1);
 
 	c = getc(stdin);
@@ -1176,7 +1181,7 @@ try_again:
 		}
 	}
 
-	if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, true) < 0)
+	if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, false) < 0)
 		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 }
 
-- 
cgit v0.10.2


From 8d50e5b4171a69cf48ca94a1e7c14033d0b4771d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 29 Jan 2011 13:02:00 -0200
Subject: perf tools: Rename 'struct sample_data' to 'struct perf_sample'

Making the namespace more uniform.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 8879463..ef36751 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -58,7 +58,7 @@ static int hists__add_entry(struct hists *self, struct addr_location *al)
 	return hist_entry__inc_addr_samples(he, al->addr);
 }
 
-static int process_sample_event(event_t *event, struct sample_data *sample,
+static int process_sample_event(event_t *event, struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct addr_location al;
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 3153e49..0822149 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -31,7 +31,7 @@ static int hists__add_entry(struct hists *self,
 }
 
 static int diff__process_sample_event(event_t *event,
-				      struct sample_data *sample,
+				      struct perf_sample *sample,
 				      struct perf_session *session)
 {
 	struct addr_location al;
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 0c78ffa..4c9388c 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -36,13 +36,13 @@ static int event__repipe_synth(event_t *event,
 	return 0;
 }
 
-static int event__repipe(event_t *event, struct sample_data *sample __used,
+static int event__repipe(event_t *event, struct perf_sample *sample __used,
 			 struct perf_session *session)
 {
 	return event__repipe_synth(event, session);
 }
 
-static int event__repipe_mmap(event_t *self, struct sample_data *sample,
+static int event__repipe_mmap(event_t *self, struct perf_sample *sample,
 			      struct perf_session *session)
 {
 	int err;
@@ -53,7 +53,7 @@ static int event__repipe_mmap(event_t *self, struct sample_data *sample,
 	return err;
 }
 
-static int event__repipe_task(event_t *self, struct sample_data *sample,
+static int event__repipe_task(event_t *self, struct perf_sample *sample,
 			      struct perf_session *session)
 {
 	int err;
@@ -119,7 +119,7 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
 	return 0;
 }
 
-static int event__inject_buildid(event_t *event, struct sample_data *sample,
+static int event__inject_buildid(event_t *event, struct perf_sample *sample,
 				 struct perf_session *session)
 {
 	struct addr_location al;
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index d97256d..3c1cdcf 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -304,7 +304,7 @@ process_raw_event(event_t *raw_event __used, void *data,
 	}
 }
 
-static int process_sample_event(event_t *event, struct sample_data *sample,
+static int process_sample_event(event_t *event, struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct thread *thread = perf_session__findnew(session, event->ip.pid);
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 2b36def..c3f5127 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -834,7 +834,7 @@ static void dump_info(void)
 		die("Unknown type of information\n");
 }
 
-static int process_sample_event(event_t *self, struct sample_data *sample,
+static int process_sample_event(event_t *self, struct perf_sample *sample,
 				struct perf_session *s)
 {
 	struct thread *thread = perf_session__findnew(s, sample->tid);
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index caf9279..5d3e4b3 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -101,7 +101,7 @@ static void write_output(void *buf, size_t size)
 }
 
 static int process_synthesized_event(event_t *event,
-				     struct sample_data *sample __used,
+				     struct perf_sample *sample __used,
 				     struct perf_session *self __used)
 {
 	write_output(event, event->header.size);
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f6a4349..bbbadcc 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -77,9 +77,9 @@ static struct hists *perf_session__hists_findnew(struct perf_session *self,
 	return new;
 }
 
-static int perf_session__add_hist_entry(struct perf_session *self,
+static int perf_session__add_hist_entry(struct perf_session *session,
 					struct addr_location *al,
-					struct sample_data *data)
+					struct perf_sample *sample)
 {
 	struct symbol *parent = NULL;
 	int err = 0;
@@ -87,28 +87,28 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	struct hists *hists;
 	struct perf_event_attr *attr;
 
-	if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain) {
-		err = perf_session__resolve_callchain(self, al->thread,
-						      data->callchain, &parent);
+	if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
+		err = perf_session__resolve_callchain(session, al->thread,
+						      sample->callchain, &parent);
 		if (err)
 			return err;
 	}
 
-	attr = perf_header__find_attr(data->id, &self->header);
+	attr = perf_header__find_attr(sample->id, &session->header);
 	if (attr)
-		hists = perf_session__hists_findnew(self, data->id, attr->type, attr->config);
+		hists = perf_session__hists_findnew(session, sample->id, attr->type, attr->config);
 	else
-		hists = perf_session__hists_findnew(self, data->id, 0, 0);
+		hists = perf_session__hists_findnew(session, sample->id, 0, 0);
 	if (hists == NULL)
 		return -ENOMEM;
 
-	he = __hists__add_entry(hists, al, parent, data->period);
+	he = __hists__add_entry(hists, al, parent, sample->period);
 	if (he == NULL)
 		return -ENOMEM;
 
 	if (symbol_conf.use_callchain) {
-		err = callchain_append(he->callchain, &self->callchain_cursor,
-				       data->period);
+		err = callchain_append(he->callchain, &session->callchain_cursor,
+				       sample->period);
 		if (err)
 			return err;
 	}
@@ -124,32 +124,32 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 }
 
 static int add_event_total(struct perf_session *session,
-			   struct sample_data *data,
+			   struct perf_sample *sample,
 			   struct perf_event_attr *attr)
 {
 	struct hists *hists;
 
 	if (attr)
-		hists = perf_session__hists_findnew(session, data->id,
+		hists = perf_session__hists_findnew(session, sample->id,
 						    attr->type, attr->config);
 	else
-		hists = perf_session__hists_findnew(session, data->id, 0, 0);
+		hists = perf_session__hists_findnew(session, sample->id, 0, 0);
 
 	if (!hists)
 		return -ENOMEM;
 
-	hists->stats.total_period += data->period;
+	hists->stats.total_period += sample->period;
 	/*
 	 * FIXME: add_event_total should be moved from here to
 	 * perf_session__process_event so that the proper hist is passed to
 	 * the event_op methods.
 	 */
 	hists__inc_nr_events(hists, PERF_RECORD_SAMPLE);
-	session->hists.stats.total_period += data->period;
+	session->hists.stats.total_period += sample->period;
 	return 0;
 }
 
-static int process_sample_event(event_t *event, struct sample_data *sample,
+static int process_sample_event(event_t *event, struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct addr_location al;
@@ -179,7 +179,7 @@ static int process_sample_event(event_t *event, struct sample_data *sample,
 	return 0;
 }
 
-static int process_read_event(event_t *event, struct sample_data *sample __used,
+static int process_read_event(event_t *event, struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
 	struct perf_event_attr *attr;
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 29acb894..ff993c8 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1607,7 +1607,7 @@ process_raw_event(event_t *raw_event __used, struct perf_session *session,
 		process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
 }
 
-static int process_sample_event(event_t *event, struct sample_data *sample,
+static int process_sample_event(event_t *event, struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct thread *thread;
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index b766c2a..5c4c809 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -63,7 +63,7 @@ static int cleanup_scripting(void)
 
 static char const		*input_name = "perf.data";
 
-static int process_sample_event(event_t *event, struct sample_data *sample,
+static int process_sample_event(event_t *event, struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct thread *thread = perf_session__findnew(session, event->ip.pid);
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 738830d..df62433 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -550,7 +550,7 @@ static int test__basic_mmap(void)
 		}
 
 	while ((event = perf_evlist__read_on_cpu(evlist, 0)) != NULL) {
-		struct sample_data sample;
+		struct perf_sample sample;
 
 		if (event->header.type != PERF_RECORD_SAMPLE) {
 			pr_debug("unexpected %s event\n",
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 746cf03..01cf0c3 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -276,21 +276,21 @@ static int cpus_cstate_state[MAX_CPUS];
 static u64 cpus_pstate_start_times[MAX_CPUS];
 static u64 cpus_pstate_state[MAX_CPUS];
 
-static int process_comm_event(event_t *event, struct sample_data *sample __used,
+static int process_comm_event(event_t *event, struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
 	pid_set_comm(event->comm.tid, event->comm.comm);
 	return 0;
 }
 
-static int process_fork_event(event_t *event, struct sample_data *sample __used,
+static int process_fork_event(event_t *event, struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
 	pid_fork(event->fork.pid, event->fork.ppid, event->fork.time);
 	return 0;
 }
 
-static int process_exit_event(event_t *event, struct sample_data *sample __used,
+static int process_exit_event(event_t *event, struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
 	pid_exit(event->fork.pid, event->fork.time);
@@ -487,7 +487,7 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te)
 
 
 static int process_sample_event(event_t *event __used,
-				struct sample_data *sample,
+				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct trace_entry *te;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 7f92ab7..d923127 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -997,7 +997,7 @@ static int symbol_filter(struct map *map, struct symbol *sym)
 }
 
 static void event__process_sample(const event_t *self,
-				  struct sample_data *sample,
+				  struct perf_sample *sample,
 				  struct perf_session *session)
 {
 	u64 ip = self->ip.ip;
@@ -1107,7 +1107,7 @@ static void event__process_sample(const event_t *self,
 
 static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
 {
-	struct sample_data sample;
+	struct perf_sample sample;
 	event_t *event;
 
 	while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) {
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index deffb8c..b184a7f 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -15,7 +15,7 @@
 #include "debug.h"
 
 static int build_id__mark_dso_hit(event_t *event,
-				  struct sample_data *sample __used,
+				  struct perf_sample *sample __used,
 				  struct perf_session *session)
 {
 	struct addr_location al;
@@ -37,7 +37,7 @@ static int build_id__mark_dso_hit(event_t *event,
 	return 0;
 }
 
-static int event__exit_del_thread(event_t *self, struct sample_data *sample __used,
+static int event__exit_del_thread(event_t *self, struct perf_sample *sample __used,
 				  struct perf_session *session)
 {
 	struct thread *thread = perf_session__findnew(session, self->fork.tid);
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index b9d4ac8..5c886fb 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -34,7 +34,7 @@ const char *event__get_event_name(unsigned int id)
 	return event__name[id];
 }
 
-static struct sample_data synth_sample = {
+static struct perf_sample synth_sample = {
 	.pid	   = -1,
 	.tid	   = -1,
 	.time	   = -1,
@@ -440,7 +440,7 @@ static int thread__set_comm_adjust(struct thread *self, const char *comm,
 	return 0;
 }
 
-int event__process_comm(event_t *self, struct sample_data *sample __used,
+int event__process_comm(event_t *self, struct perf_sample *sample __used,
 			struct perf_session *session)
 {
 	struct thread *thread = perf_session__findnew(session, self->comm.tid);
@@ -456,7 +456,7 @@ int event__process_comm(event_t *self, struct sample_data *sample __used,
 	return 0;
 }
 
-int event__process_lost(event_t *self, struct sample_data *sample __used,
+int event__process_lost(event_t *self, struct perf_sample *sample __used,
 			struct perf_session *session)
 {
 	dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n",
@@ -567,7 +567,7 @@ out_problem:
 	return -1;
 }
 
-int event__process_mmap(event_t *self, struct sample_data *sample __used,
+int event__process_mmap(event_t *self, struct perf_sample *sample __used,
 			struct perf_session *session)
 {
 	struct machine *machine;
@@ -609,7 +609,7 @@ out_problem:
 	return 0;
 }
 
-int event__process_task(event_t *self, struct sample_data *sample __used,
+int event__process_task(event_t *self, struct perf_sample *sample __used,
 			struct perf_session *session)
 {
 	struct thread *thread = perf_session__findnew(session, self->fork.tid);
@@ -632,7 +632,7 @@ int event__process_task(event_t *self, struct sample_data *sample __used,
 	return 0;
 }
 
-int event__process(event_t *event, struct sample_data *sample,
+int event__process(event_t *event, struct perf_sample *sample,
 		   struct perf_session *session)
 {
 	switch (event->header.type) {
@@ -757,7 +757,7 @@ static void dso__calc_col_width(struct dso *self, struct hists *hists)
 }
 
 int event__preprocess_sample(const event_t *self, struct perf_session *session,
-			     struct addr_location *al, struct sample_data *data,
+			     struct addr_location *al, struct perf_sample *sample,
 			     symbol_filter_t filter)
 {
 	u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
@@ -788,7 +788,7 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
 		    al->map ? al->map->dso->long_name :
 			al->level == 'H' ? "[hypervisor]" : "<not found>");
 	al->sym = NULL;
-	al->cpu = data->cpu;
+	al->cpu = sample->cpu;
 
 	if (al->map) {
 		if (symbol_conf.dso_list &&
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index d79e4ed..84fd71f 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -61,7 +61,7 @@ struct sample_event {
 	u64 array[];
 };
 
-struct sample_data {
+struct perf_sample {
 	u64 ip;
 	u32 pid, tid;
 	u64 time;
@@ -138,7 +138,7 @@ struct perf_session;
 
 typedef int (*event__handler_synth_t)(event_t *event, 
 				      struct perf_session *session);
-typedef int (*event__handler_t)(event_t *event, struct sample_data *sample,
+typedef int (*event__handler_t)(event_t *event, struct perf_sample *sample,
 				struct perf_session *session);
 
 int event__synthesize_thread(pid_t pid, event__handler_t process,
@@ -154,25 +154,25 @@ int event__synthesize_modules(event__handler_t process,
 			      struct perf_session *session,
 			      struct machine *machine);
 
-int event__process_comm(event_t *self, struct sample_data *sample,
+int event__process_comm(event_t *event, struct perf_sample *sample,
 			struct perf_session *session);
-int event__process_lost(event_t *self, struct sample_data *sample,
+int event__process_lost(event_t *event, struct perf_sample *sample,
 			struct perf_session *session);
-int event__process_mmap(event_t *self, struct sample_data *sample,
+int event__process_mmap(event_t *event, struct perf_sample *sample,
 			struct perf_session *session);
-int event__process_task(event_t *self, struct sample_data *sample,
+int event__process_task(event_t *event, struct perf_sample *sample,
 			struct perf_session *session);
-int event__process(event_t *event, struct sample_data *sample,
+int event__process(event_t *event, struct perf_sample *sample,
 		   struct perf_session *session);
 
 struct addr_location;
 int event__preprocess_sample(const event_t *self, struct perf_session *session,
-			     struct addr_location *al, struct sample_data *data,
+			     struct addr_location *al, struct perf_sample *sample,
 			     symbol_filter_t filter);
 
 const char *event__get_event_name(unsigned int id);
 
 int event__parse_sample(const event_t *event, u64 type, bool sample_id_all,
-			struct sample_data *sample);
+			struct perf_sample *sample);
 
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index f98b3e5..a134885 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -368,7 +368,7 @@ out_unmap:
 }
 
 static int event__parse_id_sample(const event_t *event, u64 type,
-				  struct sample_data *sample)
+				  struct perf_sample *sample)
 {
 	const u64 *array = event->sample.array;
 
@@ -406,7 +406,7 @@ static int event__parse_id_sample(const event_t *event, u64 type,
 }
 
 int event__parse_sample(const event_t *event, u64 type, bool sample_id_all,
-			struct sample_data *data)
+			struct perf_sample *data)
 {
 	const u64 *array;
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index e6a0740..ee0b611 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -67,7 +67,7 @@ out_close:
 
 static void perf_session__id_header_size(struct perf_session *session)
 {
-       struct sample_data *data;
+       struct perf_sample *data;
        u64 sample_type = session->sample_type;
        u16 size = 0;
 
@@ -299,7 +299,7 @@ static int process_event_synth_stub(event_t *event __used,
 }
 
 static int process_event_stub(event_t *event __used,
-			      struct sample_data *sample __used,
+			      struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
 	dump_printf(": unhandled!\n");
@@ -475,7 +475,7 @@ static void perf_session_free_sample_buffers(struct perf_session *session)
 
 static int perf_session_deliver_event(struct perf_session *session,
 				      event_t *event,
-				      struct sample_data *sample,
+				      struct perf_sample *sample,
 				      struct perf_event_ops *ops,
 				      u64 file_offset);
 
@@ -485,7 +485,7 @@ static void flush_sample_queue(struct perf_session *s,
 	struct ordered_samples *os = &s->ordered_samples;
 	struct list_head *head = &os->samples;
 	struct sample_queue *tmp, *iter;
-	struct sample_data sample;
+	struct perf_sample sample;
 	u64 limit = os->next_flush;
 	u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL;
 
@@ -610,11 +610,11 @@ static void __queue_event(struct sample_queue *new, struct perf_session *s)
 #define MAX_SAMPLE_BUFFER	(64 * 1024 / sizeof(struct sample_queue))
 
 static int perf_session_queue_event(struct perf_session *s, event_t *event,
-				    struct sample_data *data, u64 file_offset)
+				    struct perf_sample *sample, u64 file_offset)
 {
 	struct ordered_samples *os = &s->ordered_samples;
 	struct list_head *sc = &os->sample_cache;
-	u64 timestamp = data->time;
+	u64 timestamp = sample->time;
 	struct sample_queue *new;
 
 	if (!timestamp || timestamp == ~0ULL)
@@ -650,7 +650,7 @@ static int perf_session_queue_event(struct perf_session *s, event_t *event,
 	return 0;
 }
 
-static void callchain__printf(struct sample_data *sample)
+static void callchain__printf(struct perf_sample *sample)
 {
 	unsigned int i;
 
@@ -663,7 +663,7 @@ static void callchain__printf(struct sample_data *sample)
 
 static void perf_session__print_tstamp(struct perf_session *session,
 				       event_t *event,
-				       struct sample_data *sample)
+				       struct perf_sample *sample)
 {
 	if (event->header.type != PERF_RECORD_SAMPLE &&
 	    !session->sample_id_all) {
@@ -679,7 +679,7 @@ static void perf_session__print_tstamp(struct perf_session *session,
 }
 
 static void dump_event(struct perf_session *session, event_t *event,
-		       u64 file_offset, struct sample_data *sample)
+		       u64 file_offset, struct perf_sample *sample)
 {
 	if (!dump_trace)
 		return;
@@ -697,7 +697,7 @@ static void dump_event(struct perf_session *session, event_t *event,
 }
 
 static void dump_sample(struct perf_session *session, event_t *event,
-			struct sample_data *sample)
+			struct perf_sample *sample)
 {
 	if (!dump_trace)
 		return;
@@ -712,7 +712,7 @@ static void dump_sample(struct perf_session *session, event_t *event,
 
 static int perf_session_deliver_event(struct perf_session *session,
 				      event_t *event,
-				      struct sample_data *sample,
+				      struct perf_sample *sample,
 				      struct perf_event_ops *ops,
 				      u64 file_offset)
 {
@@ -745,7 +745,7 @@ static int perf_session_deliver_event(struct perf_session *session,
 }
 
 static int perf_session__preprocess_sample(struct perf_session *session,
-					   event_t *event, struct sample_data *sample)
+					   event_t *event, struct perf_sample *sample)
 {
 	if (event->header.type != PERF_RECORD_SAMPLE ||
 	    !(session->sample_type & PERF_SAMPLE_CALLCHAIN))
@@ -789,7 +789,7 @@ static int perf_session__process_event(struct perf_session *session,
 				       struct perf_event_ops *ops,
 				       u64 file_offset)
 {
-	struct sample_data sample;
+	struct perf_sample sample;
 	int ret;
 
 	if (session->header.needs_swap && event__swap_ops[event->header.type])
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 7823976..365bf53 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -57,7 +57,7 @@ struct perf_session {
 
 struct perf_event_ops;
 
-typedef int (*event_op)(event_t *self, struct sample_data *sample,
+typedef int (*event_op)(event_t *self, struct perf_sample *sample,
 			struct perf_session *session);
 typedef int (*event_synth_op)(event_t *self, struct perf_session *session);
 typedef int (*event_op2)(event_t *self, struct perf_session *session,
@@ -158,7 +158,7 @@ size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp)
 
 static inline int perf_session__parse_sample(struct perf_session *session,
 					     const event_t *event,
-					     struct sample_data *sample)
+					     struct perf_sample *sample)
 {
 	return event__parse_sample(event, session->sample_type,
 				   session->sample_id_all, sample);
-- 
cgit v0.10.2


From 8115d60c323dd9931b95221c0a392aeddc1d6ef3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 29 Jan 2011 14:01:45 -0200
Subject: perf tools: Kill event_t typedef, use 'union perf_event' instead

And move the event_t methods to the perf_event__ too.

No code changes, just namespace consistency.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index ef36751..7006786 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -58,12 +58,13 @@ static int hists__add_entry(struct hists *self, struct addr_location *al)
 	return hist_entry__inc_addr_samples(he, al->addr);
 }
 
-static int process_sample_event(event_t *event, struct perf_sample *sample,
+static int process_sample_event(union perf_event *event,
+				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct addr_location al;
 
-	if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
+	if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
 		pr_warning("problem processing %d event, skipping it.\n",
 			   event->header.type);
 		return -1;
@@ -372,9 +373,9 @@ find_next:
 
 static struct perf_event_ops event_ops = {
 	.sample	= process_sample_event,
-	.mmap	= event__process_mmap,
-	.comm	= event__process_comm,
-	.fork	= event__process_task,
+	.mmap	= perf_event__process_mmap,
+	.comm	= perf_event__process_comm,
+	.fork	= perf_event__process_task,
 	.ordered_samples = true,
 	.ordering_requires_timestamps = true,
 };
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 0822149..6b7d911 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -30,13 +30,13 @@ static int hists__add_entry(struct hists *self,
 	return -ENOMEM;
 }
 
-static int diff__process_sample_event(event_t *event,
+static int diff__process_sample_event(union perf_event *event,
 				      struct perf_sample *sample,
 				      struct perf_session *session)
 {
 	struct addr_location al;
 
-	if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
+	if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
 		pr_warning("problem processing %d event, skipping it.\n",
 			   event->header.type);
 		return -1;
@@ -56,11 +56,11 @@ static int diff__process_sample_event(event_t *event,
 
 static struct perf_event_ops event_ops = {
 	.sample	= diff__process_sample_event,
-	.mmap	= event__process_mmap,
-	.comm	= event__process_comm,
-	.exit	= event__process_task,
-	.fork	= event__process_task,
-	.lost	= event__process_lost,
+	.mmap	= perf_event__process_mmap,
+	.comm	= perf_event__process_comm,
+	.exit	= perf_event__process_task,
+	.fork	= perf_event__process_task,
+	.lost	= perf_event__process_lost,
 	.ordered_samples = true,
 	.ordering_requires_timestamps = true,
 };
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 4c9388c..e29f04e 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -16,8 +16,8 @@
 static char		const *input_name = "-";
 static bool		inject_build_ids;
 
-static int event__repipe_synth(event_t *event,
-			       struct perf_session *session __used)
+static int perf_event__repipe_synth(union perf_event *event,
+				    struct perf_session *session __used)
 {
 	uint32_t size;
 	void *buf = event;
@@ -36,41 +36,44 @@ static int event__repipe_synth(event_t *event,
 	return 0;
 }
 
-static int event__repipe(event_t *event, struct perf_sample *sample __used,
-			 struct perf_session *session)
+static int perf_event__repipe(union perf_event *event,
+			      struct perf_sample *sample __used,
+			      struct perf_session *session)
 {
-	return event__repipe_synth(event, session);
+	return perf_event__repipe_synth(event, session);
 }
 
-static int event__repipe_mmap(event_t *self, struct perf_sample *sample,
-			      struct perf_session *session)
+static int perf_event__repipe_mmap(union perf_event *event,
+				   struct perf_sample *sample,
+				   struct perf_session *session)
 {
 	int err;
 
-	err = event__process_mmap(self, sample, session);
-	event__repipe(self, sample, session);
+	err = perf_event__process_mmap(event, sample, session);
+	perf_event__repipe(event, sample, session);
 
 	return err;
 }
 
-static int event__repipe_task(event_t *self, struct perf_sample *sample,
-			      struct perf_session *session)
+static int perf_event__repipe_task(union perf_event *event,
+				   struct perf_sample *sample,
+				   struct perf_session *session)
 {
 	int err;
 
-	err = event__process_task(self, sample, session);
-	event__repipe(self, sample, session);
+	err = perf_event__process_task(event, sample, session);
+	perf_event__repipe(event, sample, session);
 
 	return err;
 }
 
-static int event__repipe_tracing_data(event_t *self,
-				      struct perf_session *session)
+static int perf_event__repipe_tracing_data(union perf_event *event,
+					   struct perf_session *session)
 {
 	int err;
 
-	event__repipe_synth(self, session);
-	err = event__process_tracing_data(self, session);
+	perf_event__repipe_synth(event, session);
+	err = perf_event__process_tracing_data(event, session);
 
 	return err;
 }
@@ -109,8 +112,8 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
 	if (self->kernel)
 		misc = PERF_RECORD_MISC_KERNEL;
 
-	err = event__synthesize_build_id(self, misc, event__repipe,
-					 machine, session);
+	err = perf_event__synthesize_build_id(self, misc, perf_event__repipe,
+					      machine, session);
 	if (err) {
 		pr_err("Can't synthesize build_id event for %s\n", self->long_name);
 		return -1;
@@ -119,8 +122,9 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
 	return 0;
 }
 
-static int event__inject_buildid(event_t *event, struct perf_sample *sample,
-				 struct perf_session *session)
+static int perf_event__inject_buildid(union perf_event *event,
+				      struct perf_sample *sample,
+				      struct perf_session *session)
 {
 	struct addr_location al;
 	struct thread *thread;
@@ -155,24 +159,24 @@ static int event__inject_buildid(event_t *event, struct perf_sample *sample,
 	}
 
 repipe:
-	event__repipe(event, sample, session);
+	perf_event__repipe(event, sample, session);
 	return 0;
 }
 
 struct perf_event_ops inject_ops = {
-	.sample		= event__repipe,
-	.mmap		= event__repipe,
-	.comm		= event__repipe,
-	.fork		= event__repipe,
-	.exit		= event__repipe,
-	.lost		= event__repipe,
-	.read		= event__repipe,
-	.throttle	= event__repipe,
-	.unthrottle	= event__repipe,
-	.attr		= event__repipe_synth,
-	.event_type 	= event__repipe_synth,
-	.tracing_data 	= event__repipe_synth,
-	.build_id 	= event__repipe_synth,
+	.sample		= perf_event__repipe,
+	.mmap		= perf_event__repipe,
+	.comm		= perf_event__repipe,
+	.fork		= perf_event__repipe,
+	.exit		= perf_event__repipe,
+	.lost		= perf_event__repipe,
+	.read		= perf_event__repipe,
+	.throttle	= perf_event__repipe,
+	.unthrottle	= perf_event__repipe,
+	.attr		= perf_event__repipe_synth,
+	.event_type 	= perf_event__repipe_synth,
+	.tracing_data 	= perf_event__repipe_synth,
+	.build_id 	= perf_event__repipe_synth,
 };
 
 extern volatile int session_done;
@@ -190,10 +194,10 @@ static int __cmd_inject(void)
 	signal(SIGINT, sig_handler);
 
 	if (inject_build_ids) {
-		inject_ops.sample	= event__inject_buildid;
-		inject_ops.mmap		= event__repipe_mmap;
-		inject_ops.fork		= event__repipe_task;
-		inject_ops.tracing_data	= event__repipe_tracing_data;
+		inject_ops.sample	= perf_event__inject_buildid;
+		inject_ops.mmap		= perf_event__repipe_mmap;
+		inject_ops.fork		= perf_event__repipe_task;
+		inject_ops.tracing_data	= perf_event__repipe_tracing_data;
 	}
 
 	session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops);
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 3c1cdcf..7f618f4 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -275,9 +275,8 @@ static void process_free_event(void *data,
 	s_alloc->alloc_cpu = -1;
 }
 
-static void
-process_raw_event(event_t *raw_event __used, void *data,
-		  int cpu, u64 timestamp, struct thread *thread)
+static void process_raw_event(union perf_event *raw_event __used, void *data,
+			      int cpu, u64 timestamp, struct thread *thread)
 {
 	struct event *event;
 	int type;
@@ -304,7 +303,8 @@ process_raw_event(event_t *raw_event __used, void *data,
 	}
 }
 
-static int process_sample_event(event_t *event, struct perf_sample *sample,
+static int process_sample_event(union perf_event *event,
+				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct thread *thread = perf_session__findnew(session, event->ip.pid);
@@ -325,7 +325,7 @@ static int process_sample_event(event_t *event, struct perf_sample *sample,
 
 static struct perf_event_ops event_ops = {
 	.sample			= process_sample_event,
-	.comm			= event__process_comm,
+	.comm			= perf_event__process_comm,
 	.ordered_samples	= true,
 };
 
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index c3f5127..e00d938 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -834,14 +834,14 @@ static void dump_info(void)
 		die("Unknown type of information\n");
 }
 
-static int process_sample_event(event_t *self, struct perf_sample *sample,
+static int process_sample_event(union perf_event *event, struct perf_sample *sample,
 				struct perf_session *s)
 {
 	struct thread *thread = perf_session__findnew(s, sample->tid);
 
 	if (thread == NULL) {
 		pr_debug("problem processing %d event, skipping it.\n",
-			self->header.type);
+			event->header.type);
 		return -1;
 	}
 
@@ -852,7 +852,7 @@ static int process_sample_event(event_t *self, struct perf_sample *sample,
 
 static struct perf_event_ops eops = {
 	.sample			= process_sample_event,
-	.comm			= event__process_comm,
+	.comm			= perf_event__process_comm,
 	.ordered_samples	= true,
 };
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 5d3e4b3..edc3555 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -100,7 +100,7 @@ static void write_output(void *buf, size_t size)
 	}
 }
 
-static int process_synthesized_event(event_t *event,
+static int process_synthesized_event(union perf_event *event,
 				     struct perf_sample *sample __used,
 				     struct perf_session *self __used)
 {
@@ -404,7 +404,7 @@ static void atexit_header(void)
 	}
 }
 
-static void event__synthesize_guest_os(struct machine *machine, void *data)
+static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
 {
 	int err;
 	struct perf_session *psession = data;
@@ -420,8 +420,8 @@ static void event__synthesize_guest_os(struct machine *machine, void *data)
 	 *method is used to avoid symbol missing when the first addr is
 	 *in module instead of in guest kernel.
 	 */
-	err = event__synthesize_modules(process_synthesized_event,
-					psession, machine);
+	err = perf_event__synthesize_modules(process_synthesized_event,
+					     psession, machine);
 	if (err < 0)
 		pr_err("Couldn't record guest kernel [%d]'s reference"
 		       " relocation symbol.\n", machine->pid);
@@ -430,11 +430,12 @@ static void event__synthesize_guest_os(struct machine *machine, void *data)
 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
 	 * have no _text sometimes.
 	 */
-	err = event__synthesize_kernel_mmap(process_synthesized_event,
-					    psession, machine, "_text");
+	err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
+						 psession, machine, "_text");
 	if (err < 0)
-		err = event__synthesize_kernel_mmap(process_synthesized_event,
-						    psession, machine, "_stext");
+		err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
+							 psession, machine,
+							 "_stext");
 	if (err < 0)
 		pr_err("Couldn't record guest kernel [%d]'s reference"
 		       " relocation symbol.\n", machine->pid);
@@ -617,16 +618,16 @@ static int __cmd_record(int argc, const char **argv)
 	perf_session__set_sample_id_all(session, sample_id_all_avail);
 
 	if (pipe_output) {
-		err = event__synthesize_attrs(&session->header,
-					      process_synthesized_event,
-					      session);
+		err = perf_event__synthesize_attrs(&session->header,
+						   process_synthesized_event,
+						   session);
 		if (err < 0) {
 			pr_err("Couldn't synthesize attrs.\n");
 			return err;
 		}
 
-		err = event__synthesize_event_types(process_synthesized_event,
-						    session);
+		err = perf_event__synthesize_event_types(process_synthesized_event,
+							 session);
 		if (err < 0) {
 			pr_err("Couldn't synthesize event_types.\n");
 			return err;
@@ -641,9 +642,9 @@ static int __cmd_record(int argc, const char **argv)
 			 * return this more properly and also
 			 * propagate errors that now are calling die()
 			 */
-			err = event__synthesize_tracing_data(output, evsel_list,
-							     process_synthesized_event,
-							     session);
+			err = perf_event__synthesize_tracing_data(output, evsel_list,
+								  process_synthesized_event,
+								  session);
 			if (err <= 0) {
 				pr_err("Couldn't record tracing data.\n");
 				return err;
@@ -658,31 +659,34 @@ static int __cmd_record(int argc, const char **argv)
 		return -1;
 	}
 
-	err = event__synthesize_kernel_mmap(process_synthesized_event,
-					    session, machine, "_text");
+	err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
+						 session, machine, "_text");
 	if (err < 0)
-		err = event__synthesize_kernel_mmap(process_synthesized_event,
-						    session, machine, "_stext");
+		err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
+							 session, machine, "_stext");
 	if (err < 0)
 		pr_err("Couldn't record kernel reference relocation symbol\n"
 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
 		       "Check /proc/kallsyms permission or run as root.\n");
 
-	err = event__synthesize_modules(process_synthesized_event,
-					session, machine);
+	err = perf_event__synthesize_modules(process_synthesized_event,
+					     session, machine);
 	if (err < 0)
 		pr_err("Couldn't record kernel module information.\n"
 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
 		       "Check /proc/modules permission or run as root.\n");
 
 	if (perf_guest)
-		perf_session__process_machines(session, event__synthesize_guest_os);
+		perf_session__process_machines(session,
+					       perf_event__synthesize_guest_os);
 
 	if (!system_wide)
-		event__synthesize_thread(target_tid, process_synthesized_event,
-					 session);
+		perf_event__synthesize_thread(target_tid,
+					      process_synthesized_event,
+					      session);
 	else
-		event__synthesize_threads(process_synthesized_event, session);
+		perf_event__synthesize_threads(process_synthesized_event,
+					       session);
 
 	if (realtime_prio) {
 		struct sched_param param;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index bbbadcc..a6a4e54 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -149,13 +149,14 @@ static int add_event_total(struct perf_session *session,
 	return 0;
 }
 
-static int process_sample_event(event_t *event, struct perf_sample *sample,
+static int process_sample_event(union perf_event *event,
+				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct addr_location al;
 	struct perf_event_attr *attr;
 
-	if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
+	if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
 		fprintf(stderr, "problem processing %d event, skipping it.\n",
 			event->header.type);
 		return -1;
@@ -179,7 +180,8 @@ static int process_sample_event(event_t *event, struct perf_sample *sample,
 	return 0;
 }
 
-static int process_read_event(event_t *event, struct perf_sample *sample __used,
+static int process_read_event(union perf_event *event,
+			      struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
 	struct perf_event_attr *attr;
@@ -232,17 +234,17 @@ static int perf_session__setup_sample_type(struct perf_session *self)
 }
 
 static struct perf_event_ops event_ops = {
-	.sample	= process_sample_event,
-	.mmap	= event__process_mmap,
-	.comm	= event__process_comm,
-	.exit	= event__process_task,
-	.fork	= event__process_task,
-	.lost	= event__process_lost,
-	.read	= process_read_event,
-	.attr	= event__process_attr,
-	.event_type = event__process_event_type,
-	.tracing_data = event__process_tracing_data,
-	.build_id = event__process_build_id,
+	.sample		 = process_sample_event,
+	.mmap		 = perf_event__process_mmap,
+	.comm		 = perf_event__process_comm,
+	.exit		 = perf_event__process_task,
+	.fork		 = perf_event__process_task,
+	.lost		 = perf_event__process_lost,
+	.read		 = process_read_event,
+	.attr		 = perf_event__process_attr,
+	.event_type	 = perf_event__process_event_type,
+	.tracing_data	 = perf_event__process_tracing_data,
+	.build_id	 = perf_event__process_build_id,
 	.ordered_samples = true,
 	.ordering_requires_timestamps = true,
 };
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ff993c8..ae26211 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1580,9 +1580,9 @@ process_sched_migrate_task_event(void *data, struct perf_session *session,
 						 event, cpu, timestamp, thread);
 }
 
-static void
-process_raw_event(event_t *raw_event __used, struct perf_session *session,
-		  void *data, int cpu, u64 timestamp, struct thread *thread)
+static void process_raw_event(union perf_event *raw_event __used,
+			      struct perf_session *session, void *data, int cpu,
+			      u64 timestamp, struct thread *thread)
 {
 	struct event *event;
 	int type;
@@ -1607,7 +1607,8 @@ process_raw_event(event_t *raw_event __used, struct perf_session *session,
 		process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
 }
 
-static int process_sample_event(event_t *event, struct perf_sample *sample,
+static int process_sample_event(union perf_event *event,
+				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct thread *thread;
@@ -1635,9 +1636,9 @@ static int process_sample_event(event_t *event, struct perf_sample *sample,
 
 static struct perf_event_ops event_ops = {
 	.sample			= process_sample_event,
-	.comm			= event__process_comm,
-	.lost			= event__process_lost,
-	.fork			= event__process_task,
+	.comm			= perf_event__process_comm,
+	.lost			= perf_event__process_lost,
+	.fork			= perf_event__process_task,
 	.ordered_samples	= true,
 };
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 5c4c809..5f40df6 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -63,7 +63,8 @@ static int cleanup_scripting(void)
 
 static char const		*input_name = "perf.data";
 
-static int process_sample_event(event_t *event, struct perf_sample *sample,
+static int process_sample_event(union perf_event *event,
+				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct thread *thread = perf_session__findnew(session, event->ip.pid);
@@ -100,14 +101,14 @@ static int process_sample_event(event_t *event, struct perf_sample *sample,
 }
 
 static struct perf_event_ops event_ops = {
-	.sample	= process_sample_event,
-	.comm	= event__process_comm,
-	.attr	= event__process_attr,
-	.event_type = event__process_event_type,
-	.tracing_data = event__process_tracing_data,
-	.build_id = event__process_build_id,
-	.ordering_requires_timestamps = true,
+	.sample		 = process_sample_event,
+	.comm		 = perf_event__process_comm,
+	.attr		 = perf_event__process_attr,
+	.event_type	 = perf_event__process_event_type,
+	.tracing_data	 = perf_event__process_tracing_data,
+	.build_id	 = perf_event__process_build_id,
 	.ordered_samples = true,
+	.ordering_requires_timestamps = true,
 };
 
 extern volatile int session_done;
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index df62433..845b9bd 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -454,7 +454,7 @@ out_thread_map_delete:
 static int test__basic_mmap(void)
 {
 	int err = -1;
-	event_t *event;
+	union perf_event *event;
 	struct thread_map *threads;
 	struct cpu_map *cpus;
 	struct perf_evlist *evlist;
@@ -554,11 +554,11 @@ static int test__basic_mmap(void)
 
 		if (event->header.type != PERF_RECORD_SAMPLE) {
 			pr_debug("unexpected %s event\n",
-				 event__get_event_name(event->header.type));
+				 perf_event__name(event->header.type));
 			goto out_munmap;
 		}
 
-		event__parse_sample(event, attr.sample_type, false, &sample);
+		perf_event__parse_sample(event, attr.sample_type, false, &sample);
 		evsel = perf_evlist__id2evsel(evlist, sample.id);
 		if (evsel == NULL) {
 			pr_debug("event with id %" PRIu64
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 01cf0c3..0801275 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -276,21 +276,24 @@ static int cpus_cstate_state[MAX_CPUS];
 static u64 cpus_pstate_start_times[MAX_CPUS];
 static u64 cpus_pstate_state[MAX_CPUS];
 
-static int process_comm_event(event_t *event, struct perf_sample *sample __used,
+static int process_comm_event(union perf_event *event,
+			      struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
 	pid_set_comm(event->comm.tid, event->comm.comm);
 	return 0;
 }
 
-static int process_fork_event(event_t *event, struct perf_sample *sample __used,
+static int process_fork_event(union perf_event *event,
+			      struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
 	pid_fork(event->fork.pid, event->fork.ppid, event->fork.time);
 	return 0;
 }
 
-static int process_exit_event(event_t *event, struct perf_sample *sample __used,
+static int process_exit_event(union perf_event *event,
+			      struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
 	pid_exit(event->fork.pid, event->fork.time);
@@ -486,7 +489,7 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te)
 }
 
 
-static int process_sample_event(event_t *event __used,
+static int process_sample_event(union perf_event *event __used,
 				struct perf_sample *sample,
 				struct perf_session *session)
 {
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index d923127..2f4d1f2 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -401,7 +401,7 @@ static void show_details(struct sym_entry *syme)
 }
 
 /*
- * Symbols will be added here in event__process_sample and will get out
+ * Symbols will be added here in perf_event__process_sample and will get out
  * after decayed.
  */
 static LIST_HEAD(active_symbols);
@@ -996,15 +996,15 @@ static int symbol_filter(struct map *map, struct symbol *sym)
 	return 0;
 }
 
-static void event__process_sample(const event_t *self,
-				  struct perf_sample *sample,
-				  struct perf_session *session)
+static void perf_event__process_sample(const union perf_event *event,
+				       struct perf_sample *sample,
+				       struct perf_session *session)
 {
-	u64 ip = self->ip.ip;
+	u64 ip = event->ip.ip;
 	struct sym_entry *syme;
 	struct addr_location al;
 	struct machine *machine;
-	u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+	u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
 	++samples;
 
@@ -1023,7 +1023,7 @@ static void event__process_sample(const event_t *self,
 		break;
 	case PERF_RECORD_MISC_GUEST_KERNEL:
 		++guest_kernel_samples;
-		machine = perf_session__find_machine(session, self->ip.pid);
+		machine = perf_session__find_machine(session, event->ip.pid);
 		break;
 	case PERF_RECORD_MISC_GUEST_USER:
 		++guest_us_samples;
@@ -1038,15 +1038,15 @@ static void event__process_sample(const event_t *self,
 
 	if (!machine && perf_guest) {
 		pr_err("Can't find guest [%d]'s kernel information\n",
-			self->ip.pid);
+			event->ip.pid);
 		return;
 	}
 
-	if (self->header.misc & PERF_RECORD_MISC_EXACT_IP)
+	if (event->header.misc & PERF_RECORD_MISC_EXACT_IP)
 		exact_samples++;
 
-	if (event__preprocess_sample(self, session, &al, sample,
-				     symbol_filter) < 0 ||
+	if (perf_event__preprocess_sample(event, session, &al, sample,
+					  symbol_filter) < 0 ||
 	    al.filtered)
 		return;
 
@@ -1108,15 +1108,15 @@ static void event__process_sample(const event_t *self,
 static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
 {
 	struct perf_sample sample;
-	event_t *event;
+	union perf_event *event;
 
 	while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) {
 		perf_session__parse_sample(self, event, &sample);
 
 		if (event->header.type == PERF_RECORD_SAMPLE)
-			event__process_sample(event, &sample, self);
+			perf_event__process_sample(event, &sample, self);
 		else
-			event__process(event, &sample, self);
+			perf_event__process(event, &sample, self);
 	}
 }
 
@@ -1199,9 +1199,10 @@ static int __cmd_top(void)
 		return -ENOMEM;
 
 	if (target_tid != -1)
-		event__synthesize_thread(target_tid, event__process, session);
+		perf_event__synthesize_thread(target_tid, perf_event__process,
+					      session);
 	else
-		event__synthesize_threads(event__process, session);
+		perf_event__synthesize_threads(perf_event__process, session);
 
 	start_counters(evsel_list);
 	first = list_entry(evsel_list->entries.next, struct perf_evsel, node);
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index b184a7f..31f934a 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -14,7 +14,7 @@
 #include <linux/kernel.h>
 #include "debug.h"
 
-static int build_id__mark_dso_hit(event_t *event,
+static int build_id__mark_dso_hit(union perf_event *event,
 				  struct perf_sample *sample __used,
 				  struct perf_session *session)
 {
@@ -37,13 +37,14 @@ static int build_id__mark_dso_hit(event_t *event,
 	return 0;
 }
 
-static int event__exit_del_thread(event_t *self, struct perf_sample *sample __used,
-				  struct perf_session *session)
+static int perf_event__exit_del_thread(union perf_event *event,
+				       struct perf_sample *sample __used,
+				       struct perf_session *session)
 {
-	struct thread *thread = perf_session__findnew(session, self->fork.tid);
+	struct thread *thread = perf_session__findnew(session, event->fork.tid);
 
-	dump_printf("(%d:%d):(%d:%d)\n", self->fork.pid, self->fork.tid,
-		    self->fork.ppid, self->fork.ptid);
+	dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
+		    event->fork.ppid, event->fork.ptid);
 
 	if (thread) {
 		rb_erase(&thread->rb_node, &session->threads);
@@ -56,9 +57,9 @@ static int event__exit_del_thread(event_t *self, struct perf_sample *sample __us
 
 struct perf_event_ops build_id__mark_dso_hit_ops = {
 	.sample	= build_id__mark_dso_hit,
-	.mmap	= event__process_mmap,
-	.fork	= event__process_task,
-	.exit	= event__exit_del_thread,
+	.mmap	= perf_event__process_mmap,
+	.fork	= perf_event__process_task,
+	.exit	= perf_event__exit_del_thread,
 };
 
 char *dso__build_id_filename(struct dso *self, char *bf, size_t size)
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index f8c66d1..9f7106a 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -18,7 +18,8 @@
 #include "util.h"
 #include "callchain.h"
 
-bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event)
+bool ip_callchain__valid(struct ip_callchain *chain,
+			 const union perf_event *event)
 {
 	unsigned int chain_size = event->header.size;
 	chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 6713725..1a79df9 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -95,8 +95,8 @@ int callchain_append(struct callchain_root *root,
 int callchain_merge(struct callchain_cursor *cursor,
 		    struct callchain_root *dst, struct callchain_root *src);
 
-bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event);
-
+bool ip_callchain__valid(struct ip_callchain *chain,
+			 const union perf_event *event);
 /*
  * Initialize a cursor before adding entries inside, but keep
  * the previously allocated entries as a cache.
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 01bbe8e..d4536a9 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -57,7 +57,7 @@ void ui__warning(const char *format, ...)
 }
 #endif
 
-void trace_event(event_t *event)
+void trace_event(union perf_event *event)
 {
 	unsigned char *raw_event = (void *)event;
 	const char *color = PERF_COLOR_BLUE;
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index ca35fd6..93516cf4 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -9,7 +9,7 @@ extern int verbose;
 extern bool quiet, dump_trace;
 
 int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
-void trace_event(event_t *event);
+void trace_event(union perf_event *event);
 
 struct ui_progress;
 
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 5c886fb..731265f 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -7,7 +7,7 @@
 #include "strlist.h"
 #include "thread.h"
 
-static const char *event__name[] = {
+static const char *perf_event__names[] = {
 	[0]			 = "TOTAL",
 	[PERF_RECORD_MMAP]	 = "MMAP",
 	[PERF_RECORD_LOST]	 = "LOST",
@@ -25,13 +25,13 @@ static const char *event__name[] = {
 	[PERF_RECORD_FINISHED_ROUND]	 = "FINISHED_ROUND",
 };
 
-const char *event__get_event_name(unsigned int id)
+const char *perf_event__name(unsigned int id)
 {
-	if (id >= ARRAY_SIZE(event__name))
+	if (id >= ARRAY_SIZE(perf_event__names))
 		return "INVALID";
-	if (!event__name[id])
+	if (!perf_event__names[id])
 		return "UNKNOWN";
-	return event__name[id];
+	return perf_event__names[id];
 }
 
 static struct perf_sample synth_sample = {
@@ -43,9 +43,9 @@ static struct perf_sample synth_sample = {
 	.period	   = 1,
 };
 
-static pid_t event__synthesize_comm(event_t *event, pid_t pid, int full,
-				    event__handler_t process,
-				    struct perf_session *session)
+static pid_t perf_event__synthesize_comm(union perf_event *event, pid_t pid,
+					 int full, perf_event__handler_t process,
+					 struct perf_session *session)
 {
 	char filename[PATH_MAX];
 	char bf[BUFSIZ];
@@ -126,9 +126,10 @@ out:
 	return tgid;
 }
 
-static int event__synthesize_mmap_events(event_t *event, pid_t pid, pid_t tgid,
-					 event__handler_t process,
-					 struct perf_session *session)
+static int perf_event__synthesize_mmap_events(union perf_event *event,
+					      pid_t pid, pid_t tgid,
+					      perf_event__handler_t process,
+					      struct perf_session *session)
 {
 	char filename[PATH_MAX];
 	FILE *fp;
@@ -199,14 +200,14 @@ static int event__synthesize_mmap_events(event_t *event, pid_t pid, pid_t tgid,
 	return 0;
 }
 
-int event__synthesize_modules(event__handler_t process,
-			      struct perf_session *session,
-			      struct machine *machine)
+int perf_event__synthesize_modules(perf_event__handler_t process,
+				   struct perf_session *session,
+				   struct machine *machine)
 {
 	struct rb_node *nd;
 	struct map_groups *kmaps = &machine->kmaps;
-	event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size);
-
+	union perf_event *event = zalloc((sizeof(event->mmap) +
+					  session->id_hdr_size));
 	if (event == NULL) {
 		pr_debug("Not enough memory synthesizing mmap event "
 			 "for kernel modules\n");
@@ -251,22 +252,23 @@ int event__synthesize_modules(event__handler_t process,
 	return 0;
 }
 
-static int __event__synthesize_thread(event_t *comm_event, event_t *mmap_event,
-				      pid_t pid, event__handler_t process,
+static int __event__synthesize_thread(union perf_event *comm_event,
+				      union perf_event *mmap_event,
+				      pid_t pid, perf_event__handler_t process,
 				      struct perf_session *session)
 {
-	pid_t tgid = event__synthesize_comm(comm_event, pid, 1, process,
+	pid_t tgid = perf_event__synthesize_comm(comm_event, pid, 1, process,
 					    session);
 	if (tgid == -1)
 		return -1;
-	return event__synthesize_mmap_events(mmap_event, pid, tgid,
+	return perf_event__synthesize_mmap_events(mmap_event, pid, tgid,
 					     process, session);
 }
 
-int event__synthesize_thread(pid_t pid, event__handler_t process,
-			     struct perf_session *session)
+int perf_event__synthesize_thread(pid_t pid, perf_event__handler_t process,
+				  struct perf_session *session)
 {
-	event_t *comm_event, *mmap_event;
+	union perf_event *comm_event, *mmap_event;
 	int err = -1;
 
 	comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
@@ -286,12 +288,12 @@ out:
 	return err;
 }
 
-int event__synthesize_threads(event__handler_t process,
-			      struct perf_session *session)
+int perf_event__synthesize_threads(perf_event__handler_t process,
+				   struct perf_session *session)
 {
 	DIR *proc;
 	struct dirent dirent, *next;
-	event_t *comm_event, *mmap_event;
+	union perf_event *comm_event, *mmap_event;
 	int err = -1;
 
 	comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
@@ -349,10 +351,10 @@ static int find_symbol_cb(void *arg, const char *name, char type,
 	return 1;
 }
 
-int event__synthesize_kernel_mmap(event__handler_t process,
-				  struct perf_session *session,
-				  struct machine *machine,
-				  const char *symbol_name)
+int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
+				       struct perf_session *session,
+				       struct machine *machine,
+				       const char *symbol_name)
 {
 	size_t size;
 	const char *filename, *mmap_name;
@@ -366,8 +368,8 @@ int event__synthesize_kernel_mmap(event__handler_t process,
 	 * kernels.
 	 */
 	struct process_symbol_args args = { .name = symbol_name, };
-	event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size);
-
+	union perf_event *event = zalloc((sizeof(event->mmap) +
+					  session->id_hdr_size));
 	if (event == NULL) {
 		pr_debug("Not enough memory synthesizing mmap event "
 			 "for kernel modules\n");
@@ -440,14 +442,15 @@ static int thread__set_comm_adjust(struct thread *self, const char *comm,
 	return 0;
 }
 
-int event__process_comm(event_t *self, struct perf_sample *sample __used,
-			struct perf_session *session)
+int perf_event__process_comm(union perf_event *event,
+			     struct perf_sample *sample __used,
+			     struct perf_session *session)
 {
-	struct thread *thread = perf_session__findnew(session, self->comm.tid);
+	struct thread *thread = perf_session__findnew(session, event->comm.tid);
 
-	dump_printf(": %s:%d\n", self->comm.comm, self->comm.tid);
+	dump_printf(": %s:%d\n", event->comm.comm, event->comm.tid);
 
-	if (thread == NULL || thread__set_comm_adjust(thread, self->comm.comm,
+	if (thread == NULL || thread__set_comm_adjust(thread, event->comm.comm,
 						      &session->hists)) {
 		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
 		return -1;
@@ -456,19 +459,21 @@ int event__process_comm(event_t *self, struct perf_sample *sample __used,
 	return 0;
 }
 
-int event__process_lost(event_t *self, struct perf_sample *sample __used,
-			struct perf_session *session)
+int perf_event__process_lost(union perf_event *event,
+			     struct perf_sample *sample __used,
+			     struct perf_session *session)
 {
 	dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n",
-		    self->lost.id, self->lost.lost);
-	session->hists.stats.total_lost += self->lost.lost;
+		    event->lost.id, event->lost.lost);
+	session->hists.stats.total_lost += event->lost.lost;
 	return 0;
 }
 
-static void event_set_kernel_mmap_len(struct map **maps, event_t *self)
+static void perf_event__set_kernel_mmap_len(union perf_event *event,
+					    struct map **maps)
 {
-	maps[MAP__FUNCTION]->start = self->mmap.start;
-	maps[MAP__FUNCTION]->end   = self->mmap.start + self->mmap.len;
+	maps[MAP__FUNCTION]->start = event->mmap.start;
+	maps[MAP__FUNCTION]->end   = event->mmap.start + event->mmap.len;
 	/*
 	 * Be a bit paranoid here, some perf.data file came with
 	 * a zero sized synthesized MMAP event for the kernel.
@@ -477,8 +482,8 @@ static void event_set_kernel_mmap_len(struct map **maps, event_t *self)
 		maps[MAP__FUNCTION]->end = ~0ULL;
 }
 
-static int event__process_kernel_mmap(event_t *self,
-			struct perf_session *session)
+static int perf_event__process_kernel_mmap(union perf_event *event,
+					   struct perf_session *session)
 {
 	struct map *map;
 	char kmmap_prefix[PATH_MAX];
@@ -486,9 +491,9 @@ static int event__process_kernel_mmap(event_t *self,
 	enum dso_kernel_type kernel_type;
 	bool is_kernel_mmap;
 
-	machine = perf_session__findnew_machine(session, self->mmap.pid);
+	machine = perf_session__findnew_machine(session, event->mmap.pid);
 	if (!machine) {
-		pr_err("Can't find id %d's machine\n", self->mmap.pid);
+		pr_err("Can't find id %d's machine\n", event->mmap.pid);
 		goto out_problem;
 	}
 
@@ -498,17 +503,17 @@ static int event__process_kernel_mmap(event_t *self,
 	else
 		kernel_type = DSO_TYPE_GUEST_KERNEL;
 
-	is_kernel_mmap = memcmp(self->mmap.filename,
+	is_kernel_mmap = memcmp(event->mmap.filename,
 				kmmap_prefix,
 				strlen(kmmap_prefix)) == 0;
-	if (self->mmap.filename[0] == '/' ||
-	    (!is_kernel_mmap && self->mmap.filename[0] == '[')) {
+	if (event->mmap.filename[0] == '/' ||
+	    (!is_kernel_mmap && event->mmap.filename[0] == '[')) {
 
 		char short_module_name[1024];
 		char *name, *dot;
 
-		if (self->mmap.filename[0] == '/') {
-			name = strrchr(self->mmap.filename, '/');
+		if (event->mmap.filename[0] == '/') {
+			name = strrchr(event->mmap.filename, '/');
 			if (name == NULL)
 				goto out_problem;
 
@@ -520,10 +525,10 @@ static int event__process_kernel_mmap(event_t *self,
 					"[%.*s]", (int)(dot - name), name);
 			strxfrchar(short_module_name, '-', '_');
 		} else
-			strcpy(short_module_name, self->mmap.filename);
+			strcpy(short_module_name, event->mmap.filename);
 
-		map = machine__new_module(machine, self->mmap.start,
-					  self->mmap.filename);
+		map = machine__new_module(machine, event->mmap.start,
+					  event->mmap.filename);
 		if (map == NULL)
 			goto out_problem;
 
@@ -533,9 +538,9 @@ static int event__process_kernel_mmap(event_t *self,
 
 		map->dso->short_name = name;
 		map->dso->sname_alloc = 1;
-		map->end = map->start + self->mmap.len;
+		map->end = map->start + event->mmap.len;
 	} else if (is_kernel_mmap) {
-		const char *symbol_name = (self->mmap.filename +
+		const char *symbol_name = (event->mmap.filename +
 				strlen(kmmap_prefix));
 		/*
 		 * Should be there already, from the build-id table in
@@ -550,10 +555,10 @@ static int event__process_kernel_mmap(event_t *self,
 		if (__machine__create_kernel_maps(machine, kernel) < 0)
 			goto out_problem;
 
-		event_set_kernel_mmap_len(machine->vmlinux_maps, self);
+		perf_event__set_kernel_mmap_len(event, machine->vmlinux_maps);
 		perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps,
 							 symbol_name,
-							 self->mmap.pgoff);
+							 event->mmap.pgoff);
 		if (machine__is_default_guest(machine)) {
 			/*
 			 * preload dso of guest kernel and modules
@@ -567,22 +572,23 @@ out_problem:
 	return -1;
 }
 
-int event__process_mmap(event_t *self, struct perf_sample *sample __used,
-			struct perf_session *session)
+int perf_event__process_mmap(union perf_event *event,
+			     struct perf_sample *sample __used,
+			     struct perf_session *session)
 {
 	struct machine *machine;
 	struct thread *thread;
 	struct map *map;
-	u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 	int ret = 0;
 
 	dump_printf(" %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n",
-			self->mmap.pid, self->mmap.tid, self->mmap.start,
-			self->mmap.len, self->mmap.pgoff, self->mmap.filename);
+			event->mmap.pid, event->mmap.tid, event->mmap.start,
+			event->mmap.len, event->mmap.pgoff, event->mmap.filename);
 
 	if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL ||
 	    cpumode == PERF_RECORD_MISC_KERNEL) {
-		ret = event__process_kernel_mmap(self, session);
+		ret = perf_event__process_kernel_mmap(event, session);
 		if (ret < 0)
 			goto out_problem;
 		return 0;
@@ -591,12 +597,12 @@ int event__process_mmap(event_t *self, struct perf_sample *sample __used,
 	machine = perf_session__find_host_machine(session);
 	if (machine == NULL)
 		goto out_problem;
-	thread = perf_session__findnew(session, self->mmap.pid);
+	thread = perf_session__findnew(session, event->mmap.pid);
 	if (thread == NULL)
 		goto out_problem;
-	map = map__new(&machine->user_dsos, self->mmap.start,
-			self->mmap.len, self->mmap.pgoff,
-			self->mmap.pid, self->mmap.filename,
+	map = map__new(&machine->user_dsos, event->mmap.start,
+			event->mmap.len, event->mmap.pgoff,
+			event->mmap.pid, event->mmap.filename,
 			MAP__FUNCTION);
 	if (map == NULL)
 		goto out_problem;
@@ -609,16 +615,17 @@ out_problem:
 	return 0;
 }
 
-int event__process_task(event_t *self, struct perf_sample *sample __used,
-			struct perf_session *session)
+int perf_event__process_task(union perf_event *event,
+			     struct perf_sample *sample __used,
+			     struct perf_session *session)
 {
-	struct thread *thread = perf_session__findnew(session, self->fork.tid);
-	struct thread *parent = perf_session__findnew(session, self->fork.ptid);
+	struct thread *thread = perf_session__findnew(session, event->fork.tid);
+	struct thread *parent = perf_session__findnew(session, event->fork.ptid);
 
-	dump_printf("(%d:%d):(%d:%d)\n", self->fork.pid, self->fork.tid,
-		    self->fork.ppid, self->fork.ptid);
+	dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
+		    event->fork.ppid, event->fork.ptid);
 
-	if (self->header.type == PERF_RECORD_EXIT) {
+	if (event->header.type == PERF_RECORD_EXIT) {
 		perf_session__remove_thread(session, thread);
 		return 0;
 	}
@@ -632,22 +639,22 @@ int event__process_task(event_t *self, struct perf_sample *sample __used,
 	return 0;
 }
 
-int event__process(event_t *event, struct perf_sample *sample,
-		   struct perf_session *session)
+int perf_event__process(union perf_event *event, struct perf_sample *sample,
+			struct perf_session *session)
 {
 	switch (event->header.type) {
 	case PERF_RECORD_COMM:
-		event__process_comm(event, sample, session);
+		perf_event__process_comm(event, sample, session);
 		break;
 	case PERF_RECORD_MMAP:
-		event__process_mmap(event, sample, session);
+		perf_event__process_mmap(event, sample, session);
 		break;
 	case PERF_RECORD_FORK:
 	case PERF_RECORD_EXIT:
-		event__process_task(event, sample, session);
+		perf_event__process_task(event, sample, session);
 		break;
 	case PERF_RECORD_LOST:
-		event__process_lost(event, sample, session);
+		perf_event__process_lost(event, sample, session);
 	default:
 		break;
 	}
@@ -756,12 +763,14 @@ static void dso__calc_col_width(struct dso *self, struct hists *hists)
 	self->slen_calculated = 1;
 }
 
-int event__preprocess_sample(const event_t *self, struct perf_session *session,
-			     struct addr_location *al, struct perf_sample *sample,
-			     symbol_filter_t filter)
+int perf_event__preprocess_sample(const union perf_event *event,
+				  struct perf_session *session,
+				  struct addr_location *al,
+				  struct perf_sample *sample,
+				  symbol_filter_t filter)
 {
-	u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
-	struct thread *thread = perf_session__findnew(session, self->ip.pid);
+	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+	struct thread *thread = perf_session__findnew(session, event->ip.pid);
 
 	if (thread == NULL)
 		return -1;
@@ -783,7 +792,7 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
 		machine__create_kernel_maps(&session->host_machine);
 
 	thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
-			      self->ip.pid, self->ip.ip, al);
+			      event->ip.pid, event->ip.ip, al);
 	dump_printf(" ...... dso: %s\n",
 		    al->map ? al->map->dso->long_name :
 			al->level == 'H' ? "[hypervisor]" : "<not found>");
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 84fd71f..eecb422 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -117,7 +117,7 @@ struct tracing_data_event {
 	u32 size;
 };
 
-typedef union event_union {
+union perf_event {
 	struct perf_event_header	header;
 	struct ip_event			ip;
 	struct mmap_event		mmap;
@@ -130,49 +130,52 @@ typedef union event_union {
 	struct event_type_event		event_type;
 	struct tracing_data_event	tracing_data;
 	struct build_id_event		build_id;
-} event_t;
+};
 
-void event__print_totals(void);
+void perf_event__print_totals(void);
 
 struct perf_session;
 
-typedef int (*event__handler_synth_t)(event_t *event, 
+typedef int (*perf_event__handler_synth_t)(union perf_event *event, 
+					   struct perf_session *session);
+typedef int (*perf_event__handler_t)(union perf_event *event,
+				     struct perf_sample *sample,
 				      struct perf_session *session);
-typedef int (*event__handler_t)(event_t *event, struct perf_sample *sample,
-				struct perf_session *session);
 
-int event__synthesize_thread(pid_t pid, event__handler_t process,
+int perf_event__synthesize_thread(pid_t pid, perf_event__handler_t process,
+				  struct perf_session *session);
+int perf_event__synthesize_threads(perf_event__handler_t process,
+				   struct perf_session *session);
+int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
+				       struct perf_session *session,
+				       struct machine *machine,
+				       const char *symbol_name);
+
+int perf_event__synthesize_modules(perf_event__handler_t process,
+				   struct perf_session *session,
+				   struct machine *machine);
+
+int perf_event__process_comm(union perf_event *event, struct perf_sample *sample,
 			     struct perf_session *session);
-int event__synthesize_threads(event__handler_t process,
-			      struct perf_session *session);
-int event__synthesize_kernel_mmap(event__handler_t process,
-				struct perf_session *session,
-				struct machine *machine,
-				const char *symbol_name);
-
-int event__synthesize_modules(event__handler_t process,
-			      struct perf_session *session,
-			      struct machine *machine);
-
-int event__process_comm(event_t *event, struct perf_sample *sample,
-			struct perf_session *session);
-int event__process_lost(event_t *event, struct perf_sample *sample,
-			struct perf_session *session);
-int event__process_mmap(event_t *event, struct perf_sample *sample,
-			struct perf_session *session);
-int event__process_task(event_t *event, struct perf_sample *sample,
+int perf_event__process_lost(union perf_event *event, struct perf_sample *sample,
+			     struct perf_session *session);
+int perf_event__process_mmap(union perf_event *event, struct perf_sample *sample,
+			     struct perf_session *session);
+int perf_event__process_task(union perf_event *event, struct perf_sample *sample,
+			     struct perf_session *session);
+int perf_event__process(union perf_event *event, struct perf_sample *sample,
 			struct perf_session *session);
-int event__process(event_t *event, struct perf_sample *sample,
-		   struct perf_session *session);
 
 struct addr_location;
-int event__preprocess_sample(const event_t *self, struct perf_session *session,
-			     struct addr_location *al, struct perf_sample *sample,
-			     symbol_filter_t filter);
+int perf_event__preprocess_sample(const union perf_event *self,
+				  struct perf_session *session,
+				  struct addr_location *al,
+				  struct perf_sample *sample,
+				  symbol_filter_t filter);
 
-const char *event__get_event_name(unsigned int id);
+const char *perf_event__name(unsigned int id);
 
-int event__parse_sample(const event_t *event, u64 type, bool sample_id_all,
-			struct perf_sample *sample);
+int perf_event__parse_sample(const union perf_event *event, u64 type,
+			     bool sample_id_all, struct perf_sample *sample);
 
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index b498eec..917fc18 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -107,7 +107,7 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
 	return NULL;
 }
 
-event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
+union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
 {
 	/* XXX Move this to perf.c, making it generally available */
 	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
@@ -115,7 +115,7 @@ event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
 	unsigned int head = perf_mmap__read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
-	event_t *event = NULL;
+	union perf_event *event = NULL;
 
 	if (evlist->overwrite) {
 		/*
@@ -140,7 +140,7 @@ event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
 	if (old != head) {
 		size_t size;
 
-		event = (event_t *)&data[old & md->mask];
+		event = (union perf_event *)&data[old & md->mask];
 		size = event->header.size;
 
 		/*
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 2706ae4..022ae40 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -17,7 +17,7 @@ struct perf_evlist {
 	int		 nr_fds;
 	int		 mmap_len;
 	bool		 overwrite;
-	event_t		 event_copy;
+	union perf_event event_copy;
 	struct perf_mmap *mmap;
 	struct pollfd	 *pollfd;
 };
@@ -37,6 +37,6 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
 struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
 
-event_t *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu);
+union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu);
 
 #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index a134885..fddeb08 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -367,8 +367,8 @@ out_unmap:
 	return -1;
 }
 
-static int event__parse_id_sample(const event_t *event, u64 type,
-				  struct perf_sample *sample)
+static int perf_event__parse_id_sample(const union perf_event *event, u64 type,
+				       struct perf_sample *sample)
 {
 	const u64 *array = event->sample.array;
 
@@ -405,8 +405,8 @@ static int event__parse_id_sample(const event_t *event, u64 type,
 	return 0;
 }
 
-int event__parse_sample(const event_t *event, u64 type, bool sample_id_all,
-			struct perf_sample *data)
+int perf_event__parse_sample(const union perf_event *event, u64 type,
+			     bool sample_id_all, struct perf_sample *data)
 {
 	const u64 *array;
 
@@ -416,7 +416,7 @@ int event__parse_sample(const event_t *event, u64 type, bool sample_id_all,
 	if (event->header.type != PERF_RECORD_SAMPLE) {
 		if (!sample_id_all)
 			return 0;
-		return event__parse_id_sample(event, type, data);
+		return perf_event__parse_id_sample(event, type, data);
 	}
 
 	array = event->sample.array;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index f0138d4..c0de5ec 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1000,11 +1000,11 @@ perf_header__find_attr(u64 id, struct perf_header *header)
 	return NULL;
 }
 
-int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
-			   event__handler_t process,
-			   struct perf_session *session)
+int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
+				perf_event__handler_t process,
+				struct perf_session *session)
 {
-	event_t *ev;
+	union perf_event *ev;
 	size_t size;
 	int err;
 
@@ -1031,8 +1031,9 @@ int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
 	return err;
 }
 
-int event__synthesize_attrs(struct perf_header *self, event__handler_t process,
-			    struct perf_session *session)
+int perf_event__synthesize_attrs(struct perf_header *self,
+				 perf_event__handler_t process,
+				 struct perf_session *session)
 {
 	struct perf_header_attr	*attr;
 	int i, err = 0;
@@ -1040,8 +1041,8 @@ int event__synthesize_attrs(struct perf_header *self, event__handler_t process,
 	for (i = 0; i < self->attrs; i++) {
 		attr = self->attr[i];
 
-		err = event__synthesize_attr(&attr->attr, attr->ids, attr->id,
-					     process, session);
+		err = perf_event__synthesize_attr(&attr->attr, attr->ids,
+						  attr->id, process, session);
 		if (err) {
 			pr_debug("failed to create perf header attribute\n");
 			return err;
@@ -1051,21 +1052,22 @@ int event__synthesize_attrs(struct perf_header *self, event__handler_t process,
 	return err;
 }
 
-int event__process_attr(event_t *self, struct perf_session *session)
+int perf_event__process_attr(union perf_event *event,
+			     struct perf_session *session)
 {
 	struct perf_header_attr *attr;
 	unsigned int i, ids, n_ids;
 
-	attr = perf_header_attr__new(&self->attr.attr);
+	attr = perf_header_attr__new(&event->attr.attr);
 	if (attr == NULL)
 		return -ENOMEM;
 
-	ids = self->header.size;
-	ids -= (void *)&self->attr.id - (void *)self;
+	ids = event->header.size;
+	ids -= (void *)&event->attr.id - (void *)event;
 	n_ids = ids / sizeof(u64);
 
 	for (i = 0; i < n_ids; i++) {
-		if (perf_header_attr__add_id(attr, self->attr.id[i]) < 0) {
+		if (perf_header_attr__add_id(attr, event->attr.id[i]) < 0) {
 			perf_header_attr__delete(attr);
 			return -ENOMEM;
 		}
@@ -1081,11 +1083,11 @@ int event__process_attr(event_t *self, struct perf_session *session)
 	return 0;
 }
 
-int event__synthesize_event_type(u64 event_id, char *name,
-				 event__handler_t process,
-				 struct perf_session *session)
+int perf_event__synthesize_event_type(u64 event_id, char *name,
+				      perf_event__handler_t process,
+				      struct perf_session *session)
 {
-	event_t ev;
+	union perf_event ev;
 	size_t size = 0;
 	int err = 0;
 
@@ -1106,8 +1108,8 @@ int event__synthesize_event_type(u64 event_id, char *name,
 	return err;
 }
 
-int event__synthesize_event_types(event__handler_t process,
-				  struct perf_session *session)
+int perf_event__synthesize_event_types(perf_event__handler_t process,
+				       struct perf_session *session)
 {
 	struct perf_trace_event_type *type;
 	int i, err = 0;
@@ -1115,8 +1117,9 @@ int event__synthesize_event_types(event__handler_t process,
 	for (i = 0; i < event_count; i++) {
 		type = &events[i];
 
-		err = event__synthesize_event_type(type->event_id, type->name,
-						   process, session);
+		err = perf_event__synthesize_event_type(type->event_id,
+							type->name, process,
+							session);
 		if (err) {
 			pr_debug("failed to create perf header event type\n");
 			return err;
@@ -1126,21 +1129,21 @@ int event__synthesize_event_types(event__handler_t process,
 	return err;
 }
 
-int event__process_event_type(event_t *self,
-			      struct perf_session *session __unused)
+int perf_event__process_event_type(union perf_event *event,
+				   struct perf_session *session __unused)
 {
-	if (perf_header__push_event(self->event_type.event_type.event_id,
-				    self->event_type.event_type.name) < 0)
+	if (perf_header__push_event(event->event_type.event_type.event_id,
+				    event->event_type.event_type.name) < 0)
 		return -ENOMEM;
 
 	return 0;
 }
 
-int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
-				   event__handler_t process,
+int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
+					 perf_event__handler_t process,
 				   struct perf_session *session __unused)
 {
-	event_t ev;
+	union perf_event ev;
 	ssize_t size = 0, aligned_size = 0, padding;
 	int err = 0;
 
@@ -1163,10 +1166,10 @@ int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
 	return aligned_size;
 }
 
-int event__process_tracing_data(event_t *self,
-				struct perf_session *session)
+int perf_event__process_tracing_data(union perf_event *event,
+				     struct perf_session *session)
 {
-	ssize_t size_read, padding, size = self->tracing_data.size;
+	ssize_t size_read, padding, size = event->tracing_data.size;
 	off_t offset = lseek(session->fd, 0, SEEK_CUR);
 	char buf[BUFSIZ];
 
@@ -1192,12 +1195,12 @@ int event__process_tracing_data(event_t *self,
 	return size_read + padding;
 }
 
-int event__synthesize_build_id(struct dso *pos, u16 misc,
-			       event__handler_t process,
-			       struct machine *machine,
-			       struct perf_session *session)
+int perf_event__synthesize_build_id(struct dso *pos, u16 misc,
+				    perf_event__handler_t process,
+				    struct machine *machine,
+				    struct perf_session *session)
 {
-	event_t ev;
+	union perf_event ev;
 	size_t len;
 	int err = 0;
 
@@ -1220,11 +1223,11 @@ int event__synthesize_build_id(struct dso *pos, u16 misc,
 	return err;
 }
 
-int event__process_build_id(event_t *self,
-			    struct perf_session *session)
+int perf_event__process_build_id(union perf_event *event,
+				 struct perf_session *session)
 {
-	__event_process_build_id(&self->build_id,
-				 self->build_id.filename,
+	__event_process_build_id(&event->build_id,
+				 event->build_id.filename,
 				 session);
 	return 0;
 }
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 65afd7f..f042ceb 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -100,32 +100,32 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
 			  const char *name, bool is_kallsyms);
 int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir);
 
-int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
-			   event__handler_t process,
-			   struct perf_session *session);
-int event__synthesize_attrs(struct perf_header *self,
-			    event__handler_t process,
-			    struct perf_session *session);
-int event__process_attr(event_t *self, struct perf_session *session);
-
-int event__synthesize_event_type(u64 event_id, char *name,
-				 event__handler_t process,
+int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
+				perf_event__handler_t process,
+				struct perf_session *session);
+int perf_event__synthesize_attrs(struct perf_header *self,
+				 perf_event__handler_t process,
 				 struct perf_session *session);
-int event__synthesize_event_types(event__handler_t process,
-				  struct perf_session *session);
-int event__process_event_type(event_t *self,
-			      struct perf_session *session);
-
-int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
-				   event__handler_t process,
+int perf_event__process_attr(union perf_event *event, struct perf_session *session);
+
+int perf_event__synthesize_event_type(u64 event_id, char *name,
+				      perf_event__handler_t process,
+				      struct perf_session *session);
+int perf_event__synthesize_event_types(perf_event__handler_t process,
+				       struct perf_session *session);
+int perf_event__process_event_type(union perf_event *event,
 				   struct perf_session *session);
-int event__process_tracing_data(event_t *self,
-				struct perf_session *session);
-
-int event__synthesize_build_id(struct dso *pos, u16 misc,
-			       event__handler_t process,
-			       struct machine *machine,
-			       struct perf_session *session);
-int event__process_build_id(event_t *self, struct perf_session *session);
 
+int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
+					perf_event__handler_t process,
+					struct perf_session *session);
+int perf_event__process_tracing_data(union perf_event *event,
+				     struct perf_session *session);
+
+int perf_event__synthesize_build_id(struct dso *pos, u16 misc,
+				    perf_event__handler_t process,
+				    struct machine *machine,
+				    struct perf_session *session);
+int perf_event__process_build_id(union perf_event *event,
+				 struct perf_session *session);
 #endif /* __PERF_HEADER_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 02ed318..9588780 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1182,7 +1182,7 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp)
 	size_t ret = 0;
 
 	for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
-		const char *name = event__get_event_name(i);
+		const char *name = perf_event__name(i);
 
 		if (!strcmp(name, "UNKNOWN"))
 			continue;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index ee0b611..a3a871f 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -165,7 +165,7 @@ struct perf_session *perf_session__new(const char *filename, int mode,
 	} else if (mode == O_WRONLY) {
 		/*
 		 * In O_RDONLY mode this will be performed when reading the
-		 * kernel MMAP event, in event__process_mmap().
+		 * kernel MMAP event, in perf_event__process_mmap().
 		 */
 		if (perf_session__create_kernel_maps(self) < 0)
 			goto out_delete;
@@ -291,14 +291,14 @@ int perf_session__resolve_callchain(struct perf_session *self,
 	return 0;
 }
 
-static int process_event_synth_stub(event_t *event __used,
+static int process_event_synth_stub(union perf_event *event __used,
 				    struct perf_session *session __used)
 {
 	dump_printf(": unhandled!\n");
 	return 0;
 }
 
-static int process_event_stub(event_t *event __used,
+static int process_event_stub(union perf_event *event __used,
 			      struct perf_sample *sample __used,
 			      struct perf_session *session __used)
 {
@@ -306,7 +306,7 @@ static int process_event_stub(event_t *event __used,
 	return 0;
 }
 
-static int process_finished_round_stub(event_t *event __used,
+static int process_finished_round_stub(union perf_event *event __used,
 				       struct perf_session *session __used,
 				       struct perf_event_ops *ops __used)
 {
@@ -314,7 +314,7 @@ static int process_finished_round_stub(event_t *event __used,
 	return 0;
 }
 
-static int process_finished_round(event_t *event,
+static int process_finished_round(union perf_event *event,
 				  struct perf_session *session,
 				  struct perf_event_ops *ops);
 
@@ -331,7 +331,7 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 	if (handler->exit == NULL)
 		handler->exit = process_event_stub;
 	if (handler->lost == NULL)
-		handler->lost = event__process_lost;
+		handler->lost = perf_event__process_lost;
 	if (handler->read == NULL)
 		handler->read = process_event_stub;
 	if (handler->throttle == NULL)
@@ -365,98 +365,98 @@ void mem_bswap_64(void *src, int byte_size)
 	}
 }
 
-static void event__all64_swap(event_t *self)
+static void perf_event__all64_swap(union perf_event *event)
 {
-	struct perf_event_header *hdr = &self->header;
-	mem_bswap_64(hdr + 1, self->header.size - sizeof(*hdr));
+	struct perf_event_header *hdr = &event->header;
+	mem_bswap_64(hdr + 1, event->header.size - sizeof(*hdr));
 }
 
-static void event__comm_swap(event_t *self)
+static void perf_event__comm_swap(union perf_event *event)
 {
-	self->comm.pid = bswap_32(self->comm.pid);
-	self->comm.tid = bswap_32(self->comm.tid);
+	event->comm.pid = bswap_32(event->comm.pid);
+	event->comm.tid = bswap_32(event->comm.tid);
 }
 
-static void event__mmap_swap(event_t *self)
+static void perf_event__mmap_swap(union perf_event *event)
 {
-	self->mmap.pid	 = bswap_32(self->mmap.pid);
-	self->mmap.tid	 = bswap_32(self->mmap.tid);
-	self->mmap.start = bswap_64(self->mmap.start);
-	self->mmap.len	 = bswap_64(self->mmap.len);
-	self->mmap.pgoff = bswap_64(self->mmap.pgoff);
+	event->mmap.pid	  = bswap_32(event->mmap.pid);
+	event->mmap.tid	  = bswap_32(event->mmap.tid);
+	event->mmap.start = bswap_64(event->mmap.start);
+	event->mmap.len	  = bswap_64(event->mmap.len);
+	event->mmap.pgoff = bswap_64(event->mmap.pgoff);
 }
 
-static void event__task_swap(event_t *self)
+static void perf_event__task_swap(union perf_event *event)
 {
-	self->fork.pid	= bswap_32(self->fork.pid);
-	self->fork.tid	= bswap_32(self->fork.tid);
-	self->fork.ppid	= bswap_32(self->fork.ppid);
-	self->fork.ptid	= bswap_32(self->fork.ptid);
-	self->fork.time	= bswap_64(self->fork.time);
+	event->fork.pid	 = bswap_32(event->fork.pid);
+	event->fork.tid	 = bswap_32(event->fork.tid);
+	event->fork.ppid = bswap_32(event->fork.ppid);
+	event->fork.ptid = bswap_32(event->fork.ptid);
+	event->fork.time = bswap_64(event->fork.time);
 }
 
-static void event__read_swap(event_t *self)
+static void perf_event__read_swap(union perf_event *event)
 {
-	self->read.pid		= bswap_32(self->read.pid);
-	self->read.tid		= bswap_32(self->read.tid);
-	self->read.value	= bswap_64(self->read.value);
-	self->read.time_enabled	= bswap_64(self->read.time_enabled);
-	self->read.time_running	= bswap_64(self->read.time_running);
-	self->read.id		= bswap_64(self->read.id);
+	event->read.pid		 = bswap_32(event->read.pid);
+	event->read.tid		 = bswap_32(event->read.tid);
+	event->read.value	 = bswap_64(event->read.value);
+	event->read.time_enabled = bswap_64(event->read.time_enabled);
+	event->read.time_running = bswap_64(event->read.time_running);
+	event->read.id		 = bswap_64(event->read.id);
 }
 
-static void event__attr_swap(event_t *self)
+static void perf_event__attr_swap(union perf_event *event)
 {
 	size_t size;
 
-	self->attr.attr.type		= bswap_32(self->attr.attr.type);
-	self->attr.attr.size		= bswap_32(self->attr.attr.size);
-	self->attr.attr.config		= bswap_64(self->attr.attr.config);
-	self->attr.attr.sample_period	= bswap_64(self->attr.attr.sample_period);
-	self->attr.attr.sample_type	= bswap_64(self->attr.attr.sample_type);
-	self->attr.attr.read_format	= bswap_64(self->attr.attr.read_format);
-	self->attr.attr.wakeup_events	= bswap_32(self->attr.attr.wakeup_events);
-	self->attr.attr.bp_type		= bswap_32(self->attr.attr.bp_type);
-	self->attr.attr.bp_addr		= bswap_64(self->attr.attr.bp_addr);
-	self->attr.attr.bp_len		= bswap_64(self->attr.attr.bp_len);
-
-	size = self->header.size;
-	size -= (void *)&self->attr.id - (void *)self;
-	mem_bswap_64(self->attr.id, size);
+	event->attr.attr.type		= bswap_32(event->attr.attr.type);
+	event->attr.attr.size		= bswap_32(event->attr.attr.size);
+	event->attr.attr.config		= bswap_64(event->attr.attr.config);
+	event->attr.attr.sample_period	= bswap_64(event->attr.attr.sample_period);
+	event->attr.attr.sample_type	= bswap_64(event->attr.attr.sample_type);
+	event->attr.attr.read_format	= bswap_64(event->attr.attr.read_format);
+	event->attr.attr.wakeup_events	= bswap_32(event->attr.attr.wakeup_events);
+	event->attr.attr.bp_type	= bswap_32(event->attr.attr.bp_type);
+	event->attr.attr.bp_addr	= bswap_64(event->attr.attr.bp_addr);
+	event->attr.attr.bp_len		= bswap_64(event->attr.attr.bp_len);
+
+	size = event->header.size;
+	size -= (void *)&event->attr.id - (void *)event;
+	mem_bswap_64(event->attr.id, size);
 }
 
-static void event__event_type_swap(event_t *self)
+static void perf_event__event_type_swap(union perf_event *event)
 {
-	self->event_type.event_type.event_id =
-		bswap_64(self->event_type.event_type.event_id);
+	event->event_type.event_type.event_id =
+		bswap_64(event->event_type.event_type.event_id);
 }
 
-static void event__tracing_data_swap(event_t *self)
+static void perf_event__tracing_data_swap(union perf_event *event)
 {
-	self->tracing_data.size = bswap_32(self->tracing_data.size);
+	event->tracing_data.size = bswap_32(event->tracing_data.size);
 }
 
-typedef void (*event__swap_op)(event_t *self);
-
-static event__swap_op event__swap_ops[] = {
-	[PERF_RECORD_MMAP]   = event__mmap_swap,
-	[PERF_RECORD_COMM]   = event__comm_swap,
-	[PERF_RECORD_FORK]   = event__task_swap,
-	[PERF_RECORD_EXIT]   = event__task_swap,
-	[PERF_RECORD_LOST]   = event__all64_swap,
-	[PERF_RECORD_READ]   = event__read_swap,
-	[PERF_RECORD_SAMPLE] = event__all64_swap,
-	[PERF_RECORD_HEADER_ATTR]   = event__attr_swap,
-	[PERF_RECORD_HEADER_EVENT_TYPE]   = event__event_type_swap,
-	[PERF_RECORD_HEADER_TRACING_DATA]   = event__tracing_data_swap,
-	[PERF_RECORD_HEADER_BUILD_ID]   = NULL,
-	[PERF_RECORD_HEADER_MAX]    = NULL,
+typedef void (*perf_event__swap_op)(union perf_event *event);
+
+static perf_event__swap_op perf_event__swap_ops[] = {
+	[PERF_RECORD_MMAP]		  = perf_event__mmap_swap,
+	[PERF_RECORD_COMM]		  = perf_event__comm_swap,
+	[PERF_RECORD_FORK]		  = perf_event__task_swap,
+	[PERF_RECORD_EXIT]		  = perf_event__task_swap,
+	[PERF_RECORD_LOST]		  = perf_event__all64_swap,
+	[PERF_RECORD_READ]		  = perf_event__read_swap,
+	[PERF_RECORD_SAMPLE]		  = perf_event__all64_swap,
+	[PERF_RECORD_HEADER_ATTR]	  = perf_event__attr_swap,
+	[PERF_RECORD_HEADER_EVENT_TYPE]	  = perf_event__event_type_swap,
+	[PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap,
+	[PERF_RECORD_HEADER_BUILD_ID]	  = NULL,
+	[PERF_RECORD_HEADER_MAX]	  = NULL,
 };
 
 struct sample_queue {
 	u64			timestamp;
 	u64			file_offset;
-	event_t			*event;
+	union perf_event	*event;
 	struct list_head	list;
 };
 
@@ -474,7 +474,7 @@ static void perf_session_free_sample_buffers(struct perf_session *session)
 }
 
 static int perf_session_deliver_event(struct perf_session *session,
-				      event_t *event,
+				      union perf_event *event,
 				      struct perf_sample *sample,
 				      struct perf_event_ops *ops,
 				      u64 file_offset);
@@ -552,7 +552,7 @@ static void flush_sample_queue(struct perf_session *s,
  *      Flush every events below timestamp 7
  *      etc...
  */
-static int process_finished_round(event_t *event __used,
+static int process_finished_round(union perf_event *event __used,
 				  struct perf_session *session,
 				  struct perf_event_ops *ops)
 {
@@ -609,7 +609,7 @@ static void __queue_event(struct sample_queue *new, struct perf_session *s)
 
 #define MAX_SAMPLE_BUFFER	(64 * 1024 / sizeof(struct sample_queue))
 
-static int perf_session_queue_event(struct perf_session *s, event_t *event,
+static int perf_session_queue_event(struct perf_session *s, union perf_event *event,
 				    struct perf_sample *sample, u64 file_offset)
 {
 	struct ordered_samples *os = &s->ordered_samples;
@@ -662,7 +662,7 @@ static void callchain__printf(struct perf_sample *sample)
 }
 
 static void perf_session__print_tstamp(struct perf_session *session,
-				       event_t *event,
+				       union perf_event *event,
 				       struct perf_sample *sample)
 {
 	if (event->header.type != PERF_RECORD_SAMPLE &&
@@ -678,7 +678,7 @@ static void perf_session__print_tstamp(struct perf_session *session,
 		printf("%" PRIu64 " ", sample->time);
 }
 
-static void dump_event(struct perf_session *session, event_t *event,
+static void dump_event(struct perf_session *session, union perf_event *event,
 		       u64 file_offset, struct perf_sample *sample)
 {
 	if (!dump_trace)
@@ -693,10 +693,10 @@ static void dump_event(struct perf_session *session, event_t *event,
 		perf_session__print_tstamp(session, event, sample);
 
 	printf("%#" PRIx64 " [%#x]: PERF_RECORD_%s", file_offset,
-	       event->header.size, event__get_event_name(event->header.type));
+	       event->header.size, perf_event__name(event->header.type));
 }
 
-static void dump_sample(struct perf_session *session, event_t *event,
+static void dump_sample(struct perf_session *session, union perf_event *event,
 			struct perf_sample *sample)
 {
 	if (!dump_trace)
@@ -711,7 +711,7 @@ static void dump_sample(struct perf_session *session, event_t *event,
 }
 
 static int perf_session_deliver_event(struct perf_session *session,
-				      event_t *event,
+				      union perf_event *event,
 				      struct perf_sample *sample,
 				      struct perf_event_ops *ops,
 				      u64 file_offset)
@@ -745,7 +745,7 @@ static int perf_session_deliver_event(struct perf_session *session,
 }
 
 static int perf_session__preprocess_sample(struct perf_session *session,
-					   event_t *event, struct perf_sample *sample)
+					   union perf_event *event, struct perf_sample *sample)
 {
 	if (event->header.type != PERF_RECORD_SAMPLE ||
 	    !(session->sample_type & PERF_SAMPLE_CALLCHAIN))
@@ -760,7 +760,7 @@ static int perf_session__preprocess_sample(struct perf_session *session,
 	return 0;
 }
 
-static int perf_session__process_user_event(struct perf_session *session, event_t *event,
+static int perf_session__process_user_event(struct perf_session *session, union perf_event *event,
 					    struct perf_event_ops *ops, u64 file_offset)
 {
 	dump_event(session, event, file_offset, NULL);
@@ -785,15 +785,16 @@ static int perf_session__process_user_event(struct perf_session *session, event_
 }
 
 static int perf_session__process_event(struct perf_session *session,
-				       event_t *event,
+				       union perf_event *event,
 				       struct perf_event_ops *ops,
 				       u64 file_offset)
 {
 	struct perf_sample sample;
 	int ret;
 
-	if (session->header.needs_swap && event__swap_ops[event->header.type])
-		event__swap_ops[event->header.type](event);
+	if (session->header.needs_swap &&
+	    perf_event__swap_ops[event->header.type])
+		perf_event__swap_ops[event->header.type](event);
 
 	if (event->header.type >= PERF_RECORD_HEADER_MAX)
 		return -EINVAL;
@@ -845,7 +846,7 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se
 static void perf_session__warn_about_errors(const struct perf_session *session,
 					    const struct perf_event_ops *ops)
 {
-	if (ops->lost == event__process_lost &&
+	if (ops->lost == perf_event__process_lost &&
 	    session->hists.stats.total_lost != 0) {
 		ui__warning("Processed %" PRIu64 " events and LOST %" PRIu64
 			    "!\n\nCheck IO/CPU overload!\n\n",
@@ -877,7 +878,7 @@ volatile int session_done;
 static int __perf_session__process_pipe_events(struct perf_session *self,
 					       struct perf_event_ops *ops)
 {
-	event_t event;
+	union perf_event event;
 	uint32_t size;
 	int skip = 0;
 	u64 head;
@@ -958,7 +959,7 @@ int __perf_session__process_events(struct perf_session *session,
 	struct ui_progress *progress;
 	size_t	page_size, mmap_size;
 	char *buf, *mmaps[8];
-	event_t *event;
+	union perf_event *event;
 	uint32_t size;
 
 	perf_event_ops__fill_defaults(ops);
@@ -1003,7 +1004,7 @@ remap:
 	file_pos = file_offset + head;
 
 more:
-	event = (event_t *)(buf + head);
+	event = (union perf_event *)(buf + head);
 
 	if (session->header.needs_swap)
 		perf_event_header__bswap(&event->header);
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 365bf53..977b3a1 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -57,10 +57,11 @@ struct perf_session {
 
 struct perf_event_ops;
 
-typedef int (*event_op)(event_t *self, struct perf_sample *sample,
+typedef int (*event_op)(union perf_event *self, struct perf_sample *sample,
 			struct perf_session *session);
-typedef int (*event_synth_op)(event_t *self, struct perf_session *session);
-typedef int (*event_op2)(event_t *self, struct perf_session *session,
+typedef int (*event_synth_op)(union perf_event *self,
+			      struct perf_session *session);
+typedef int (*event_op2)(union perf_event *self, struct perf_session *session,
 			 struct perf_event_ops *ops);
 
 struct perf_event_ops {
@@ -157,11 +158,11 @@ size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp)
 }
 
 static inline int perf_session__parse_sample(struct perf_session *session,
-					     const event_t *event,
+					     const union perf_event *event,
 					     struct perf_sample *sample)
 {
-	return event__parse_sample(event, session->sample_type,
-				   session->sample_id_all, sample);
+	return perf_event__parse_sample(event, session->sample_type,
+					session->sample_id_all, sample);
 }
 
 #endif /* __PERF_SESSION_H */
-- 
cgit v0.10.2


From 877108e42b1b9ba64857c4030cf356ecc120fd18 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 29 Jan 2011 15:44:29 -0200
Subject: perf tools: Initial python binding

First clarifying that this kind of binding is not a replacement or an
equivalent to the 'perf script' way of using python with perf.

The 'perf script' way is to process events and look at a given script
for some python function that matches the events to pass each event for
processing.

This is a python module, i.e. everything is driven from the python
script, that merely uses "import perf" or "from perf import".

perf script is focused on tracepoints, this binding is focused on profiling as
an initial target. More work is needed to make available tracepoint specific
variables as event variables accessible via this binding.

There is one example of such usage model, in
tools/perf/python/twatch.py, a tool to watch "cycles" events together
with task (fork, exit) and comm perf events.

For now, due to me not being able to grok how python distutils cope with
building C extensions outside the sources dir the install target just
builds it, I'm using it as:

[root@emilia linux]# export PYTHONPATH=~acme/git/build/perf/lib.linux-x86_64-2.6/
[root@emilia linux]# tools/perf/python/twatch.py
cpu:  4, pid: 30126, tid: 30126 { type: mmap, pid: 30126, tid: 30126, start: 0x4, length: 0x82e9ca03, offset: 0, filename:  }
cpu:  6, pid:   47, tid:   47 { type: mmap, pid: 47, tid: 47, start: 0x6, length: 0xbef87c36, offset: 0, filename:  }
cpu:  1, pid:    0, tid:    0 { type: mmap, pid: 0, tid: 0, start: 0x1, length: 0x775d1904, offset: 0, filename:  }
cpu:  7, pid:    0, tid:    0 { type: mmap, pid: 0, tid: 0, start: 0x7, length: 0xc750aeb6, offset: 0, filename:  }
cpu:  5, pid: 2255, tid: 2255 { type: mmap, pid: 2255, tid: 2255, start: 0x5, length: 0x76669635, offset: 0, filename:  }
cpu:  0, pid:    0, tid:    0 { type: mmap, pid: 0, tid: 0, start: 0, length: 0x6422ef6b, offset: 0, filename:  }
cpu:  2, pid: 2255, tid: 2255 { type: mmap, pid: 2255, tid: 2255, start: 0x2, length: 0xe078757a, offset: 0, filename:  }
cpu:  1, pid: 5769, tid: 5769 { type: fork, pid: 30127, ppid: 5769, tid: 30127, ptid: 5769, time: 103893991270534}
cpu:  6, pid: 30127, tid: 30127 { type: comm, pid: 30127, tid: 30127, comm: ls }
cpu:  6, pid: 30127, tid: 30127 { type: exit, pid: 30127, ppid: 30127, tid: 30127, ptid: 30127, time: 103893993273024}

The first 8 mmap events in this 8 way machine are a mistery that is still being
investigated.

More of the tools/perf/util/ APIs will be exposed via this python binding as
the need arises. For now the focus is on creating events and processing them,
symbol resolution is an obvious next step, with tracepoint variables as a close
second step.

Cc: Clark Williams <williams@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index eedcf95..36ff73c 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -315,6 +315,7 @@ COMPAT_CFLAGS =
 COMPAT_OBJS =
 LIB_H =
 LIB_OBJS =
+PYRF_OBJS =
 SCRIPT_PERL =
 SCRIPT_SH =
 TEST_PROGRAMS =
@@ -324,6 +325,9 @@ SCRIPT_SH += perf-archive.sh
 grep-libs = $(filter -l%,$(1))
 strip-libs = $(filter-out -l%,$(1))
 
+pyrf: $(PYRF_OBJS)
+	python util/setup.py build --build-base='$(OUTPUT)'
+
 #
 # No Perl scripts right now:
 #
@@ -349,7 +353,7 @@ PROGRAMS += $(OUTPUT)perf
 #
 
 # what 'all' will build and 'install' will install, in perfexecdir
-ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) pyrf
 
 # what 'all' will build but not install in perfexecdir
 OTHER_PROGRAMS = $(OUTPUT)perf$X
@@ -520,6 +524,20 @@ BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
 
 PERFLIBS = $(LIB_FILE)
 
+# Files needed for the python binding, perf.so
+# pyrf is just an internal name needed for all those wrappers.
+# This has to be in sync with what is in the 'sources' variable in
+# tools/perf/util/setup.py
+
+PYRF_OBJS += $(OUTPUT)util/cpumap.o
+PYRF_OBJS += $(OUTPUT)util/ctype.o
+PYRF_OBJS += $(OUTPUT)util/evlist.o
+PYRF_OBJS += $(OUTPUT)util/evsel.o
+PYRF_OBJS += $(OUTPUT)util/python.o
+PYRF_OBJS += $(OUTPUT)util/thread_map.o
+PYRF_OBJS += $(OUTPUT)util/util.o
+PYRF_OBJS += $(OUTPUT)util/xyarray.o
+
 #
 # Platform specific tweaks
 #
diff --git a/tools/perf/python/twatch.py b/tools/perf/python/twatch.py
new file mode 100755
index 0000000..5e9f3b7
--- /dev/null
+++ b/tools/perf/python/twatch.py
@@ -0,0 +1,41 @@
+#! /usr/bin/python
+# -*- python -*-
+# -*- coding: utf-8 -*-
+#   twatch - Experimental use of the perf python interface
+#   Copyright (C) 2011 Arnaldo Carvalho de Melo <acme@redhat.com>
+#
+#   This application is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU General Public License
+#   as published by the Free Software Foundation; version 2.
+#
+#   This application is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#   General Public License for more details.
+
+import perf
+
+def main():
+	cpus = perf.cpu_map()
+	threads = perf.thread_map()
+	evsel = perf.evsel(task = 1, comm = 1, mmap = 0,
+			   wakeup_events = 1, sample_period = 1,
+			   sample_id_all = 1,
+			   sample_type = perf.SAMPLE_PERIOD | perf.SAMPLE_TID | perf.SAMPLE_CPU | perf.SAMPLE_TID)
+	evsel.open(cpus = cpus, threads = threads);
+	evlist = perf.evlist()
+	evlist.add(evsel)
+	evlist.mmap(cpus = cpus, threads = threads)
+	while True:
+		evlist.poll(timeout = -1)
+		for cpu in cpus:
+			event = evlist.read_on_cpu(cpu)
+			if not event:
+				continue
+			print "cpu: %2d, pid: %4d, tid: %4d" % (event.sample_cpu,
+								event.sample_pid,
+								event.sample_tid),
+			print event
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
new file mode 100644
index 0000000..88d4789
--- /dev/null
+++ b/tools/perf/util/python.c
@@ -0,0 +1,888 @@
+#include <Python.h>
+#include <structmember.h>
+#include <inttypes.h>
+#include <poll.h>
+#include "evlist.h"
+#include "evsel.h"
+#include "event.h"
+#include "cpumap.h"
+#include "thread_map.h"
+
+struct throttle_event {
+	struct perf_event_header header;
+	u64			 time;
+	u64			 id;
+	u64			 stream_id;
+};
+
+#define member_def(type, member, ptype, help) \
+	{ #member, ptype, \
+	  offsetof(struct pyrf_event, event) + offsetof(struct type, member), \
+	  0, help }
+
+#define sample_member_def(name, member, ptype, help) \
+	{ #name, ptype, \
+	  offsetof(struct pyrf_event, sample) + offsetof(struct perf_sample, member), \
+	  0, help }
+
+struct pyrf_event {
+	PyObject_HEAD
+	struct perf_sample sample;
+	union perf_event   event;
+};
+
+#define T_ULONG_LONG T_ULONG
+
+#define sample_members \
+	sample_member_def(sample_ip, ip, T_ULONG_LONG, "event type"),			 \
+	sample_member_def(sample_pid, pid, T_INT, "event pid"),			 \
+	sample_member_def(sample_tid, tid, T_INT, "event tid"),			 \
+	sample_member_def(sample_time, time, T_ULONG_LONG, "event timestamp"),		 \
+	sample_member_def(sample_addr, addr, T_ULONG_LONG, "event addr"),		 \
+	sample_member_def(sample_id, id, T_ULONG_LONG, "event id"),			 \
+	sample_member_def(sample_stream_id, stream_id, T_ULONG_LONG, "event stream id"), \
+	sample_member_def(sample_period, period, T_ULONG_LONG, "event period"),		 \
+	sample_member_def(sample_cpu, cpu, T_UINT, "event cpu"),
+
+static char pyrf_mmap_event__doc[] = PyDoc_STR("perf mmap event object.");
+
+static PyMemberDef pyrf_mmap_event__members[] = {
+	sample_members
+	member_def(perf_event_header, type, T_UINT, "event type"),
+	member_def(mmap_event, pid, T_UINT, "event pid"),
+	member_def(mmap_event, tid, T_UINT, "event tid"),
+	member_def(mmap_event, start, T_ULONG_LONG, "start of the map"),
+	member_def(mmap_event, len, T_ULONG_LONG, "map length"),
+	member_def(mmap_event, pgoff, T_ULONG_LONG, "page offset"),
+	member_def(mmap_event, filename, T_STRING_INPLACE, "backing store"),
+	{ NULL, },
+};
+
+static PyObject *pyrf_mmap_event__repr(struct pyrf_event *pevent)
+{
+	PyObject *ret;
+	char *s;
+
+	if (asprintf(&s, "{ type: mmap, pid: %u, tid: %u, start: %#" PRIx64 ", "
+			 "length: %#" PRIx64 ", offset: %#" PRIx64 ", "
+			 "filename: %s }",
+		     pevent->event.mmap.pid, pevent->event.mmap.tid,
+		     pevent->event.mmap.start, pevent->event.mmap.len,
+		     pevent->event.mmap.pgoff, pevent->event.mmap.filename) < 0) {
+		ret = PyErr_NoMemory();
+	} else {
+		ret = PyString_FromString(s);
+		free(s);
+	}
+	return ret;
+}
+
+static PyTypeObject pyrf_mmap_event__type = {
+	PyVarObject_HEAD_INIT(NULL, 0)
+	.tp_name	= "perf.mmap_event",
+	.tp_basicsize	= sizeof(struct pyrf_event),
+	.tp_flags	= Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
+	.tp_doc		= pyrf_mmap_event__doc,
+	.tp_members	= pyrf_mmap_event__members,
+	.tp_repr	= (reprfunc)pyrf_mmap_event__repr,
+};
+
+static char pyrf_task_event__doc[] = PyDoc_STR("perf task (fork/exit) event object.");
+
+static PyMemberDef pyrf_task_event__members[] = {
+	sample_members
+	member_def(perf_event_header, type, T_UINT, "event type"),
+	member_def(fork_event, pid, T_UINT, "event pid"),
+	member_def(fork_event, ppid, T_UINT, "event ppid"),
+	member_def(fork_event, tid, T_UINT, "event tid"),
+	member_def(fork_event, ptid, T_UINT, "event ptid"),
+	member_def(fork_event, time, T_ULONG_LONG, "timestamp"),
+	{ NULL, },
+};
+
+static PyObject *pyrf_task_event__repr(struct pyrf_event *pevent)
+{
+	return PyString_FromFormat("{ type: %s, pid: %u, ppid: %u, tid: %u, "
+				   "ptid: %u, time: %" PRIu64 "}",
+				   pevent->event.header.type == PERF_RECORD_FORK ? "fork" : "exit",
+				   pevent->event.fork.pid,
+				   pevent->event.fork.ppid,
+				   pevent->event.fork.tid,
+				   pevent->event.fork.ptid,
+				   pevent->event.fork.time);
+}
+
+static PyTypeObject pyrf_task_event__type = {
+	PyVarObject_HEAD_INIT(NULL, 0)
+	.tp_name	= "perf.task_event",
+	.tp_basicsize	= sizeof(struct pyrf_event),
+	.tp_flags	= Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
+	.tp_doc		= pyrf_task_event__doc,
+	.tp_members	= pyrf_task_event__members,
+	.tp_repr	= (reprfunc)pyrf_task_event__repr,
+};
+
+static char pyrf_comm_event__doc[] = PyDoc_STR("perf comm event object.");
+
+static PyMemberDef pyrf_comm_event__members[] = {
+	sample_members
+	member_def(perf_event_header, type, T_UINT, "event type"),
+	member_def(comm_event, pid, T_UINT, "event pid"),
+	member_def(comm_event, tid, T_UINT, "event tid"),
+	member_def(comm_event, comm, T_STRING_INPLACE, "process name"),
+	{ NULL, },
+};
+
+static PyObject *pyrf_comm_event__repr(struct pyrf_event *pevent)
+{
+	return PyString_FromFormat("{ type: comm, pid: %u, tid: %u, comm: %s }",
+				   pevent->event.comm.pid,
+				   pevent->event.comm.tid,
+				   pevent->event.comm.comm);
+}
+
+static PyTypeObject pyrf_comm_event__type = {
+	PyVarObject_HEAD_INIT(NULL, 0)
+	.tp_name	= "perf.comm_event",
+	.tp_basicsize	= sizeof(struct pyrf_event),
+	.tp_flags	= Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
+	.tp_doc		= pyrf_comm_event__doc,
+	.tp_members	= pyrf_comm_event__members,
+	.tp_repr	= (reprfunc)pyrf_comm_event__repr,
+};
+
+static char pyrf_throttle_event__doc[] = PyDoc_STR("perf throttle event object.");
+
+static PyMemberDef pyrf_throttle_event__members[] = {
+	sample_members
+	member_def(perf_event_header, type, T_UINT, "event type"),
+	member_def(throttle_event, time, T_ULONG_LONG, "timestamp"),
+	member_def(throttle_event, id, T_ULONG_LONG, "event id"),
+	member_def(throttle_event, stream_id, T_ULONG_LONG, "event stream id"),
+	{ NULL, },
+};
+
+static PyObject *pyrf_throttle_event__repr(struct pyrf_event *pevent)
+{
+	struct throttle_event *te = (struct throttle_event *)(&pevent->event.header + 1);
+
+	return PyString_FromFormat("{ type: %sthrottle, time: %" PRIu64 ", id: %" PRIu64
+				   ", stream_id: %" PRIu64 " }",
+				   pevent->event.header.type == PERF_RECORD_THROTTLE ? "" : "un",
+				   te->time, te->id, te->stream_id);
+}
+
+static PyTypeObject pyrf_throttle_event__type = {
+	PyVarObject_HEAD_INIT(NULL, 0)
+	.tp_name	= "perf.throttle_event",
+	.tp_basicsize	= sizeof(struct pyrf_event),
+	.tp_flags	= Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
+	.tp_doc		= pyrf_throttle_event__doc,
+	.tp_members	= pyrf_throttle_event__members,
+	.tp_repr	= (reprfunc)pyrf_throttle_event__repr,
+};
+
+static int pyrf_event__setup_types(void)
+{
+	int err;
+	pyrf_mmap_event__type.tp_new =
+	pyrf_task_event__type.tp_new =
+	pyrf_comm_event__type.tp_new =
+	pyrf_throttle_event__type.tp_new = PyType_GenericNew;
+	err = PyType_Ready(&pyrf_mmap_event__type);
+	if (err < 0)
+		goto out;
+	err = PyType_Ready(&pyrf_task_event__type);
+	if (err < 0)
+		goto out;
+	err = PyType_Ready(&pyrf_comm_event__type);
+	if (err < 0)
+		goto out;
+	err = PyType_Ready(&pyrf_throttle_event__type);
+	if (err < 0)
+		goto out;
+out:
+	return err;
+}
+
+static PyTypeObject *pyrf_event__type[] = {
+	[PERF_RECORD_MMAP]	 = &pyrf_mmap_event__type,
+	[PERF_RECORD_LOST]	 = &pyrf_mmap_event__type,
+	[PERF_RECORD_COMM]	 = &pyrf_comm_event__type,
+	[PERF_RECORD_EXIT]	 = &pyrf_task_event__type,
+	[PERF_RECORD_THROTTLE]	 = &pyrf_throttle_event__type,
+	[PERF_RECORD_UNTHROTTLE] = &pyrf_throttle_event__type,
+	[PERF_RECORD_FORK]	 = &pyrf_task_event__type,
+	[PERF_RECORD_READ]	 = &pyrf_mmap_event__type,
+	[PERF_RECORD_SAMPLE]	 = &pyrf_mmap_event__type,
+};
+
+static PyObject *pyrf_event__new(union perf_event *event)
+{
+	struct pyrf_event *pevent;
+	PyTypeObject *ptype;
+
+	if (event->header.type < PERF_RECORD_MMAP ||
+	    event->header.type > PERF_RECORD_SAMPLE)
+		return NULL;
+
+	ptype = pyrf_event__type[event->header.type];
+	pevent = PyObject_New(struct pyrf_event, ptype);
+	if (pevent != NULL)
+		memcpy(&pevent->event, event, event->header.size);
+	return (PyObject *)pevent;
+}
+
+struct pyrf_cpu_map {
+	PyObject_HEAD
+
+	struct cpu_map *cpus;
+};
+
+static int pyrf_cpu_map__init(struct pyrf_cpu_map *pcpus,
+			      PyObject *args, PyObject *kwargs)
+{
+	static char *kwlist[] = { "cpustr", NULL, NULL, };
+	char *cpustr = NULL;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|s",
+					 kwlist, &cpustr))
+		return -1;
+
+	pcpus->cpus = cpu_map__new(cpustr);
+	if (pcpus->cpus == NULL)
+		return -1;
+	return 0;
+}
+
+static void pyrf_cpu_map__delete(struct pyrf_cpu_map *pcpus)
+{
+	cpu_map__delete(pcpus->cpus);
+	pcpus->ob_type->tp_free((PyObject*)pcpus);
+}
+
+static Py_ssize_t pyrf_cpu_map__length(PyObject *obj)
+{
+	struct pyrf_cpu_map *pcpus = (void *)obj;
+
+	return pcpus->cpus->nr;
+}
+
+static PyObject *pyrf_cpu_map__item(PyObject *obj, Py_ssize_t i)
+{
+	struct pyrf_cpu_map *pcpus = (void *)obj;
+
+	if (i >= pcpus->cpus->nr)
+		return NULL;
+
+	return Py_BuildValue("i", pcpus->cpus->map[i]);
+}
+
+static PySequenceMethods pyrf_cpu_map__sequence_methods = {
+	.sq_length = pyrf_cpu_map__length,
+	.sq_item   = pyrf_cpu_map__item,
+};
+
+static char pyrf_cpu_map__doc[] = PyDoc_STR("cpu map object.");
+
+static PyTypeObject pyrf_cpu_map__type = {
+	PyVarObject_HEAD_INIT(NULL, 0)
+	.tp_name	= "perf.cpu_map",
+	.tp_basicsize	= sizeof(struct pyrf_cpu_map),
+	.tp_dealloc	= (destructor)pyrf_cpu_map__delete,
+	.tp_flags	= Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
+	.tp_doc		= pyrf_cpu_map__doc,
+	.tp_as_sequence	= &pyrf_cpu_map__sequence_methods,
+	.tp_init	= (initproc)pyrf_cpu_map__init,
+};
+
+static int pyrf_cpu_map__setup_types(void)
+{
+	pyrf_cpu_map__type.tp_new = PyType_GenericNew;
+	return PyType_Ready(&pyrf_cpu_map__type);
+}
+
+struct pyrf_thread_map {
+	PyObject_HEAD
+
+	struct thread_map *threads;
+};
+
+static int pyrf_thread_map__init(struct pyrf_thread_map *pthreads,
+				 PyObject *args, PyObject *kwargs)
+{
+	static char *kwlist[] = { "pid", "tid", NULL, NULL, };
+	int pid = -1, tid = -1;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii",
+					 kwlist, &pid, &tid))
+		return -1;
+
+	pthreads->threads = thread_map__new(pid, tid);
+	if (pthreads->threads == NULL)
+		return -1;
+	return 0;
+}
+
+static void pyrf_thread_map__delete(struct pyrf_thread_map *pthreads)
+{
+	thread_map__delete(pthreads->threads);
+	pthreads->ob_type->tp_free((PyObject*)pthreads);
+}
+
+static Py_ssize_t pyrf_thread_map__length(PyObject *obj)
+{
+	struct pyrf_thread_map *pthreads = (void *)obj;
+
+	return pthreads->threads->nr;
+}
+
+static PyObject *pyrf_thread_map__item(PyObject *obj, Py_ssize_t i)
+{
+	struct pyrf_thread_map *pthreads = (void *)obj;
+
+	if (i >= pthreads->threads->nr)
+		return NULL;
+
+	return Py_BuildValue("i", pthreads->threads->map[i]);
+}
+
+static PySequenceMethods pyrf_thread_map__sequence_methods = {
+	.sq_length = pyrf_thread_map__length,
+	.sq_item   = pyrf_thread_map__item,
+};
+
+static char pyrf_thread_map__doc[] = PyDoc_STR("thread map object.");
+
+static PyTypeObject pyrf_thread_map__type = {
+	PyVarObject_HEAD_INIT(NULL, 0)
+	.tp_name	= "perf.thread_map",
+	.tp_basicsize	= sizeof(struct pyrf_thread_map),
+	.tp_dealloc	= (destructor)pyrf_thread_map__delete,
+	.tp_flags	= Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
+	.tp_doc		= pyrf_thread_map__doc,
+	.tp_as_sequence	= &pyrf_thread_map__sequence_methods,
+	.tp_init	= (initproc)pyrf_thread_map__init,
+};
+
+static int pyrf_thread_map__setup_types(void)
+{
+	pyrf_thread_map__type.tp_new = PyType_GenericNew;
+	return PyType_Ready(&pyrf_thread_map__type);
+}
+
+struct pyrf_evsel {
+	PyObject_HEAD
+
+	struct perf_evsel evsel;
+};
+
+static int pyrf_evsel__init(struct pyrf_evsel *pevsel,
+			    PyObject *args, PyObject *kwargs)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type = PERF_SAMPLE_PERIOD | PERF_SAMPLE_TID,
+	};
+	static char *kwlist[] = {
+		"type",
+		"config",
+		"sample_freq",
+		"sample_period",
+		"sample_type",
+		"read_format",
+		"disabled",
+		"inherit",
+		"pinned",
+		"exclusive",
+		"exclude_user",
+		"exclude_kernel",
+		"exclude_hv",
+		"exclude_idle",
+		"mmap",
+		"comm",
+		"freq",
+		"inherit_stat",
+		"enable_on_exec",
+		"task",
+		"watermark",
+		"precise_ip",
+		"mmap_data",
+		"sample_id_all",
+		"wakeup_events",
+		"bp_type",
+		"bp_addr",
+		"bp_len", NULL, NULL, };
+	u64 sample_period = 0;
+	u32 disabled = 0,
+	    inherit = 0,
+	    pinned = 0,
+	    exclusive = 0,
+	    exclude_user = 0,
+	    exclude_kernel = 0,
+	    exclude_hv = 0,
+	    exclude_idle = 0,
+	    mmap = 0,
+	    comm = 0,
+	    freq = 1,
+	    inherit_stat = 0,
+	    enable_on_exec = 0,
+	    task = 0,
+	    watermark = 0,
+	    precise_ip = 0,
+	    mmap_data = 0,
+	    sample_id_all = 1;
+	int idx = 0;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs,
+					 "|iKiKKiiiiiiiiiiiiiiiiiiiiiKK", kwlist,
+					 &attr.type, &attr.config, &attr.sample_freq,
+					 &sample_period, &attr.sample_type,
+					 &attr.read_format, &disabled, &inherit,
+					 &pinned, &exclusive, &exclude_user,
+					 &exclude_kernel, &exclude_hv, &exclude_idle,
+					 &mmap, &comm, &freq, &inherit_stat,
+					 &enable_on_exec, &task, &watermark,
+					 &precise_ip, &mmap_data, &sample_id_all,
+					 &attr.wakeup_events, &attr.bp_type,
+					 &attr.bp_addr, &attr.bp_len, &idx))
+		return -1;
+
+	/* union... */
+	if (sample_period != 0) {
+		if (attr.sample_freq != 0)
+			return -1; /* FIXME: throw right exception */
+		attr.sample_period = sample_period;
+	}
+
+	/* Bitfields */
+	attr.disabled	    = disabled;
+	attr.inherit	    = inherit;
+	attr.pinned	    = pinned;
+	attr.exclusive	    = exclusive;
+	attr.exclude_user   = exclude_user;
+	attr.exclude_kernel = exclude_kernel;
+	attr.exclude_hv	    = exclude_hv;
+	attr.exclude_idle   = exclude_idle;
+	attr.mmap	    = mmap;
+	attr.comm	    = comm;
+	attr.freq	    = freq;
+	attr.inherit_stat   = inherit_stat;
+	attr.enable_on_exec = enable_on_exec;
+	attr.task	    = task;
+	attr.watermark	    = watermark;
+	attr.precise_ip	    = precise_ip;
+	attr.mmap_data	    = mmap_data;
+	attr.sample_id_all  = sample_id_all;
+
+	perf_evsel__init(&pevsel->evsel, &attr, idx);
+	return 0;
+}
+
+static void pyrf_evsel__delete(struct pyrf_evsel *pevsel)
+{
+	perf_evsel__exit(&pevsel->evsel);
+	pevsel->ob_type->tp_free((PyObject*)pevsel);
+}
+
+static PyObject *pyrf_evsel__open(struct pyrf_evsel *pevsel,
+				  PyObject *args, PyObject *kwargs)
+{
+	struct perf_evsel *evsel = &pevsel->evsel;
+	struct cpu_map *cpus = NULL;
+	struct thread_map *threads = NULL;
+	PyObject *pcpus = NULL, *pthreads = NULL;
+	int group = 0, overwrite = 0;
+	static char *kwlist[] = {"cpus", "threads", "group", "overwrite", NULL, NULL};
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OOii", kwlist,
+					 &pcpus, &pthreads, &group, &overwrite))
+		return NULL;
+
+	if (pthreads != NULL)
+		threads = ((struct pyrf_thread_map *)pthreads)->threads;
+
+	if (pcpus != NULL)
+		cpus = ((struct pyrf_cpu_map *)pcpus)->cpus;
+
+	if (perf_evsel__open(evsel, cpus, threads, group, overwrite) < 0) {
+		PyErr_SetFromErrno(PyExc_OSError);
+		return NULL;
+	}
+
+	Py_INCREF(Py_None);
+	return Py_None;
+}
+
+static PyMethodDef pyrf_evsel__methods[] = {
+	{
+		.ml_name  = "open",
+		.ml_meth  = (PyCFunction)pyrf_evsel__open,
+		.ml_flags = METH_VARARGS | METH_KEYWORDS,
+		.ml_doc	  = PyDoc_STR("open the event selector file descriptor table.")
+	},
+	{ NULL, }
+};
+
+static char pyrf_evsel__doc[] = PyDoc_STR("perf event selector list object.");
+
+static PyTypeObject pyrf_evsel__type = {
+	PyVarObject_HEAD_INIT(NULL, 0)
+	.tp_name	= "perf.evsel",
+	.tp_basicsize	= sizeof(struct pyrf_evsel),
+	.tp_dealloc	= (destructor)pyrf_evsel__delete,
+	.tp_flags	= Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
+	.tp_doc		= pyrf_evsel__doc,
+	.tp_methods	= pyrf_evsel__methods,
+	.tp_init	= (initproc)pyrf_evsel__init,
+};
+
+static int pyrf_evsel__setup_types(void)
+{
+	pyrf_evsel__type.tp_new = PyType_GenericNew;
+	return PyType_Ready(&pyrf_evsel__type);
+}
+
+struct pyrf_evlist {
+	PyObject_HEAD
+
+	struct perf_evlist evlist;
+};
+
+static int pyrf_evlist__init(struct pyrf_evlist *pevlist,
+			     PyObject *args, PyObject *kwargs)
+{
+	perf_evlist__init(&pevlist->evlist);
+	return 0;
+}
+
+static void pyrf_evlist__delete(struct pyrf_evlist *pevlist)
+{
+	perf_evlist__exit(&pevlist->evlist);
+	pevlist->ob_type->tp_free((PyObject*)pevlist);
+}
+
+static PyObject *pyrf_evlist__mmap(struct pyrf_evlist *pevlist,
+				   PyObject *args, PyObject *kwargs)
+{
+	struct perf_evlist *evlist = &pevlist->evlist;
+	PyObject *pcpus = NULL, *pthreads = NULL;
+	struct cpu_map *cpus = NULL;
+	struct thread_map *threads = NULL;
+	static char *kwlist[] = {"cpus", "threads", "pages", "overwrite",
+				  NULL, NULL};
+	int pages = 128, overwrite = false;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|ii", kwlist,
+					 &pcpus, &pthreads, &pages, &overwrite))
+		return NULL;
+
+	threads = ((struct pyrf_thread_map *)pthreads)->threads;
+	cpus = ((struct pyrf_cpu_map *)pcpus)->cpus;
+
+	if (perf_evlist__mmap(evlist, cpus, threads, pages, overwrite) < 0) {
+		PyErr_SetFromErrno(PyExc_OSError);
+		return NULL;
+	}
+
+	Py_INCREF(Py_None);
+	return Py_None;
+}
+
+static PyObject *pyrf_evlist__poll(struct pyrf_evlist *pevlist,
+				   PyObject *args, PyObject *kwargs)
+{
+	struct perf_evlist *evlist = &pevlist->evlist;
+	static char *kwlist[] = {"timeout", NULL, NULL};
+	int timeout = -1, n;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i", kwlist, &timeout))
+		return NULL;
+
+	n = poll(evlist->pollfd, evlist->nr_fds, timeout);
+	if (n < 0) {
+		PyErr_SetFromErrno(PyExc_OSError);
+		return NULL;
+	}
+
+	return Py_BuildValue("i", n);
+}
+
+static PyObject *pyrf_evlist__get_pollfd(struct pyrf_evlist *pevlist,
+					 PyObject *args, PyObject *kwargs)
+{
+	struct perf_evlist *evlist = &pevlist->evlist;
+        PyObject *list = PyList_New(0);
+	int i;
+
+	for (i = 0; i < evlist->nr_fds; ++i) {
+		PyObject *file;
+		FILE *fp = fdopen(evlist->pollfd[i].fd, "r");
+
+		if (fp == NULL)
+			goto free_list;
+
+		file = PyFile_FromFile(fp, "perf", "r", NULL);
+		if (file == NULL)
+			goto free_list;
+
+		if (PyList_Append(list, file) != 0) {
+			Py_DECREF(file);
+			goto free_list;
+		}
+			
+		Py_DECREF(file);
+	}
+
+	return list;
+free_list:
+	return PyErr_NoMemory();
+}
+
+
+static PyObject *pyrf_evlist__add(struct pyrf_evlist *pevlist,
+				  PyObject *args, PyObject *kwargs)
+{
+	struct perf_evlist *evlist = &pevlist->evlist;
+	PyObject *pevsel;
+	struct perf_evsel *evsel;
+
+	if (!PyArg_ParseTuple(args, "O", &pevsel))
+		return NULL;
+
+	Py_INCREF(pevsel);
+	evsel = &((struct pyrf_evsel *)pevsel)->evsel;
+	evsel->idx = evlist->nr_entries;
+	perf_evlist__add(evlist, evsel);
+
+	return Py_BuildValue("i", evlist->nr_entries);
+}
+
+static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist,
+					  PyObject *args, PyObject *kwargs)
+{
+	struct perf_evlist *evlist = &pevlist->evlist;
+	union perf_event *event;
+	int sample_id_all = 1, cpu;
+	static char *kwlist[] = {"sample_id_all", NULL, NULL};
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", kwlist,
+					 &cpu, &sample_id_all))
+		return NULL;
+
+	event = perf_evlist__read_on_cpu(evlist, cpu);
+	if (event != NULL) {
+		struct perf_evsel *first;
+		PyObject *pyevent = pyrf_event__new(event);
+		struct pyrf_event *pevent = (struct pyrf_event *)pyevent;
+
+		if (pyevent == NULL)
+			return PyErr_NoMemory();
+
+		first = list_entry(evlist->entries.next, struct perf_evsel, node);
+		perf_event__parse_sample(event, first->attr.sample_type, sample_id_all,
+					 &pevent->sample);
+		return pyevent;
+	}
+
+	Py_INCREF(Py_None);
+	return Py_None;
+}
+
+static PyMethodDef pyrf_evlist__methods[] = {
+	{
+		.ml_name  = "mmap",
+		.ml_meth  = (PyCFunction)pyrf_evlist__mmap,
+		.ml_flags = METH_VARARGS | METH_KEYWORDS,
+		.ml_doc	  = PyDoc_STR("mmap the file descriptor table.")
+	},
+	{
+		.ml_name  = "poll",
+		.ml_meth  = (PyCFunction)pyrf_evlist__poll,
+		.ml_flags = METH_VARARGS | METH_KEYWORDS,
+		.ml_doc	  = PyDoc_STR("poll the file descriptor table.")
+	},
+	{
+		.ml_name  = "get_pollfd",
+		.ml_meth  = (PyCFunction)pyrf_evlist__get_pollfd,
+		.ml_flags = METH_VARARGS | METH_KEYWORDS,
+		.ml_doc	  = PyDoc_STR("get the poll file descriptor table.")
+	},
+	{
+		.ml_name  = "add",
+		.ml_meth  = (PyCFunction)pyrf_evlist__add,
+		.ml_flags = METH_VARARGS | METH_KEYWORDS,
+		.ml_doc	  = PyDoc_STR("adds an event selector to the list.")
+	},
+	{
+		.ml_name  = "read_on_cpu",
+		.ml_meth  = (PyCFunction)pyrf_evlist__read_on_cpu,
+		.ml_flags = METH_VARARGS | METH_KEYWORDS,
+		.ml_doc	  = PyDoc_STR("reads an event.")
+	},
+	{ NULL, }
+};
+
+static Py_ssize_t pyrf_evlist__length(PyObject *obj)
+{
+	struct pyrf_evlist *pevlist = (void *)obj;
+
+	return pevlist->evlist.nr_entries;
+}
+
+static PyObject *pyrf_evlist__item(PyObject *obj, Py_ssize_t i)
+{
+	struct pyrf_evlist *pevlist = (void *)obj;
+	struct perf_evsel *pos;
+
+	if (i >= pevlist->evlist.nr_entries)
+		return NULL;
+
+	list_for_each_entry(pos, &pevlist->evlist.entries, node)
+		if (i-- == 0)
+			break;
+
+	return Py_BuildValue("O", container_of(pos, struct pyrf_evsel, evsel));
+}
+
+static PySequenceMethods pyrf_evlist__sequence_methods = {
+	.sq_length = pyrf_evlist__length,
+	.sq_item   = pyrf_evlist__item,
+};
+
+static char pyrf_evlist__doc[] = PyDoc_STR("perf event selector list object.");
+
+static PyTypeObject pyrf_evlist__type = {
+	PyVarObject_HEAD_INIT(NULL, 0)
+	.tp_name	= "perf.evlist",
+	.tp_basicsize	= sizeof(struct pyrf_evlist),
+	.tp_dealloc	= (destructor)pyrf_evlist__delete,
+	.tp_flags	= Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE,
+	.tp_as_sequence	= &pyrf_evlist__sequence_methods,
+	.tp_doc		= pyrf_evlist__doc,
+	.tp_methods	= pyrf_evlist__methods,
+	.tp_init	= (initproc)pyrf_evlist__init,
+};
+
+static int pyrf_evlist__setup_types(void)
+{
+	pyrf_evlist__type.tp_new = PyType_GenericNew;
+	return PyType_Ready(&pyrf_evlist__type);
+}
+
+static struct {
+	const char *name;
+	int	    value;
+} perf__constants[] = {
+	{ "TYPE_HARDWARE",   PERF_TYPE_HARDWARE },
+	{ "TYPE_SOFTWARE",   PERF_TYPE_SOFTWARE },
+	{ "TYPE_TRACEPOINT", PERF_TYPE_TRACEPOINT },
+	{ "TYPE_HW_CACHE",   PERF_TYPE_HW_CACHE },
+	{ "TYPE_RAW",	     PERF_TYPE_RAW },
+	{ "TYPE_BREAKPOINT", PERF_TYPE_BREAKPOINT },
+
+	{ "COUNT_HW_CPU_CYCLES",	  PERF_COUNT_HW_CPU_CYCLES },
+	{ "COUNT_HW_INSTRUCTIONS",	  PERF_COUNT_HW_INSTRUCTIONS },
+	{ "COUNT_HW_CACHE_REFERENCES",	  PERF_COUNT_HW_CACHE_REFERENCES },
+	{ "COUNT_HW_CACHE_MISSES",	  PERF_COUNT_HW_CACHE_MISSES },
+	{ "COUNT_HW_BRANCH_INSTRUCTIONS", PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+	{ "COUNT_HW_BRANCH_MISSES",	  PERF_COUNT_HW_BRANCH_MISSES },
+	{ "COUNT_HW_BUS_CYCLES",	  PERF_COUNT_HW_BUS_CYCLES },
+	{ "COUNT_HW_CACHE_L1D",		  PERF_COUNT_HW_CACHE_L1D },
+	{ "COUNT_HW_CACHE_L1I",		  PERF_COUNT_HW_CACHE_L1I },
+	{ "COUNT_HW_CACHE_LL",	  	  PERF_COUNT_HW_CACHE_LL },
+	{ "COUNT_HW_CACHE_DTLB",	  PERF_COUNT_HW_CACHE_DTLB },
+	{ "COUNT_HW_CACHE_ITLB",	  PERF_COUNT_HW_CACHE_ITLB },
+	{ "COUNT_HW_CACHE_BPU",		  PERF_COUNT_HW_CACHE_BPU },
+	{ "COUNT_HW_CACHE_OP_READ",	  PERF_COUNT_HW_CACHE_OP_READ },
+	{ "COUNT_HW_CACHE_OP_WRITE",	  PERF_COUNT_HW_CACHE_OP_WRITE },
+	{ "COUNT_HW_CACHE_OP_PREFETCH",	  PERF_COUNT_HW_CACHE_OP_PREFETCH },
+	{ "COUNT_HW_CACHE_RESULT_ACCESS", PERF_COUNT_HW_CACHE_RESULT_ACCESS },
+	{ "COUNT_HW_CACHE_RESULT_MISS",   PERF_COUNT_HW_CACHE_RESULT_MISS },
+
+	{ "COUNT_SW_CPU_CLOCK",	       PERF_COUNT_SW_CPU_CLOCK },
+	{ "COUNT_SW_TASK_CLOCK",       PERF_COUNT_SW_TASK_CLOCK },
+	{ "COUNT_SW_PAGE_FAULTS",      PERF_COUNT_SW_PAGE_FAULTS },
+	{ "COUNT_SW_CONTEXT_SWITCHES", PERF_COUNT_SW_CONTEXT_SWITCHES },
+	{ "COUNT_SW_CPU_MIGRATIONS",   PERF_COUNT_SW_CPU_MIGRATIONS },
+	{ "COUNT_SW_PAGE_FAULTS_MIN",  PERF_COUNT_SW_PAGE_FAULTS_MIN },
+	{ "COUNT_SW_PAGE_FAULTS_MAJ",  PERF_COUNT_SW_PAGE_FAULTS_MAJ },
+	{ "COUNT_SW_ALIGNMENT_FAULTS", PERF_COUNT_SW_ALIGNMENT_FAULTS },
+	{ "COUNT_SW_EMULATION_FAULTS", PERF_COUNT_SW_EMULATION_FAULTS },
+
+	{ "SAMPLE_IP",	      PERF_SAMPLE_IP },
+	{ "SAMPLE_TID",	      PERF_SAMPLE_TID },
+	{ "SAMPLE_TIME",      PERF_SAMPLE_TIME },
+	{ "SAMPLE_ADDR",      PERF_SAMPLE_ADDR },
+	{ "SAMPLE_READ",      PERF_SAMPLE_READ },
+	{ "SAMPLE_CALLCHAIN", PERF_SAMPLE_CALLCHAIN },
+	{ "SAMPLE_ID",	      PERF_SAMPLE_ID },
+	{ "SAMPLE_CPU",	      PERF_SAMPLE_CPU },
+	{ "SAMPLE_PERIOD",    PERF_SAMPLE_PERIOD },
+	{ "SAMPLE_STREAM_ID", PERF_SAMPLE_STREAM_ID },
+	{ "SAMPLE_RAW",	      PERF_SAMPLE_RAW },
+
+	{ "FORMAT_TOTAL_TIME_ENABLED", PERF_FORMAT_TOTAL_TIME_ENABLED },
+	{ "FORMAT_TOTAL_TIME_RUNNING", PERF_FORMAT_TOTAL_TIME_RUNNING },
+	{ "FORMAT_ID",		       PERF_FORMAT_ID },
+	{ "FORMAT_GROUP",	       PERF_FORMAT_GROUP },
+
+	{ "RECORD_MMAP",       PERF_RECORD_MMAP },
+	{ "RECORD_LOST",       PERF_RECORD_LOST },
+	{ "RECORD_COMM",       PERF_RECORD_COMM },
+	{ "RECORD_EXIT",       PERF_RECORD_EXIT },
+	{ "RECORD_THROTTLE",   PERF_RECORD_THROTTLE },
+	{ "RECORD_UNTHROTTLE", PERF_RECORD_UNTHROTTLE },
+	{ "RECORD_FORK",       PERF_RECORD_FORK },
+	{ "RECORD_READ",       PERF_RECORD_READ },
+	{ "RECORD_SAMPLE",     PERF_RECORD_SAMPLE },
+	{ NULL, },
+};
+
+static PyMethodDef perf__methods[] = {
+	{ NULL, NULL }
+};
+
+PyMODINIT_FUNC initperf(void)
+{
+	PyObject *obj;
+	int i;
+	PyObject *dict, *module = Py_InitModule("perf", perf__methods);
+
+	if (module == NULL ||
+	    pyrf_event__setup_types() < 0 ||
+	    pyrf_evlist__setup_types() < 0 ||
+	    pyrf_evsel__setup_types() < 0 ||
+	    pyrf_thread_map__setup_types() < 0 ||
+	    pyrf_cpu_map__setup_types() < 0)
+		return;
+
+	Py_INCREF(&pyrf_evlist__type);
+	PyModule_AddObject(module, "evlist", (PyObject*)&pyrf_evlist__type);
+
+	Py_INCREF(&pyrf_evsel__type);
+	PyModule_AddObject(module, "evsel", (PyObject*)&pyrf_evsel__type);
+
+	Py_INCREF(&pyrf_thread_map__type);
+	PyModule_AddObject(module, "thread_map", (PyObject*)&pyrf_thread_map__type);
+
+	Py_INCREF(&pyrf_cpu_map__type);
+	PyModule_AddObject(module, "cpu_map", (PyObject*)&pyrf_cpu_map__type);
+
+	dict = PyModule_GetDict(module);
+	if (dict == NULL)
+		goto error;
+
+	for (i = 0; perf__constants[i].name != NULL; i++) {
+		obj = PyInt_FromLong(perf__constants[i].value);
+		if (obj == NULL)
+			goto error;
+		PyDict_SetItemString(dict, perf__constants[i].name, obj);
+		Py_DECREF(obj);
+	}
+
+error:
+	if (PyErr_Occurred())
+		PyErr_SetString(PyExc_ImportError, "perf: Init failed!");
+}
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
new file mode 100644
index 0000000..496d7f4
--- /dev/null
+++ b/tools/perf/util/setup.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python2
+
+from distutils.core import setup, Extension
+
+perf = Extension('perf',
+		  sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c',
+			     'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c',
+			     'util/util.c', 'util/xyarray.c'],
+		  include_dirs = ['util/include'])
+
+setup(name='perf',
+      version='0.1',
+      description='Interface with the Linux profiling infrastructure',
+      author='Arnaldo Carvalho de Melo',
+      author_email='acme@redhat.com',
+      license='GPLv2',
+      url='http://perf.wiki.kernel.org',
+      ext_modules=[perf])
-- 
cgit v0.10.2


From f8a9530939ed87b9a1b1a038b90e355098b679a2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 30 Jan 2011 10:46:46 -0200
Subject: perf evlist: Move evlist methods to evlist.c

They were on evsel.c because they came from refactoring existing evsel
methods, so, to make reviewing the changes easier, I kept it there, now
its a plain move.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 917fc18..dcd5932 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1,11 +1,26 @@
+/*
+ * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Parts came from builtin-{top,stat,record}.c, see those files for further
+ * copyright notes.
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
 #include <poll.h>
+#include "cpumap.h"
+#include "thread_map.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "util.h"
 
+#include <sys/mman.h>
+
 #include <linux/bitops.h>
 #include <linux/hash.h>
 
+#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
+#define SID(e, x, y) xyarray__entry(e->id, x, y)
+
 void perf_evlist__init(struct perf_evlist *evlist)
 {
 	int i;
@@ -88,6 +103,30 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd)
 	evlist->nr_fds++;
 }
 
+static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
+			       int cpu, int thread, int fd)
+{
+	struct perf_sample_id *sid;
+	u64 read_data[4] = { 0, };
+	int hash, id_idx = 1; /* The first entry is the counter value */
+
+	if (!(evsel->attr.read_format & PERF_FORMAT_ID) ||
+	    read(fd, &read_data, sizeof(read_data)) == -1)
+		return -1;
+
+	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		++id_idx;
+	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		++id_idx;
+
+	sid = SID(evsel, cpu, thread);
+	sid->id = read_data[id_idx];
+	sid->evsel = evsel;
+	hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
+	hlist_add_head(&sid->node, &evlist->heads[hash]);
+	return 0;
+}
+
 struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
 {
 	struct hlist_head *head;
@@ -173,3 +212,106 @@ union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
 
 	return event;
 }
+
+void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < ncpus; cpu++) {
+		if (evlist->mmap[cpu].base != NULL) {
+			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
+			evlist->mmap[cpu].base = NULL;
+		}
+	}
+}
+
+int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus)
+{
+	evlist->mmap = zalloc(ncpus * sizeof(struct perf_mmap));
+	return evlist->mmap != NULL ? 0 : -ENOMEM;
+}
+
+static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot,
+			       int mask, int fd)
+{
+	evlist->mmap[cpu].prev = 0;
+	evlist->mmap[cpu].mask = mask;
+	evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot,
+				      MAP_SHARED, fd, 0);
+	if (evlist->mmap[cpu].base == MAP_FAILED)
+		return -1;
+
+	perf_evlist__add_pollfd(evlist, fd);
+	return 0;
+}
+
+/** perf_evlist__mmap - Create per cpu maps to receive events
+ *
+ * @evlist - list of events
+ * @cpus - cpu map being monitored
+ * @threads - threads map being monitored
+ * @pages - map length in pages
+ * @overwrite - overwrite older events?
+ *
+ * If overwrite is false the user needs to signal event consuption using:
+ *
+ *	struct perf_mmap *m = &evlist->mmap[cpu];
+ *	unsigned int head = perf_mmap__read_head(m);
+ *
+ *	perf_mmap__write_tail(m, head)
+ */
+int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
+		      struct thread_map *threads, int pages, bool overwrite)
+{
+	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
+	int mask = pages * page_size - 1, cpu;
+	struct perf_evsel *first_evsel, *evsel;
+	int thread, prot = PROT_READ | (overwrite ? 0 : PROT_WRITE);
+
+	if (evlist->mmap == NULL &&
+	    perf_evlist__alloc_mmap(evlist, cpus->nr) < 0)
+		return -ENOMEM;
+
+	if (evlist->pollfd == NULL &&
+	    perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0)
+		return -ENOMEM;
+
+	evlist->overwrite = overwrite;
+	evlist->mmap_len = (pages + 1) * page_size;
+	first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
+
+	list_for_each_entry(evsel, &evlist->entries, node) {
+		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
+		    evsel->id == NULL &&
+		    perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0)
+			return -ENOMEM;
+
+		for (cpu = 0; cpu < cpus->nr; cpu++) {
+			for (thread = 0; thread < threads->nr; thread++) {
+				int fd = FD(evsel, cpu, thread);
+
+				if (evsel->idx || thread) {
+					if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
+						  FD(first_evsel, cpu, 0)) != 0)
+						goto out_unmap;
+				} else if (__perf_evlist__mmap(evlist, cpu, prot, mask, fd) < 0)
+					goto out_unmap;
+
+				if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
+				    perf_evlist__id_hash(evlist, evsel, cpu, thread, fd) < 0)
+					goto out_unmap;
+			}
+		}
+	}
+
+	return 0;
+
+out_unmap:
+	for (cpu = 0; cpu < cpus->nr; cpu++) {
+		if (evlist->mmap[cpu].base != NULL) {
+			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
+			evlist->mmap[cpu].base = NULL;
+		}
+	}
+	return -1;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 022ae40..85aca6e 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -6,6 +6,8 @@
 #include "event.h"
 
 struct pollfd;
+struct thread_map;
+struct cpu_map;
 
 #define PERF_EVLIST__HLIST_BITS 8
 #define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)
@@ -39,4 +41,9 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
 
 union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu);
 
+int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus);
+int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
+		      struct thread_map *threads, int pages, bool overwrite);
+void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus);
+
 #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index fddeb08..2720bc1 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1,18 +1,19 @@
+/*
+ * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Parts came from builtin-{top,stat,record}.c, see those files for further
+ * copyright notes.
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
 #include "evsel.h"
 #include "evlist.h"
-#include "../perf.h"
 #include "util.h"
 #include "cpumap.h"
 #include "thread_map.h"
 
-#include <unistd.h>
-#include <sys/mman.h>
-
-#include <linux/bitops.h>
-#include <linux/hash.h>
-
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
-#define SID(e, x, y) xyarray__entry(e->id, x, y)
 
 void perf_evsel__init(struct perf_evsel *evsel,
 		      struct perf_event_attr *attr, int idx)
@@ -74,24 +75,6 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 		}
 }
 
-void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus)
-{
-	int cpu;
-
-	for (cpu = 0; cpu < ncpus; cpu++) {
-		if (evlist->mmap[cpu].base != NULL) {
-			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
-			evlist->mmap[cpu].base = NULL;
-		}
-	}
-}
-
-int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus)
-{
-	evlist->mmap = zalloc(ncpus * sizeof(struct perf_mmap));
-	return evlist->mmap != NULL ? 0 : -ENOMEM;
-}
-
 void perf_evsel__exit(struct perf_evsel *evsel)
 {
 	assert(list_empty(&evsel->node));
@@ -258,115 +241,6 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit);
 }
 
-static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot,
-			       int mask, int fd)
-{
-	evlist->mmap[cpu].prev = 0;
-	evlist->mmap[cpu].mask = mask;
-	evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot,
-				      MAP_SHARED, fd, 0);
-	if (evlist->mmap[cpu].base == MAP_FAILED)
-		return -1;
-
-	perf_evlist__add_pollfd(evlist, fd);
-	return 0;
-}
-
-static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
-			       int cpu, int thread, int fd)
-{
-	struct perf_sample_id *sid;
-	u64 read_data[4] = { 0, };
-	int hash, id_idx = 1; /* The first entry is the counter value */
-
-	if (!(evsel->attr.read_format & PERF_FORMAT_ID) ||
-	    read(fd, &read_data, sizeof(read_data)) == -1)
-		return -1;
-
-	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		++id_idx;
-	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		++id_idx;
-
-	sid = SID(evsel, cpu, thread);
-	sid->id = read_data[id_idx];
-	sid->evsel = evsel;
-	hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
-	hlist_add_head(&sid->node, &evlist->heads[hash]);
-	return 0;
-}
-
-/** perf_evlist__mmap - Create per cpu maps to receive events
- *
- * @evlist - list of events
- * @cpus - cpu map being monitored
- * @threads - threads map being monitored
- * @pages - map length in pages
- * @overwrite - overwrite older events?
- *
- * If overwrite is false the user needs to signal event consuption using:
- *
- *	struct perf_mmap *m = &evlist->mmap[cpu];
- *	unsigned int head = perf_mmap__read_head(m);
- *
- *	perf_mmap__write_tail(m, head)
- */
-int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
-		      struct thread_map *threads, int pages, bool overwrite)
-{
-	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
-	int mask = pages * page_size - 1, cpu;
-	struct perf_evsel *first_evsel, *evsel;
-	int thread, prot = PROT_READ | (overwrite ? 0 : PROT_WRITE);
-
-	if (evlist->mmap == NULL &&
-	    perf_evlist__alloc_mmap(evlist, cpus->nr) < 0)
-		return -ENOMEM;
-
-	if (evlist->pollfd == NULL &&
-	    perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0)
-		return -ENOMEM;
-
-	evlist->overwrite = overwrite;
-	evlist->mmap_len = (pages + 1) * page_size;
-	first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
-
-	list_for_each_entry(evsel, &evlist->entries, node) {
-		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
-		    evsel->id == NULL &&
-		    perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0)
-			return -ENOMEM;
-
-		for (cpu = 0; cpu < cpus->nr; cpu++) {
-			for (thread = 0; thread < threads->nr; thread++) {
-				int fd = FD(evsel, cpu, thread);
-
-				if (evsel->idx || thread) {
-					if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
-						  FD(first_evsel, cpu, 0)) != 0)
-						goto out_unmap;
-				} else if (__perf_evlist__mmap(evlist, cpu, prot, mask, fd) < 0)
-					goto out_unmap;
-
-				if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
-				    perf_evlist__id_hash(evlist, evsel, cpu, thread, fd) < 0)
-					goto out_unmap;
-			}
-		}
-	}
-
-	return 0;
-
-out_unmap:
-	for (cpu = 0; cpu < cpus->nr; cpu++) {
-		if (evlist->mmap[cpu].base != NULL) {
-			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
-			evlist->mmap[cpu].base = NULL;
-		}
-	}
-	return -1;
-}
-
 static int perf_event__parse_id_sample(const union perf_event *event, u64 type,
 				       struct perf_sample *sample)
 {
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 7962e75..eecdc3a 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -60,7 +60,6 @@ void perf_evsel__delete(struct perf_evsel *evsel);
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
 int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
-int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus);
 void perf_evsel__free_fd(struct perf_evsel *evsel);
 void perf_evsel__free_id(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
@@ -71,9 +70,6 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 				struct thread_map *threads, bool group, bool inherit);
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 		     struct thread_map *threads, bool group, bool inherit);
-int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
-		      struct thread_map *threads, int pages, bool overwrite);
-void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus);
 
 #define perf_evsel__match(evsel, t, c)		\
 	(evsel->attr.type == PERF_TYPE_##t &&	\
-- 
cgit v0.10.2


From 1fb0ef31f428f345a7c3666f8e7444a563edd537 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 31 Jan 2011 08:57:41 +0100
Subject: genirq: Fix affinity notifier fallout

The new code of commit cd7eab44e(genirq: Add IRQ affinity notifiers)
references irq_desc.affinity which fails to compile with
CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED=y.

Use irq_desc.irq_data.affinity instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ben Hutchings <bhutchings@solarflare.com>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0587c5c..538fce2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -179,7 +179,7 @@ static void irq_affinity_notify(struct work_struct *work)
 		cpumask_copy(cpumask, desc->pending_mask);
 	else
 #endif
-		cpumask_copy(cpumask, desc->affinity);
+		cpumask_copy(cpumask, desc->irq_data.affinity);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
 	notify->notify(notify, cpumask);
-- 
cgit v0.10.2


From 871cf1e5f2a17702f58539a3af8b18fc8666ad4c Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:58:55 +0100
Subject: time: Move do_timer() to kernel/time/timekeeping.c

do_timer() is primary timekeeping related. calc_global_load() is
called from do_timer() as well, but that's more for historical
reasons.

[ tglx: Fixed up the calc_global_load() reject andmassaged changelog ]

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: yong.zhang0@gmail.com
Cc: hch@infradead.org
LKML-Reference: <20110127145855.23248.56933.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/time.h b/include/linux/time.h
index 1e6d3b5..86a9c48 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -166,7 +166,6 @@ extern void monotonic_to_bootbased(struct timespec *ts);
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 extern int timekeeping_valid_for_hres(void);
 extern u64 timekeeping_max_deferment(void);
-extern void update_wall_time(void);
 extern void timekeeping_leap_insert(int leapsecond);
 
 struct tms;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d27c756..c1a178c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -779,7 +779,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
  *
  * Called from the timer interrupt, must hold a write on xtime_lock.
  */
-void update_wall_time(void)
+static void update_wall_time(void)
 {
 	struct clocksource *clock;
 	cycle_t offset;
@@ -946,3 +946,15 @@ struct timespec get_monotonic_coarse(void)
 				now.tv_nsec + mono.tv_nsec);
 	return now;
 }
+
+/*
+ * The 64-bit jiffies value is not atomic - you MUST NOT read it
+ * without sampling the sequence number in xtime_lock.
+ * jiffies is defined in the linker script...
+ */
+void do_timer(unsigned long ticks)
+{
+	jiffies_64 += ticks;
+	update_wall_time();
+	calc_global_load(ticks);
+}
diff --git a/kernel/timer.c b/kernel/timer.c
index 43ca993..87f656c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1293,19 +1293,6 @@ void run_local_timers(void)
 	raise_softirq(TIMER_SOFTIRQ);
 }
 
-/*
- * The 64-bit jiffies value is not atomic - you MUST NOT read it
- * without sampling the sequence number in xtime_lock.
- * jiffies is defined in the linker script...
- */
-
-void do_timer(unsigned long ticks)
-{
-	jiffies_64 += ticks;
-	update_wall_time();
-	calc_global_load(ticks);
-}
-
 #ifdef __ARCH_WANT_SYS_ALARM
 
 /*
-- 
cgit v0.10.2


From fbad1ea94159a71bc0f68b00e57ae803606af9fb Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:00 +0100
Subject: time: Move get_jiffies_64 to kernel/time/jiffies.c

Move the jiffies access functions to the jiffies clocksource code.

[ tglx: Add missing include ]

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: yong.zhang0@gmail.com
Cc: hch@infradead.org
LKML-Reference: <20110127145900.23248.73352.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/time.c b/kernel/time.c
index 3217435..a31b512 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -674,23 +674,6 @@ unsigned long nsecs_to_jiffies(u64 n)
 #endif
 }
 
-#if (BITS_PER_LONG < 64)
-u64 get_jiffies_64(void)
-{
-	unsigned long seq;
-	u64 ret;
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		ret = jiffies_64;
-	} while (read_seqretry(&xtime_lock, seq));
-	return ret;
-}
-EXPORT_SYMBOL(get_jiffies_64);
-#endif
-
-EXPORT_SYMBOL(jiffies);
-
 /*
  * Add two timespec values and do a safety check for overflow.
  * It's assumed that both values are valid (>= 0)
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a84..2fbc207 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,6 +22,7 @@
 ************************************************************************/
 #include <linux/clocksource.h>
 #include <linux/jiffies.h>
+#include <linux/module.h>
 #include <linux/init.h>
 
 /* The Jiffies based clocksource is the lowest common
@@ -64,6 +65,23 @@ struct clocksource clocksource_jiffies = {
 	.shift		= JIFFIES_SHIFT,
 };
 
+#if (BITS_PER_LONG < 64)
+u64 get_jiffies_64(void)
+{
+	unsigned long seq;
+	u64 ret;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		ret = jiffies_64;
+	} while (read_seqretry(&xtime_lock, seq));
+	return ret;
+}
+EXPORT_SYMBOL(get_jiffies_64);
+#endif
+
+EXPORT_SYMBOL(jiffies);
+
 static int __init init_jiffies_clocksource(void)
 {
 	return clocksource_register(&clocksource_jiffies);
-- 
cgit v0.10.2


From 48cf76f7104f655bbd48a75c7759dce82c3e1ab6 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:05 +0100
Subject: time: Provide get_xtime_and_monotonic_offset()

The hrtimer code accesses timekeeping variables under
xtime_lock. Provide a sensible accessor function and use it.

[ tglx: Removed the conditionals, unused variable, fixed codingstyle
  	and massaged changelog ]

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: yong.zhang0@gmail.com
Cc: hch@infradead.org
LKML-Reference: <20110127145905.23248.30458.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/time.h b/include/linux/time.h
index 86a9c48..4007a12 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -127,6 +127,7 @@ struct timespec current_kernel_time(void);
 struct timespec __current_kernel_time(void); /* does not take xtime_lock */
 struct timespec __get_wall_to_monotonic(void); /* does not take xtime_lock */
 struct timespec get_monotonic_coarse(void);
+void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom);
 
 #define CURRENT_TIME		(current_kernel_time())
 #define CURRENT_TIME_SEC	((struct timespec) { get_seconds(), 0 })
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0c8d7c0..57c4d33 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -85,13 +85,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 {
 	ktime_t xtim, tomono;
 	struct timespec xts, tom;
-	unsigned long seq;
 
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		xts = __current_kernel_time();
-		tom = __get_wall_to_monotonic();
-	} while (read_seqretry(&xtime_lock, seq));
+	get_xtime_and_monotonic_offset(&xts, &tom);
 
 	xtim = timespec_to_ktime(xts);
 	tomono = timespec_to_ktime(tom);
@@ -612,15 +607,11 @@ static void retrigger_next_event(void *arg)
 {
 	struct hrtimer_cpu_base *base;
 	struct timespec realtime_offset, wtm;
-	unsigned long seq;
 
 	if (!hrtimer_hres_active())
 		return;
 
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		wtm = __get_wall_to_monotonic();
-	} while (read_seqretry(&xtime_lock, seq));
+	get_xtime_and_monotonic_offset(&realtime_offset, &wtm);
 	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
 
 	base = &__get_cpu_var(hrtimer_bases);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c1a178c..c50aaf6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -958,3 +958,19 @@ void do_timer(unsigned long ticks)
 	update_wall_time();
 	calc_global_load(ticks);
 }
+
+/**
+ * get_xtime_and_monotonic_offset() - get xtime and wall_to_monotonic
+ * @xtim:	pointer to timespec to be set with xtime
+ * @wtom:	pointer to timespec to be set with wall_to_monotonic
+ */
+void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom)
+{
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		*xtim = xtime;
+		*wtom = wall_to_monotonic;
+	} while (read_seqretry(&xtime_lock, seq));
+}
-- 
cgit v0.10.2


From 79ecaf0d15344d78904becf0f25de3fc9b49d430 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 31 Jan 2011 11:07:54 +0100
Subject: time: Remove unused __get_wall_to_monotonic()

No users left. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/time.h b/include/linux/time.h
index 4007a12..ce29c86 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -125,7 +125,6 @@ extern int timekeeping_suspended;
 unsigned long get_seconds(void);
 struct timespec current_kernel_time(void);
 struct timespec __current_kernel_time(void); /* does not take xtime_lock */
-struct timespec __get_wall_to_monotonic(void); /* does not take xtime_lock */
 struct timespec get_monotonic_coarse(void);
 void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom);
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c50aaf6..8da35d1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -910,11 +910,6 @@ struct timespec __current_kernel_time(void)
 	return xtime;
 }
 
-struct timespec __get_wall_to_monotonic(void)
-{
-	return wall_to_monotonic;
-}
-
 struct timespec current_kernel_time(void)
 {
 	struct timespec now;
-- 
cgit v0.10.2


From f0af911a9dec9de702645182c8d269449e24d24b Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:10 +0100
Subject: time: Provide xtime_update()

xtime_update() takes xtime_lock write locked and calls
do_timer(). Provided to replace the do_timer() calls in the
architecture code.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: yong.zhang0@gmail.com
Cc: hch@infradead.org
LKML-Reference: <20110127145910.23248.21379.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d747f94..9d9a078 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2050,6 +2050,7 @@ extern void release_uids(struct user_namespace *ns);
 #include <asm/current.h>
 
 extern void do_timer(unsigned long ticks);
+extern void xtime_update(unsigned long ticks);
 
 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
 extern int wake_up_process(struct task_struct *tsk);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8da35d1..02c13a3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -969,3 +969,16 @@ void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom
 		*wtom = wall_to_monotonic;
 	} while (read_seqretry(&xtime_lock, seq));
 }
+
+/**
+ * xtime_update() - advances the timekeeping infrastructure
+ * @ticks:	number of ticks, that have elapsed since the last call.
+ *
+ * Must be called with interrupts disabled.
+ */
+void xtime_update(unsigned long ticks)
+{
+	write_seqlock(&xtime_lock);
+	do_timer(ticks);
+	write_sequnlock(&xtime_lock);
+}
-- 
cgit v0.10.2


From 1340f3e0b29b745a33f431455c3a37f48197bc81 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:15 +0100
Subject: alpha: Change do_timer() to xtime_update()

xtime_update() takes the xtime_lock itself.

timer_interrupt() is only called on the boot cpu. See do_entInt(). So
"state" in timer_interrupt does not require protection by xtime_lock.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127145915.23248.20919.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index c1f3e7c..a58e84f 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -159,7 +159,7 @@ void read_persistent_clock(struct timespec *ts)
 
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * as well as call the "xtime_update()" routine every clocktick
  */
 irqreturn_t timer_interrupt(int irq, void *dev)
 {
@@ -172,8 +172,6 @@ irqreturn_t timer_interrupt(int irq, void *dev)
 	profile_tick(CPU_PROFILING);
 #endif
 
-	write_seqlock(&xtime_lock);
-
 	/*
 	 * Calculate how many ticks have passed since the last update,
 	 * including any previous partial leftover.  Save any resulting
@@ -187,9 +185,7 @@ irqreturn_t timer_interrupt(int irq, void *dev)
 	nticks = delta >> FIX_SHIFT;
 
 	if (nticks)
-		do_timer(nticks);
-
-	write_sequnlock(&xtime_lock);
+		xtime_update(nticks);
 
 	if (test_irq_work_pending()) {
 		clear_irq_work_pending();
-- 
cgit v0.10.2


From 6906e33cc555c390cd091f6f363b783322dfedf6 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:21 +0100
Subject: arm: Switch from do_timer() to xtime_update()

xtime_update takes the xtime_lock itself.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127145920.23248.75541.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index 3d76bf2..1ff46ca 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -107,9 +107,7 @@ void timer_tick(void)
 {
 	profile_tick(CPU_PROFILING);
 	do_leds();
-	write_seqlock(&xtime_lock);
-	do_timer(1);
-	write_sequnlock(&xtime_lock);
+	xtime_update(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(get_irq_regs()));
 #endif
-- 
cgit v0.10.2


From ec2dff2febf19ff2109c2eb3e56d5a969fe399e2 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:26 +0100
Subject: arm/mach-clps711x: Switch do_timer() to xtime_update()

do_timer() requires holding the xtime_lock, which this
code did not do. Use the safe version xtime_update()

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127145926.23248.56369.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/arm/mach-clps711x/include/mach/time.h b/arch/arm/mach-clps711x/include/mach/time.h
index 8fe283c..61fef91 100644
--- a/arch/arm/mach-clps711x/include/mach/time.h
+++ b/arch/arm/mach-clps711x/include/mach/time.h
@@ -30,7 +30,7 @@ p720t_timer_interrupt(int irq, void *dev_id)
 {
 	struct pt_regs *regs = get_irq_regs();
 	do_leds();
-	do_timer(1);
+	xtime_update(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
-- 
cgit v0.10.2


From 4196b892d55caaf2c98da05e80472ca482ca19fe Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:31 +0100
Subject: blackfin: Switch from do_timer() to xtime_update()

xtime_update() takes the xtime_lock itself.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127145931.23248.33917.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c
index c911361..8d73724 100644
--- a/arch/blackfin/kernel/time.c
+++ b/arch/blackfin/kernel/time.c
@@ -114,16 +114,14 @@ u32 arch_gettimeoffset(void)
 
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * as well as call the "xtime_update()" routine every clocktick
  */
 #ifdef CONFIG_CORE_TIMER_IRQ_L1
 __attribute__((l1_text))
 #endif
 irqreturn_t timer_interrupt(int irq, void *dummy)
 {
-	write_seqlock(&xtime_lock);
-	do_timer(1);
-	write_sequnlock(&xtime_lock);
+	xtime_update(1);
 
 #ifdef CONFIG_IPIPE
 	update_root_process_times(get_irq_regs());
-- 
cgit v0.10.2


From 17588b99183ece563013622afeefd37eb8e68fd3 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:36 +0100
Subject: cris: arch-v10: Switch do_timer() to xtime_update()

This code failed to take the xtime_lock, which must be held when
calling do_timer(). Use the safe version xtime_update()

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: hch@infradead.org
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: Mikael Starvik <starvik@axis.com>
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127145936.23248.16192.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c
index 00eb36f..20c85b5 100644
--- a/arch/cris/arch-v10/kernel/time.c
+++ b/arch/cris/arch-v10/kernel/time.c
@@ -140,7 +140,7 @@ stop_watchdog(void)
 
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * as well as call the "xtime_update()" routine every clocktick
  */
 
 //static unsigned short myjiff; /* used by our debug routine print_timestamp */
@@ -176,7 +176,7 @@ timer_interrupt(int irq, void *dev_id)
 
 	/* call the real timer interrupt handler */
 
-	do_timer(1);
+	xtime_update(1);
 	
         cris_do_profile(regs); /* Save profiling information */
         return IRQ_HANDLED;
-- 
cgit v0.10.2


From 36cb07bb8118cb14211ef25c58026f005877c47d Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:41 +0100
Subject: cris: arch-v32: Switch do_timer() to xtime_update()

xtime_update() takes the xtime_lock itself.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: hch@infradead.org
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: Mikael Starvik <starvik@axis.com>
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127145941.23248.92547.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/cris/arch-v32/kernel/time.c b/arch/cris/arch-v32/kernel/time.c
index a545211..bb978ed 100644
--- a/arch/cris/arch-v32/kernel/time.c
+++ b/arch/cris/arch-v32/kernel/time.c
@@ -183,7 +183,7 @@ void handle_watchdog_bite(struct pt_regs *regs)
 
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick.
+ * as well as call the "xtime_update()" routine every clocktick.
  */
 extern void cris_do_profile(struct pt_regs *regs);
 
@@ -216,9 +216,7 @@ static inline irqreturn_t timer_interrupt(int irq, void *dev_id)
 		return IRQ_HANDLED;
 
 	/* Call the real timer interrupt handler */
-	write_seqlock(&xtime_lock);
-	do_timer(1);
-	write_sequnlock(&xtime_lock);
+	xtime_update(1);
         return IRQ_HANDLED;
 }
 
-- 
cgit v0.10.2


From 57464bd87f708e75b47312766e3fc8dc3aaf66ad Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:46 +0100
Subject: frv: Switch do_timer() to xtime_update()

__set_LEDS() does not need to be protected by xtime_lock.
its used unprotected in other places.

[ tglx: Removed stale comment ]

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: hch@infradead.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: David Howells <dhowells@redhat.com>
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127145946.23248.57952.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index 0ddbbae..b457de4 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -50,21 +50,13 @@ static struct irqaction timer_irq  = {
 
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * as well as call the "xtime_update()" routine every clocktick
  */
 static irqreturn_t timer_interrupt(int irq, void *dummy)
 {
 	profile_tick(CPU_PROFILING);
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don't need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
 
-	do_timer(1);
+	xtime_update(1);
 
 #ifdef CONFIG_HEARTBEAT
 	static unsigned short n;
@@ -72,8 +64,6 @@ static irqreturn_t timer_interrupt(int irq, void *dummy)
 	__set_LEDS(n);
 #endif /* CONFIG_HEARTBEAT */
 
-	write_sequnlock(&xtime_lock);
-
 	update_process_times(user_mode(get_irq_regs()));
 
 	return IRQ_HANDLED;
-- 
cgit v0.10.2


From daad8b581e7f5e21a2f79e49d57d4f6a73b26510 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:51 +0100
Subject: h8300: Switch do_timer() to xtime_update()

xtime_update() takes the xtime_lock itself.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127145951.23248.92727.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c
index 165005a..32263a1 100644
--- a/arch/h8300/kernel/time.c
+++ b/arch/h8300/kernel/time.c
@@ -35,9 +35,7 @@ void h8300_timer_tick(void)
 {
 	if (current->pid)
 		profile_tick(CPU_PROFILING);
-	write_seqlock(&xtime_lock);
-	do_timer(1);
-	write_sequnlock(&xtime_lock);
+	xtime_update(1);
 	update_process_times(user_mode(get_irq_regs()));
 }
 
diff --git a/arch/h8300/kernel/timer/timer8.c b/arch/h8300/kernel/timer/timer8.c
index 3946c0f..7a1533f 100644
--- a/arch/h8300/kernel/timer/timer8.c
+++ b/arch/h8300/kernel/timer/timer8.c
@@ -61,7 +61,7 @@
 
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * as well as call the "xtime_update()" routine every clocktick
  */
 
 static irqreturn_t timer_interrupt(int irq, void *dev_id)
-- 
cgit v0.10.2


From 1aabd67d2e97e6affdf5a7c65f442ac91ace3f85 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 15:59:56 +0100
Subject: ia64: Switch do_timer() to xtime_update()

local_cpu_data->itm_next = new_itm; does not need to be protected by
xtime_lock. xtime_update() takes the lock itself.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127145956.23248.49107.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 9702fa9..156ad80 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -190,19 +190,10 @@ timer_interrupt (int irq, void *dev_id)
 
 		new_itm += local_cpu_data->itm_delta;
 
-		if (smp_processor_id() == time_keeper_id) {
-			/*
-			 * Here we are in the timer irq handler. We have irqs locally
-			 * disabled, but we don't know if the timer_bh is running on
-			 * another CPU. We need to avoid to SMP race by acquiring the
-			 * xtime_lock.
-			 */
-			write_seqlock(&xtime_lock);
-			do_timer(1);
-			local_cpu_data->itm_next = new_itm;
-			write_sequnlock(&xtime_lock);
-		} else
-			local_cpu_data->itm_next = new_itm;
+		if (smp_processor_id() == time_keeper_id)
+			xtime_update(1);
+
+		local_cpu_data->itm_next = new_itm;
 
 		if (time_after(new_itm, ia64_get_itc()))
 			break;
@@ -222,7 +213,7 @@ skip_process_time_accounting:
 		 * comfort, we increase the safety margin by
 		 * intentionally dropping the next tick(s).  We do NOT
 		 * update itm.next because that would force us to call
-		 * do_timer() which in turn would let our clock run
+		 * xtime_update() which in turn would let our clock run
 		 * too fast (with the potentially devastating effect
 		 * of losing monotony of time).
 		 */
diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c
index c1c5445..1f8244a 100644
--- a/arch/ia64/xen/time.c
+++ b/arch/ia64/xen/time.c
@@ -139,14 +139,11 @@ consider_steal_time(unsigned long new_itm)
 		run_posix_cpu_timers(p);
 		delta_itm += local_cpu_data->itm_delta * (stolen + blocked);
 
-		if (cpu == time_keeper_id) {
-			write_seqlock(&xtime_lock);
-			do_timer(stolen + blocked);
-			local_cpu_data->itm_next = delta_itm + new_itm;
-			write_sequnlock(&xtime_lock);
-		} else {
-			local_cpu_data->itm_next = delta_itm + new_itm;
-		}
+		if (cpu == time_keeper_id)
+			xtime_update(stolen + blocked);
+
+		local_cpu_data->itm_next = delta_itm + new_itm;
+
 		per_cpu(xen_stolen_time, cpu) += NS_PER_TICK * stolen;
 		per_cpu(xen_blocked_time, cpu) += NS_PER_TICK * blocked;
 	}
-- 
cgit v0.10.2


From 7bde2ab7cb51f14c6f6574f0f5a78445f2caed3e Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 16:00:01 +0100
Subject: m32r: Switch from do_timer() to xtime_update()

xtime_update() does proper locking.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127150001.23248.68620.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index bda8682..84dd040 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -107,15 +107,14 @@ u32 arch_gettimeoffset(void)
 
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * as well as call the "xtime_update()" routine every clocktick
  */
 static irqreturn_t timer_interrupt(int irq, void *dev_id)
 {
 #ifndef CONFIG_SMP
 	profile_tick(CPU_PROFILING);
 #endif
-	/* XXX FIXME. Uh, the xtime_lock should be held here, no? */
-	do_timer(1);
+	xtime_update(1);
 
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(get_irq_regs()));
-- 
cgit v0.10.2


From e53f276beb655c711a5d1f25f800b61aa976e34f Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 16:00:06 +0100
Subject: m68k: Switch do_timer() to xtime_update()

xtime_update() properly takes the xtime_lock

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Sam Creasey <sammy@sammy.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greg Ungerer <gerg@uclinux.org>
LKML-Reference: <20110127150006.23248.71790.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c
index 9fe6fef..1edd950 100644
--- a/arch/m68k/bvme6000/config.c
+++ b/arch/m68k/bvme6000/config.c
@@ -45,8 +45,8 @@ extern int bvme6000_set_clock_mmss (unsigned long);
 extern void bvme6000_reset (void);
 void bvme6000_set_vectors (void);
 
-/* Save tick handler routine pointer, will point to do_timer() in
- * kernel/sched.c, called via bvme6000_process_int() */
+/* Save tick handler routine pointer, will point to xtime_update() in
+ * kernel/timer/timekeeping.c, called via bvme6000_process_int() */
 
 static irq_handler_t tick_handler;
 
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index 06438da..18b34ee 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -37,11 +37,11 @@ static inline int set_rtc_mmss(unsigned long nowtime)
 
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * as well as call the "xtime_update()" routine every clocktick
  */
 static irqreturn_t timer_interrupt(int irq, void *dummy)
 {
-	do_timer(1);
+	xtime_update(1);
 	update_process_times(user_mode(get_irq_regs()));
 	profile_tick(CPU_PROFILING);
 
diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c
index 100baaa..6cb9c3a 100644
--- a/arch/m68k/mvme147/config.c
+++ b/arch/m68k/mvme147/config.c
@@ -46,8 +46,8 @@ extern void mvme147_reset (void);
 
 static int bcd2int (unsigned char b);
 
-/* Save tick handler routine pointer, will point to do_timer() in
- * kernel/sched.c, called via mvme147_process_int() */
+/* Save tick handler routine pointer, will point to xtime_update() in
+ * kernel/time/timekeeping.c, called via mvme147_process_int() */
 
 irq_handler_t tick_handler;
 
diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c
index 11edf61..0b28e26 100644
--- a/arch/m68k/mvme16x/config.c
+++ b/arch/m68k/mvme16x/config.c
@@ -51,8 +51,8 @@ extern void mvme16x_reset (void);
 
 int bcd2int (unsigned char b);
 
-/* Save tick handler routine pointer, will point to do_timer() in
- * kernel/sched.c, called via mvme16x_process_int() */
+/* Save tick handler routine pointer, will point to xtime_update() in
+ * kernel/time/timekeeping.c, called via mvme16x_process_int() */
 
 static irq_handler_t tick_handler;
 
diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c
index 2d9e21b..6464ad3 100644
--- a/arch/m68k/sun3/sun3ints.c
+++ b/arch/m68k/sun3/sun3ints.c
@@ -66,7 +66,7 @@ static irqreturn_t sun3_int5(int irq, void *dev_id)
 #ifdef CONFIG_SUN3
 	intersil_clear();
 #endif
-        do_timer(1);
+	xtime_update(1);
 	update_process_times(user_mode(get_irq_regs()));
         if (!(kstat_cpu(0).irqs[irq] % 20))
                 sun3_leds(led_pattern[(kstat_cpu(0).irqs[irq] % 160) / 20]);
diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c
index d6ac2a4..6623909 100644
--- a/arch/m68knommu/kernel/time.c
+++ b/arch/m68knommu/kernel/time.c
@@ -36,7 +36,7 @@ static inline int set_rtc_mmss(unsigned long nowtime)
 #ifndef CONFIG_GENERIC_CLOCKEVENTS
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * as well as call the "xtime_update()" routine every clocktick
  */
 irqreturn_t arch_timer_interrupt(int irq, void *dummy)
 {
@@ -44,11 +44,7 @@ irqreturn_t arch_timer_interrupt(int irq, void *dummy)
 	if (current->pid)
 		profile_tick(CPU_PROFILING);
 
-	write_seqlock(&xtime_lock);
-
-	do_timer(1);
-
-	write_sequnlock(&xtime_lock);
+	xtime_update(1);
 
 	update_process_times(user_mode(get_irq_regs()));
 
-- 
cgit v0.10.2


From bb1dfc1cf6c51ca42f7c05029a6f06df9092a0fc Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 16:00:17 +0100
Subject: parisc: Switch do_timer() to xtime_update()

xtime_update() takes the xtime_lock itself.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: hch@infradead.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: Helge Deller <deller@gmx.de>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127150017.23248.22559.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 05511cc..45b7389 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -162,11 +162,8 @@ irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id)
 		update_process_times(user_mode(get_irq_regs()));
 	}
 
-	if (cpu == 0) {
-		write_seqlock(&xtime_lock);
-		do_timer(ticks_elapsed);
-		write_sequnlock(&xtime_lock);
-	}
+	if (cpu == 0)
+		xtime_update(ticks_elapsed);
 
 	return IRQ_HANDLED;
 }
-- 
cgit v0.10.2


From 4ea1b72551d052a3993ef72ce06ecf5bdd125859 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 16:00:22 +0100
Subject: sparc: Switch do_timer() to xtime_update()

xtime_update() takes the xtime_lock itself.

pcic_clear_clock_irq() and clear_clock_irq do not need
to be protected by xtime_lock.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127150022.23248.80369.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index aeaa09a..2cdc131 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -700,10 +700,8 @@ static void pcic_clear_clock_irq(void)
 
 static irqreturn_t pcic_timer_handler (int irq, void *h)
 {
-	write_seqlock(&xtime_lock);	/* Dummy, to show that we remember */
 	pcic_clear_clock_irq();
-	do_timer(1);
-	write_sequnlock(&xtime_lock);
+	xtime_update(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(get_irq_regs()));
 #endif
diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c
index 9c743b1..4211bfc 100644
--- a/arch/sparc/kernel/time_32.c
+++ b/arch/sparc/kernel/time_32.c
@@ -85,7 +85,7 @@ int update_persistent_clock(struct timespec now)
 
 /*
  * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * as well as call the "xtime_update()" routine every clocktick
  */
 
 #define TICK_SIZE (tick_nsec / 1000)
@@ -96,14 +96,9 @@ static irqreturn_t timer_interrupt(int dummy, void *dev_id)
 	profile_tick(CPU_PROFILING);
 #endif
 
-	/* Protect counter clear so that do_gettimeoffset works */
-	write_seqlock(&xtime_lock);
-
 	clear_clock_irq();
 
-	do_timer(1);
-
-	write_sequnlock(&xtime_lock);
+	xtime_update(1);
 
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(get_irq_regs()));
-- 
cgit v0.10.2


From d12b0e24c56c6fb2398609f26858e5278d688840 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 16:00:27 +0100
Subject: xtensa: Switch do_timer() to xtime_update()

xtime_update() takes the xtime_lock itself.

set_linux_timer() does not need to be protected by xtime_lock.

[ tglx: This code is broken on SMP anyway. ]

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Chris Zankel <chris@zankel.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: hch@infradead.org
Cc: yong.zhang0@gmail.com
LKML-Reference: <20110127150027.23248.61798.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 19df764..f3e5eb4 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -96,16 +96,12 @@ again:
 		update_process_times(user_mode(get_irq_regs()));
 #endif
 
-		write_seqlock(&xtime_lock);
-
-		do_timer(1); /* Linux handler in kernel/timer.c */
+		xtime_update(1); /* Linux handler in kernel/time/timekeeping */
 
 		/* Note that writing CCOMPARE clears the interrupt. */
 
 		next += CCOUNT_PER_JIFFY;
 		set_linux_timer(next);
-
-		write_sequnlock(&xtime_lock);
 	}
 
 	/* Allow platform to do something useful (Wdog). */
-- 
cgit v0.10.2


From 7e2ed097538c57ff5268e9a6bced7c0b885809c8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 30 Jan 2011 11:59:43 -0200
Subject: perf evlist: Store pointer to the cpu and thread maps

So that we don't have to pass it around to the several methods that
needs it, simplifying usage.

There is one case where we don't have the thread/cpu map in advance,
which is in the parsing routines used by top, stat, record, that we have
to wait till all options are parsed to know if a cpu or thread list was
passed to then create those maps.

For that case consolidate the cpu and thread map creation via
perf_evlist__create_maps() out of the code in top and record, while also
providing a perf_evlist__set_maps() for cases where multiple evlists
share maps or for when maps that represent CPU sockets, for instance,
get crafted out of topology information or subsets of threads in a
particular application are to be monitored, providing more granularity
in specifying which cpus and threads to monitor.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index edc3555..07f8d6d 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -42,7 +42,6 @@ static u64			user_interval			= ULLONG_MAX;
 static u64			default_interval		=      0;
 static u64			sample_type;
 
-static struct cpu_map		*cpus;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=    128;
 static unsigned int		user_freq 			= UINT_MAX;
@@ -58,7 +57,6 @@ static bool			sample_id_all_avail		=   true;
 static bool			system_wide			=  false;
 static pid_t			target_pid			=     -1;
 static pid_t			target_tid			=     -1;
-static struct thread_map	*threads;
 static pid_t			child_pid			=     -1;
 static bool			no_inherit			=  false;
 static enum write_mode_t	write_mode			= WRITE_FORCE;
@@ -189,7 +187,7 @@ static void create_counter(struct perf_evsel *evsel, int cpu)
 	int thread_index;
 	int ret;
 
-	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
+	for (thread_index = 0; thread_index < evsel_list->threads->nr; thread_index++) {
 		h_attr = get_header_attr(attr, evsel->idx);
 		if (h_attr == NULL)
 			die("nomem\n");
@@ -317,7 +315,8 @@ static void open_counters(struct perf_evlist *evlist)
 retry_sample_id:
 		attr->sample_id_all = sample_id_all_avail ? 1 : 0;
 try_again:
-		if (perf_evsel__open(pos, cpus, threads, group, !no_inherit) < 0) {
+		if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group,
+				     !no_inherit) < 0) {
 			int err = errno;
 
 			if (err == EPERM || err == EACCES)
@@ -368,10 +367,10 @@ try_again:
 		}
 	}
 
-	if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, false) < 0)
+	if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
 		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 
-	for (cpu = 0; cpu < cpus->nr; ++cpu) {
+	for (cpu = 0; cpu < evsel_list->cpus->nr; ++cpu) {
 		list_for_each_entry(pos, &evlist->entries, node)
 			create_counter(pos, cpu);
 	}
@@ -450,7 +449,7 @@ static void mmap_read_all(void)
 {
 	int i;
 
-	for (i = 0; i < cpus->nr; i++) {
+	for (i = 0; i < evsel_list->cpus->nr; i++) {
 		if (evsel_list->mmap[i].base)
 			mmap_read(&evsel_list->mmap[i]);
 	}
@@ -584,7 +583,7 @@ static int __cmd_record(int argc, const char **argv)
 		}
 
 		if (!system_wide && target_tid == -1 && target_pid == -1)
-			threads->map[0] = child_pid;
+			evsel_list->threads->map[0] = child_pid;
 
 		close(child_ready_pipe[1]);
 		close(go_pipe[0]);
@@ -718,12 +717,12 @@ static int __cmd_record(int argc, const char **argv)
 		}
 
 		if (done) {
-			for (i = 0; i < cpus->nr; i++) {
+			for (i = 0; i < evsel_list->cpus->nr; i++) {
 				struct perf_evsel *pos;
 
 				list_for_each_entry(pos, &evsel_list->entries, node) {
 					for (thread = 0;
-						thread < threads->nr;
+						thread < evsel_list->threads->nr;
 						thread++)
 						ioctl(FD(pos, i, thread),
 							PERF_EVENT_IOC_DISABLE);
@@ -816,7 +815,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	int err = -ENOMEM;
 	struct perf_evsel *pos;
 
-	evsel_list = perf_evlist__new();
+	evsel_list = perf_evlist__new(NULL, NULL);
 	if (evsel_list == NULL)
 		return -ENOMEM;
 
@@ -850,28 +849,19 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	if (target_pid != -1)
 		target_tid = target_pid;
 
-	threads = thread_map__new(target_pid, target_tid);
-	if (threads == NULL) {
-		pr_err("Problems finding threads of monitor\n");
-		usage_with_options(record_usage, record_options);
-	}
-
-	if (target_tid != -1)
-		cpus = cpu_map__dummy_new();
-	else
-		cpus = cpu_map__new(cpu_list);
-
-	if (cpus == NULL)
+	if (perf_evlist__create_maps(evsel_list, target_pid,
+				     target_tid, cpu_list) < 0)
 		usage_with_options(record_usage, record_options);
 
 	list_for_each_entry(pos, &evsel_list->entries, node) {
-		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+		if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
+					 evsel_list->threads->nr) < 0)
 			goto out_free_fd;
 		if (perf_header__push_event(pos->attr.config, event_name(pos)))
 			goto out_free_fd;
 	}
 
-	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0)
+	if (perf_evlist__alloc_pollfd(evsel_list) < 0)
 		goto out_free_fd;
 
 	if (user_interval != ULLONG_MAX)
@@ -893,10 +883,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	}
 
 	err = __cmd_record(argc, argv);
-
 out_free_fd:
-	thread_map__delete(threads);
-	threads = NULL;
+	perf_evlist__delete_maps(evsel_list);
 out_symbol_exit:
 	symbol__exit();
 	return err;
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 8906adf..e0f9575 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -76,7 +76,6 @@ static struct perf_event_attr default_attrs[] = {
 struct perf_evlist		*evsel_list;
 
 static bool			system_wide			=  false;
-static struct cpu_map		*cpus;
 static int			run_idx				=  0;
 
 static int			run_count			=  1;
@@ -85,7 +84,6 @@ static bool			scale				=  true;
 static bool			no_aggr				= false;
 static pid_t			target_pid			= -1;
 static pid_t			target_tid			= -1;
-static struct thread_map	*threads;
 static pid_t			child_pid			= -1;
 static bool			null_run			=  false;
 static bool			big_num				=  true;
@@ -170,7 +168,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 				    PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide)
-		return perf_evsel__open_per_cpu(evsel, cpus, false, false);
+		return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false, false);
 
 	attr->inherit = !no_inherit;
 	if (target_pid == -1 && target_tid == -1) {
@@ -178,7 +176,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 		attr->enable_on_exec = 1;
 	}
 
-	return perf_evsel__open_per_thread(evsel, threads, false, false);
+	return perf_evsel__open_per_thread(evsel, evsel_list->threads, false, false);
 }
 
 /*
@@ -203,7 +201,8 @@ static int read_counter_aggr(struct perf_evsel *counter)
 	u64 *count = counter->counts->aggr.values;
 	int i;
 
-	if (__perf_evsel__read(counter, cpus->nr, threads->nr, scale) < 0)
+	if (__perf_evsel__read(counter, evsel_list->cpus->nr,
+			       evsel_list->threads->nr, scale) < 0)
 		return -1;
 
 	for (i = 0; i < 3; i++)
@@ -236,7 +235,7 @@ static int read_counter(struct perf_evsel *counter)
 	u64 *count;
 	int cpu;
 
-	for (cpu = 0; cpu < cpus->nr; cpu++) {
+	for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
 		if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
 			return -1;
 
@@ -301,7 +300,7 @@ static int run_perf_stat(int argc __used, const char **argv)
 		}
 
 		if (target_tid == -1 && target_pid == -1 && !system_wide)
-			threads->map[0] = child_pid;
+			evsel_list->threads->map[0] = child_pid;
 
 		/*
 		 * Wait for the child to be ready to exec.
@@ -353,12 +352,13 @@ static int run_perf_stat(int argc __used, const char **argv)
 	if (no_aggr) {
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			read_counter(counter);
-			perf_evsel__close_fd(counter, cpus->nr, 1);
+			perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1);
 		}
 	} else {
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			read_counter_aggr(counter);
-			perf_evsel__close_fd(counter, cpus->nr, threads->nr);
+			perf_evsel__close_fd(counter, evsel_list->cpus->nr,
+					     evsel_list->threads->nr);
 		}
 	}
 
@@ -386,7 +386,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
 	if (no_aggr)
 		sprintf(cpustr, "CPU%*d%s",
 			csv_output ? 0 : -4,
-			cpus->map[cpu], csv_sep);
+			evsel_list->cpus->map[cpu], csv_sep);
 
 	fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));
 
@@ -414,7 +414,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
 	if (no_aggr)
 		sprintf(cpustr, "CPU%*d%s",
 			csv_output ? 0 : -4,
-			cpus->map[cpu], csv_sep);
+			evsel_list->cpus->map[cpu], csv_sep);
 	else
 		cpu = 0;
 
@@ -500,14 +500,14 @@ static void print_counter(struct perf_evsel *counter)
 	u64 ena, run, val;
 	int cpu;
 
-	for (cpu = 0; cpu < cpus->nr; cpu++) {
+	for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
 		val = counter->counts->cpu[cpu].val;
 		ena = counter->counts->cpu[cpu].ena;
 		run = counter->counts->cpu[cpu].run;
 		if (run == 0 || ena == 0) {
 			fprintf(stderr, "CPU%*d%s%*s%s%-24s",
 				csv_output ? 0 : -4,
-				cpus->map[cpu], csv_sep,
+				evsel_list->cpus->map[cpu], csv_sep,
 				csv_output ? 0 : 18,
 				"<not counted>", csv_sep,
 				event_name(counter));
@@ -652,7 +652,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 
 	setlocale(LC_ALL, "");
 
-	evsel_list = perf_evlist__new();
+	evsel_list = perf_evlist__new(NULL, NULL);
 	if (evsel_list == NULL)
 		return -ENOMEM;
 
@@ -701,18 +701,18 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 	if (target_pid != -1)
 		target_tid = target_pid;
 
-	threads = thread_map__new(target_pid, target_tid);
-	if (threads == NULL) {
+	evsel_list->threads = thread_map__new(target_pid, target_tid);
+	if (evsel_list->threads == NULL) {
 		pr_err("Problems finding threads of monitor\n");
 		usage_with_options(stat_usage, options);
 	}
 
 	if (system_wide)
-		cpus = cpu_map__new(cpu_list);
+		evsel_list->cpus = cpu_map__new(cpu_list);
 	else
-		cpus = cpu_map__dummy_new();
+		evsel_list->cpus = cpu_map__dummy_new();
 
-	if (cpus == NULL) {
+	if (evsel_list->cpus == NULL) {
 		perror("failed to parse CPUs map");
 		usage_with_options(stat_usage, options);
 		return -1;
@@ -720,8 +720,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 
 	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_stat_priv(pos) < 0 ||
-		    perf_evsel__alloc_counts(pos, cpus->nr) < 0 ||
-		    perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+		    perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 ||
+		    perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0)
 			goto out_free_fd;
 	}
 
@@ -750,7 +750,6 @@ out_free_fd:
 		perf_evsel__free_stat_priv(pos);
 	perf_evlist__delete(evsel_list);
 out:
-	thread_map__delete(threads);
-	threads = NULL;
+	perf_evlist__delete_maps(evsel_list);
 	return status;
 }
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 845b9bd..1b2106c 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -509,7 +509,7 @@ static int test__basic_mmap(void)
 		goto out_free_cpus;
 	}
 
-	evlist = perf_evlist__new();
+	evlist = perf_evlist__new(cpus, threads);
 	if (evlist == NULL) {
 		pr_debug("perf_evlist__new\n");
 		goto out_free_cpus;
@@ -537,7 +537,7 @@ static int test__basic_mmap(void)
 		}
 	}
 
-	if (perf_evlist__mmap(evlist, cpus, threads, 128, true) < 0) {
+	if (perf_evlist__mmap(evlist, 128, true) < 0) {
 		pr_debug("failed to mmap events: %d (%s)\n", errno,
 			 strerror(errno));
 		goto out_close_fd;
@@ -579,7 +579,7 @@ static int test__basic_mmap(void)
 
 	err = 0;
 out_munmap:
-	perf_evlist__munmap(evlist, 1);
+	perf_evlist__munmap(evlist);
 out_close_fd:
 	for (i = 0; i < nsyscalls; ++i)
 		perf_evsel__close_fd(evsels[i], 1, threads->nr);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 2f4d1f2..599036b 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -73,9 +73,7 @@ static int			print_entries;
 
 static int			target_pid			=     -1;
 static int			target_tid			=     -1;
-static struct thread_map	*threads;
 static bool			inherit				=  false;
-static struct cpu_map		*cpus;
 static int			realtime_prio			=      0;
 static bool			group				=  false;
 static unsigned int		page_size;
@@ -567,12 +565,13 @@ static void print_sym_table(struct perf_session *session)
 		printf(" (all");
 
 	if (cpu_list)
-		printf(", CPU%s: %s)\n", cpus->nr > 1 ? "s" : "", cpu_list);
+		printf(", CPU%s: %s)\n", evsel_list->cpus->nr > 1 ? "s" : "", cpu_list);
 	else {
 		if (target_tid != -1)
 			printf(")\n");
 		else
-			printf(", %d CPU%s)\n", cpus->nr, cpus->nr > 1 ? "s" : "");
+			printf(", %d CPU%s)\n", evsel_list->cpus->nr,
+			       evsel_list->cpus->nr > 1 ? "s" : "");
 	}
 
 	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
@@ -1124,7 +1123,7 @@ static void perf_session__mmap_read(struct perf_session *self)
 {
 	int i;
 
-	for (i = 0; i < cpus->nr; i++)
+	for (i = 0; i < evsel_list->cpus->nr; i++)
 		perf_session__mmap_read_cpu(self, i);
 }
 
@@ -1150,7 +1149,8 @@ static void start_counters(struct perf_evlist *evlist)
 
 		attr->mmap = 1;
 try_again:
-		if (perf_evsel__open(counter, cpus, threads, group, inherit) < 0) {
+		if (perf_evsel__open(counter, evsel_list->cpus,
+				     evsel_list->threads, group, inherit) < 0) {
 			int err = errno;
 
 			if (err == EPERM || err == EACCES)
@@ -1181,7 +1181,7 @@ try_again:
 		}
 	}
 
-	if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, false) < 0)
+	if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
 		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 }
 
@@ -1296,7 +1296,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	struct perf_evsel *pos;
 	int status = -ENOMEM;
 
-	evsel_list = perf_evlist__new();
+	evsel_list = perf_evlist__new(NULL, NULL);
 	if (evsel_list == NULL)
 		return -ENOMEM;
 
@@ -1306,15 +1306,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	if (argc)
 		usage_with_options(top_usage, options);
 
-	if (target_pid != -1)
-		target_tid = target_pid;
-
-	threads = thread_map__new(target_pid, target_tid);
-	if (threads == NULL) {
-		pr_err("Problems finding threads of monitor\n");
-		usage_with_options(top_usage, options);
-	}
-
 	/* CPU and PID are mutually exclusive */
 	if (target_tid > 0 && cpu_list) {
 		printf("WARNING: PID switch overriding CPU\n");
@@ -1322,6 +1313,13 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		cpu_list = NULL;
 	}
 
+	if (target_pid != -1)
+		target_tid = target_pid;
+
+	if (perf_evlist__create_maps(evsel_list, target_pid,
+				     target_tid, cpu_list) < 0)
+		usage_with_options(top_usage, options);
+
 	if (!evsel_list->nr_entries &&
 	    perf_evlist__add_default(evsel_list) < 0) {
 		pr_err("Not enough memory for event selector list\n");
@@ -1343,16 +1341,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		exit(EXIT_FAILURE);
 	}
 
-	if (target_tid != -1)
-		cpus = cpu_map__dummy_new();
-	else
-		cpus = cpu_map__new(cpu_list);
-
-	if (cpus == NULL)
-		usage_with_options(top_usage, options);
-
 	list_for_each_entry(pos, &evsel_list->entries, node) {
-		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+		if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
+					 evsel_list->threads->nr) < 0)
 			goto out_free_fd;
 		/*
 		 * Fill in the ones not specifically initialized via -c:
@@ -1363,8 +1354,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		pos->attr.sample_period = default_interval;
 	}
 
-	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0 ||
-	    perf_evlist__alloc_mmap(evsel_list, cpus->nr) < 0)
+	if (perf_evlist__alloc_pollfd(evsel_list) < 0 ||
+	    perf_evlist__alloc_mmap(evsel_list) < 0)
 		goto out_free_fd;
 
 	sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
diff --git a/tools/perf/python/twatch.py b/tools/perf/python/twatch.py
index 5e9f3b7..df638c4 100755
--- a/tools/perf/python/twatch.py
+++ b/tools/perf/python/twatch.py
@@ -23,9 +23,9 @@ def main():
 			   sample_id_all = 1,
 			   sample_type = perf.SAMPLE_PERIOD | perf.SAMPLE_TID | perf.SAMPLE_CPU | perf.SAMPLE_TID)
 	evsel.open(cpus = cpus, threads = threads);
-	evlist = perf.evlist()
+	evlist = perf.evlist(cpus, threads)
 	evlist.add(evsel)
-	evlist.mmap(cpus = cpus, threads = threads)
+	evlist.mmap()
 	while True:
 		evlist.poll(timeout = -1)
 		for cpu in cpus:
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index dcd5932..95b21fe 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -21,21 +21,24 @@
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
 #define SID(e, x, y) xyarray__entry(e->id, x, y)
 
-void perf_evlist__init(struct perf_evlist *evlist)
+void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
+		       struct thread_map *threads)
 {
 	int i;
 
 	for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
 		INIT_HLIST_HEAD(&evlist->heads[i]);
 	INIT_LIST_HEAD(&evlist->entries);
+	perf_evlist__set_maps(evlist, cpus, threads);
 }
 
-struct perf_evlist *perf_evlist__new(void)
+struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
+				     struct thread_map *threads)
 {
 	struct perf_evlist *evlist = zalloc(sizeof(*evlist));
 
 	if (evlist != NULL)
-		perf_evlist__init(evlist);
+		perf_evlist__init(evlist, cpus, threads);
 
 	return evlist;
 }
@@ -88,9 +91,9 @@ int perf_evlist__add_default(struct perf_evlist *evlist)
 	return 0;
 }
 
-int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads)
+int perf_evlist__alloc_pollfd(struct perf_evlist *evlist)
 {
-	int nfds = ncpus * nthreads * evlist->nr_entries;
+	int nfds = evlist->cpus->nr * evlist->threads->nr * evlist->nr_entries;
 	evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
 	return evlist->pollfd != NULL ? 0 : -ENOMEM;
 }
@@ -213,11 +216,11 @@ union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
 	return event;
 }
 
-void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus)
+void perf_evlist__munmap(struct perf_evlist *evlist)
 {
 	int cpu;
 
-	for (cpu = 0; cpu < ncpus; cpu++) {
+	for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
 		if (evlist->mmap[cpu].base != NULL) {
 			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
 			evlist->mmap[cpu].base = NULL;
@@ -225,9 +228,9 @@ void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus)
 	}
 }
 
-int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus)
+int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
 {
-	evlist->mmap = zalloc(ncpus * sizeof(struct perf_mmap));
+	evlist->mmap = zalloc(evlist->cpus->nr * sizeof(struct perf_mmap));
 	return evlist->mmap != NULL ? 0 : -ENOMEM;
 }
 
@@ -248,8 +251,6 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot,
 /** perf_evlist__mmap - Create per cpu maps to receive events
  *
  * @evlist - list of events
- * @cpus - cpu map being monitored
- * @threads - threads map being monitored
  * @pages - map length in pages
  * @overwrite - overwrite older events?
  *
@@ -259,21 +260,22 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot,
  *	unsigned int head = perf_mmap__read_head(m);
  *
  *	perf_mmap__write_tail(m, head)
+ *
+ * Using perf_evlist__read_on_cpu does this automatically.
  */
-int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
-		      struct thread_map *threads, int pages, bool overwrite)
+int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite)
 {
 	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
 	int mask = pages * page_size - 1, cpu;
 	struct perf_evsel *first_evsel, *evsel;
+	const struct cpu_map *cpus = evlist->cpus;
+	const struct thread_map *threads = evlist->threads;
 	int thread, prot = PROT_READ | (overwrite ? 0 : PROT_WRITE);
 
-	if (evlist->mmap == NULL &&
-	    perf_evlist__alloc_mmap(evlist, cpus->nr) < 0)
+	if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0)
 		return -ENOMEM;
 
-	if (evlist->pollfd == NULL &&
-	    perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0)
+	if (evlist->pollfd == NULL && perf_evlist__alloc_pollfd(evlist) < 0)
 		return -ENOMEM;
 
 	evlist->overwrite = overwrite;
@@ -315,3 +317,34 @@ out_unmap:
 	}
 	return -1;
 }
+
+int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid,
+			     pid_t target_tid, const char *cpu_list)
+{
+	evlist->threads = thread_map__new(target_pid, target_tid);
+
+	if (evlist->threads == NULL)
+		return -1;
+
+	if (target_tid != -1)
+		evlist->cpus = cpu_map__dummy_new();
+	else
+		evlist->cpus = cpu_map__new(cpu_list);
+
+	if (evlist->cpus == NULL)
+		goto out_delete_threads;
+
+	return 0;
+
+out_delete_threads:
+	thread_map__delete(evlist->threads);
+	return -1;
+}
+
+void perf_evlist__delete_maps(struct perf_evlist *evlist)
+{
+	cpu_map__delete(evlist->cpus);
+	thread_map__delete(evlist->threads);
+	evlist->cpus	= NULL;
+	evlist->threads = NULL;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 85aca6e..c988405 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -22,28 +22,43 @@ struct perf_evlist {
 	union perf_event event_copy;
 	struct perf_mmap *mmap;
 	struct pollfd	 *pollfd;
+	struct thread_map *threads;
+	struct cpu_map	  *cpus;
 };
 
 struct perf_evsel;
 
-struct perf_evlist *perf_evlist__new(void);
-void perf_evlist__init(struct perf_evlist *evlist);
+struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
+				     struct thread_map *threads);
+void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
+		       struct thread_map *threads);
 void perf_evlist__exit(struct perf_evlist *evlist);
 void perf_evlist__delete(struct perf_evlist *evlist);
 
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
 int perf_evlist__add_default(struct perf_evlist *evlist);
 
-int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads);
+int perf_evlist__alloc_pollfd(struct perf_evlist *evlist);
 void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
 struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
 
 union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu);
 
-int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus);
-int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
-		      struct thread_map *threads, int pages, bool overwrite);
-void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus);
+int perf_evlist__alloc_mmap(struct perf_evlist *evlist);
+int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite);
+void perf_evlist__munmap(struct perf_evlist *evlist);
+
+static inline void perf_evlist__set_maps(struct perf_evlist *evlist,
+					 struct cpu_map *cpus,
+					 struct thread_map *threads)
+{
+	evlist->cpus	= cpus;
+	evlist->threads	= threads;
+}
+
+int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid,
+			     pid_t target_tid, const char *cpu_list);
+void perf_evlist__delete_maps(struct perf_evlist *evlist);
 
 #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 88d4789..d2d5217 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -553,7 +553,16 @@ struct pyrf_evlist {
 static int pyrf_evlist__init(struct pyrf_evlist *pevlist,
 			     PyObject *args, PyObject *kwargs)
 {
-	perf_evlist__init(&pevlist->evlist);
+	PyObject *pcpus = NULL, *pthreads = NULL;
+	struct cpu_map *cpus;
+	struct thread_map *threads;
+
+	if (!PyArg_ParseTuple(args, "OO", &pcpus, &pthreads))
+		return -1;
+
+	threads = ((struct pyrf_thread_map *)pthreads)->threads;
+	cpus = ((struct pyrf_cpu_map *)pcpus)->cpus;
+	perf_evlist__init(&pevlist->evlist, cpus, threads);
 	return 0;
 }
 
@@ -567,21 +576,15 @@ static PyObject *pyrf_evlist__mmap(struct pyrf_evlist *pevlist,
 				   PyObject *args, PyObject *kwargs)
 {
 	struct perf_evlist *evlist = &pevlist->evlist;
-	PyObject *pcpus = NULL, *pthreads = NULL;
-	struct cpu_map *cpus = NULL;
-	struct thread_map *threads = NULL;
-	static char *kwlist[] = {"cpus", "threads", "pages", "overwrite",
+	static char *kwlist[] = {"pages", "overwrite",
 				  NULL, NULL};
 	int pages = 128, overwrite = false;
 
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|ii", kwlist,
-					 &pcpus, &pthreads, &pages, &overwrite))
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ii", kwlist,
+					 &pages, &overwrite))
 		return NULL;
 
-	threads = ((struct pyrf_thread_map *)pthreads)->threads;
-	cpus = ((struct pyrf_cpu_map *)pcpus)->cpus;
-
-	if (perf_evlist__mmap(evlist, cpus, threads, pages, overwrite) < 0) {
+	if (perf_evlist__mmap(evlist, pages, overwrite) < 0) {
 		PyErr_SetFromErrno(PyExc_OSError);
 		return NULL;
 	}
-- 
cgit v0.10.2


From 8c3e10eb1968877d6a1957b7e790c6ce01bd56fc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 31 Jan 2011 14:50:39 -0200
Subject: perf top: Move display agnostic routines to util/top.[ch]

Paving the way for a slang browser a la 'perf report --tui'.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 36ff73c..edc660e 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -437,6 +437,7 @@ LIB_H += util/probe-finder.h
 LIB_H += util/probe-event.h
 LIB_H += util/pstack.h
 LIB_H += util/cpumap.h
+LIB_H += util/top.h
 LIB_H += $(ARCH_INCLUDE)
 
 LIB_OBJS += $(OUTPUT)util/abspath.o
@@ -464,6 +465,7 @@ LIB_OBJS += $(OUTPUT)util/strbuf.o
 LIB_OBJS += $(OUTPUT)util/string.o
 LIB_OBJS += $(OUTPUT)util/strlist.o
 LIB_OBJS += $(OUTPUT)util/strfilter.o
+LIB_OBJS += $(OUTPUT)util/top.o
 LIB_OBJS += $(OUTPUT)util/usage.o
 LIB_OBJS += $(OUTPUT)util/wrapper.o
 LIB_OBJS += $(OUTPUT)util/sigchain.o
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 599036b..3c9ba94 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -27,6 +27,7 @@
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/thread_map.h"
+#include "util/top.h"
 #include "util/util.h"
 #include <linux/rbtree.h>
 #include "util/parse-options.h"
@@ -47,7 +48,6 @@
 #include <errno.h>
 #include <time.h>
 #include <sched.h>
-#include <pthread.h>
 
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
@@ -62,75 +62,35 @@
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
 
-struct perf_evlist		*evsel_list;
+static struct perf_top top = {
+	.count_filter		= 5,
+	.delay_secs		= 2,
+	.display_weighted	= -1,
+	.target_pid		= -1,
+	.target_tid		= -1,
+	.active_symbols		= LIST_HEAD_INIT(top.active_symbols),
+	.active_symbols_lock	= PTHREAD_MUTEX_INITIALIZER,
+	.freq			= 1000, /* 1 KHz */
+};
 
 static bool			system_wide			=  false;
 
 static int			default_interval		=      0;
 
-static int			count_filter			=      5;
-static int			print_entries;
-
-static int			target_pid			=     -1;
-static int			target_tid			=     -1;
 static bool			inherit				=  false;
 static int			realtime_prio			=      0;
 static bool			group				=  false;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=    128;
-static int			freq				=   1000; /* 1 KHz */
 
-static int			delay_secs			=      2;
-static bool			zero                            =  false;
 static bool			dump_symtab                     =  false;
 
-static bool			hide_kernel_symbols		=  false;
-static bool			hide_user_symbols		=  false;
 static struct winsize		winsize;
 
-/*
- * Source
- */
-
-struct source_line {
-	u64			eip;
-	unsigned long		count[MAX_COUNTERS];
-	char			*line;
-	struct source_line	*next;
-};
-
 static const char		*sym_filter			=   NULL;
 struct sym_entry		*sym_filter_entry		=   NULL;
 struct sym_entry		*sym_filter_entry_sched		=   NULL;
 static int			sym_pcnt_filter			=      5;
-static int			sym_counter			=      0;
-static struct perf_evsel	*sym_evsel			=   NULL;
-static int			display_weighted		=     -1;
-static const char		*cpu_list;
-
-/*
- * Symbols
- */
-
-struct sym_entry_source {
-	struct source_line	*source;
-	struct source_line	*lines;
-	struct source_line	**lines_tail;
-	pthread_mutex_t		lock;
-};
-
-struct sym_entry {
-	struct rb_node		rb_node;
-	struct list_head	node;
-	unsigned long		snap_count;
-	double			weight;
-	int			skip;
-	u16			name_len;
-	u8			origin;
-	struct map		*map;
-	struct sym_entry_source	*src;
-	unsigned long		count[0];
-};
 
 /*
  * Source functions
@@ -165,10 +125,10 @@ void get_term_dimensions(struct winsize *ws)
 
 static void update_print_entries(struct winsize *ws)
 {
-	print_entries = ws->ws_row;
+	top.print_entries = ws->ws_row;
 
-	if (print_entries > 9)
-		print_entries -= 9;
+	if (top.print_entries > 9)
+		top.print_entries -= 9;
 }
 
 static void sig_winch_handler(int sig __used)
@@ -269,7 +229,7 @@ static void __zero_source_counters(struct sym_entry *syme)
 
 	line = syme->src->lines;
 	while (line) {
-		for (i = 0; i < evsel_list->nr_entries; i++)
+		for (i = 0; i < top.evlist->nr_entries; i++)
 			line->count[i] = 0;
 		line = line->next;
 	}
@@ -331,9 +291,9 @@ static void show_lines(struct source_line *queue, int count, int total)
 
 	line = queue;
 	for (i = 0; i < count; i++) {
-		float pcnt = 100.0*(float)line->count[sym_counter]/(float)total;
+		float pcnt = 100.0*(float)line->count[top.sym_counter]/(float)total;
 
-		printf("%8li %4.1f%%\t%s\n", line->count[sym_counter], pcnt, line->line);
+		printf("%8li %4.1f%%\t%s\n", line->count[top.sym_counter], pcnt, line->line);
 		line = line->next;
 	}
 }
@@ -358,13 +318,13 @@ static void show_details(struct sym_entry *syme)
 		return;
 
 	symbol = sym_entry__symbol(syme);
-	printf("Showing %s for %s\n", event_name(sym_evsel), symbol->name);
+	printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name);
 	printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);
 
 	pthread_mutex_lock(&syme->src->lock);
 	line = syme->src->source;
 	while (line) {
-		total += line->count[sym_counter];
+		total += line->count[top.sym_counter];
 		line = line->next;
 	}
 
@@ -376,10 +336,10 @@ static void show_details(struct sym_entry *syme)
 			line_queue = line;
 		line_queue_count++;
 
-		if (line->count[sym_counter])
-			pcnt = 100.0 * line->count[sym_counter] / (float)total;
+		if (line->count[top.sym_counter])
+			pcnt = 100.0 * line->count[top.sym_counter] / (float)total;
 		if (pcnt >= (float)sym_pcnt_filter) {
-			if (displayed <= print_entries)
+			if (displayed <= top.print_entries)
 				show_lines(line_queue, line_queue_count, total);
 			else more++;
 			displayed += line_queue_count;
@@ -390,7 +350,7 @@ static void show_details(struct sym_entry *syme)
 			line_queue_count--;
 		}
 
-		line->count[sym_counter] = zero ? 0 : line->count[sym_counter] * 7 / 8;
+		line->count[top.sym_counter] = top.zero ? 0 : line->count[top.sym_counter] * 7 / 8;
 		line = line->next;
 	}
 	pthread_mutex_unlock(&syme->src->lock);
@@ -398,181 +358,30 @@ static void show_details(struct sym_entry *syme)
 		printf("%d lines not displayed, maybe increase display entries [e]\n", more);
 }
 
-/*
- * Symbols will be added here in perf_event__process_sample and will get out
- * after decayed.
- */
-static LIST_HEAD(active_symbols);
-static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
-
-/*
- * Ordering weight: count-1 * count-2 * ... / count-n
- */
-static double sym_weight(const struct sym_entry *sym)
-{
-	double weight = sym->snap_count;
-	int counter;
-
-	if (!display_weighted)
-		return weight;
-
-	for (counter = 1; counter < evsel_list->nr_entries - 1; counter++)
-		weight *= sym->count[counter];
-
-	weight /= (sym->count[counter] + 1);
-
-	return weight;
-}
-
-static long			samples;
-static long			kernel_samples, us_samples;
-static long			exact_samples;
-static long			guest_us_samples, guest_kernel_samples;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
 
 static void __list_insert_active_sym(struct sym_entry *syme)
 {
-	list_add(&syme->node, &active_symbols);
-}
-
-static void list_remove_active_sym(struct sym_entry *syme)
-{
-	pthread_mutex_lock(&active_symbols_lock);
-	list_del_init(&syme->node);
-	pthread_mutex_unlock(&active_symbols_lock);
-}
-
-static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
-{
-	struct rb_node **p = &tree->rb_node;
-	struct rb_node *parent = NULL;
-	struct sym_entry *iter;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct sym_entry, rb_node);
-
-		if (se->weight > iter->weight)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&se->rb_node, parent, p);
-	rb_insert_color(&se->rb_node, tree);
+	list_add(&syme->node, &top.active_symbols);
 }
 
 static void print_sym_table(struct perf_session *session)
 {
-	int printed = 0, j;
-	struct perf_evsel *counter;
-	int snap = !display_weighted ? sym_counter : 0;
-	float samples_per_sec = samples/delay_secs;
-	float ksamples_per_sec = kernel_samples/delay_secs;
-	float us_samples_per_sec = (us_samples)/delay_secs;
-	float guest_kernel_samples_per_sec = (guest_kernel_samples)/delay_secs;
-	float guest_us_samples_per_sec = (guest_us_samples)/delay_secs;
-	float esamples_percent = (100.0*exact_samples)/samples;
-	float sum_ksamples = 0.0;
-	struct sym_entry *syme, *n;
-	struct rb_root tmp = RB_ROOT;
+	char bf[160];
+	int printed = 0;
 	struct rb_node *nd;
-	int sym_width = 0, dso_width = 0, dso_short_width = 0;
+	struct sym_entry *syme;
+	struct rb_root tmp = RB_ROOT;
 	const int win_width = winsize.ws_col - 1;
-
-	samples = us_samples = kernel_samples = exact_samples = 0;
-	guest_kernel_samples = guest_us_samples = 0;
-
-	/* Sort the active symbols */
-	pthread_mutex_lock(&active_symbols_lock);
-	syme = list_entry(active_symbols.next, struct sym_entry, node);
-	pthread_mutex_unlock(&active_symbols_lock);
-
-	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
-		syme->snap_count = syme->count[snap];
-		if (syme->snap_count != 0) {
-
-			if ((hide_user_symbols &&
-			     syme->origin == PERF_RECORD_MISC_USER) ||
-			    (hide_kernel_symbols &&
-			     syme->origin == PERF_RECORD_MISC_KERNEL)) {
-				list_remove_active_sym(syme);
-				continue;
-			}
-			syme->weight = sym_weight(syme);
-			rb_insert_active_sym(&tmp, syme);
-			sum_ksamples += syme->snap_count;
-
-			for (j = 0; j < evsel_list->nr_entries; j++)
-				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
-		} else
-			list_remove_active_sym(syme);
-	}
+	int sym_width, dso_width, dso_short_width;
+	float sum_ksamples = perf_top__decay_samples(&top, &tmp);
 
 	puts(CONSOLE_CLEAR);
 
-	if (!perf_guest) {
-		printf("   PerfTop:%8.0f irqs/sec  kernel:%4.1f%%"
-			"  exact: %4.1f%% [",
-			samples_per_sec,
-			100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) /
-					 samples_per_sec)),
-			esamples_percent);
-	} else {
-		printf("   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% us:%4.1f%%"
-			" guest kernel:%4.1f%% guest us:%4.1f%%"
-			" exact: %4.1f%% [",
-			samples_per_sec,
-			100.0 - (100.0 * ((samples_per_sec-ksamples_per_sec) /
-					  samples_per_sec)),
-			100.0 - (100.0 * ((samples_per_sec-us_samples_per_sec) /
-					  samples_per_sec)),
-			100.0 - (100.0 * ((samples_per_sec -
-						guest_kernel_samples_per_sec) /
-					  samples_per_sec)),
-			100.0 - (100.0 * ((samples_per_sec -
-					   guest_us_samples_per_sec) /
-					  samples_per_sec)),
-			esamples_percent);
-	}
+	perf_top__header_snprintf(&top, bf, sizeof(bf));
+	printf("%s\n", bf);
 
-	if (evsel_list->nr_entries == 1 || !display_weighted) {
-		struct perf_evsel *first;
-		first = list_entry(evsel_list->entries.next, struct perf_evsel, node);
-		printf("%" PRIu64, (uint64_t)first->attr.sample_period);
-		if (freq)
-			printf("Hz ");
-		else
-			printf(" ");
-	}
-
-	if (!display_weighted)
-		printf("%s", event_name(sym_evsel));
-	else list_for_each_entry(counter, &evsel_list->entries, node) {
-		if (counter->idx)
-			printf("/");
-
-		printf("%s", event_name(counter));
-	}
-
-	printf( "], ");
-
-	if (target_pid != -1)
-		printf(" (target_pid: %d", target_pid);
-	else if (target_tid != -1)
-		printf(" (target_tid: %d", target_tid);
-	else
-		printf(" (all");
-
-	if (cpu_list)
-		printf(", CPU%s: %s)\n", evsel_list->cpus->nr > 1 ? "s" : "", cpu_list);
-	else {
-		if (target_tid != -1)
-			printf(")\n");
-		else
-			printf(", %d CPU%s)\n", evsel_list->cpus->nr,
-			       evsel_list->cpus->nr > 1 ? "s" : "");
-	}
+	perf_top__reset_sample_counters(&top);
 
 	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
 
@@ -587,26 +396,8 @@ static void print_sym_table(struct perf_session *session)
 		return;
 	}
 
-	/*
-	 * Find the longest symbol name that will be displayed
-	 */
-	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
-		syme = rb_entry(nd, struct sym_entry, rb_node);
-		if (++printed > print_entries ||
-		    (int)syme->snap_count < count_filter)
-			continue;
-
-		if (syme->map->dso->long_name_len > dso_width)
-			dso_width = syme->map->dso->long_name_len;
-
-		if (syme->map->dso->short_name_len > dso_short_width)
-			dso_short_width = syme->map->dso->short_name_len;
-
-		if (syme->name_len > sym_width)
-			sym_width = syme->name_len;
-	}
-
-	printed = 0;
+	perf_top__find_widths(&top, &tmp, &dso_width, &dso_short_width,
+			      &sym_width);
 
 	if (sym_width + dso_width > winsize.ws_col - 29) {
 		dso_width = dso_short_width;
@@ -614,7 +405,7 @@ static void print_sym_table(struct perf_session *session)
 			sym_width = winsize.ws_col - dso_width - 29;
 	}
 	putchar('\n');
-	if (evsel_list->nr_entries == 1)
+	if (top.evlist->nr_entries == 1)
 		printf("             samples  pcnt");
 	else
 		printf("   weight    samples  pcnt");
@@ -623,7 +414,7 @@ static void print_sym_table(struct perf_session *session)
 		printf("         RIP       ");
 	printf(" %-*.*s DSO\n", sym_width, sym_width, "function");
 	printf("   %s    _______ _____",
-	       evsel_list->nr_entries == 1 ? "      " : "______");
+	       top.evlist->nr_entries == 1 ? "      " : "______");
 	if (verbose)
 		printf(" ________________");
 	printf(" %-*.*s", sym_width, sym_width, graph_line);
@@ -636,13 +427,14 @@ static void print_sym_table(struct perf_session *session)
 
 		syme = rb_entry(nd, struct sym_entry, rb_node);
 		sym = sym_entry__symbol(syme);
-		if (++printed > print_entries || (int)syme->snap_count < count_filter)
+		if (++printed > top.print_entries ||
+		    (int)syme->snap_count < top.count_filter)
 			continue;
 
 		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
 					 sum_ksamples));
 
-		if (evsel_list->nr_entries == 1 || !display_weighted)
+		if (top.evlist->nr_entries == 1 || !top.display_weighted)
 			printf("%20.2f ", syme->weight);
 		else
 			printf("%9.1f %10ld ", syme->weight, syme->snap_count);
@@ -715,11 +507,11 @@ static void prompt_symbol(struct sym_entry **target, const char *msg)
 	if (p)
 		*p = 0;
 
-	pthread_mutex_lock(&active_symbols_lock);
-	syme = list_entry(active_symbols.next, struct sym_entry, node);
-	pthread_mutex_unlock(&active_symbols_lock);
+	pthread_mutex_lock(&top.active_symbols_lock);
+	syme = list_entry(top.active_symbols.next, struct sym_entry, node);
+	pthread_mutex_unlock(&top.active_symbols_lock);
 
-	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
+	list_for_each_entry_safe_from(syme, n, &top.active_symbols, node) {
 		struct symbol *sym = sym_entry__symbol(syme);
 
 		if (!strcmp(buf, sym->name)) {
@@ -749,28 +541,28 @@ static void print_mapped_keys(void)
 	}
 
 	fprintf(stdout, "\nMapped keys:\n");
-	fprintf(stdout, "\t[d]     display refresh delay.             \t(%d)\n", delay_secs);
-	fprintf(stdout, "\t[e]     display entries (lines).           \t(%d)\n", print_entries);
+	fprintf(stdout, "\t[d]     display refresh delay.             \t(%d)\n", top.delay_secs);
+	fprintf(stdout, "\t[e]     display entries (lines).           \t(%d)\n", top.print_entries);
 
-	if (evsel_list->nr_entries > 1)
-		fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(sym_evsel));
+	if (top.evlist->nr_entries > 1)
+		fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(top.sym_evsel));
 
-	fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", count_filter);
+	fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", top.count_filter);
 
 	fprintf(stdout, "\t[F]     annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter);
 	fprintf(stdout, "\t[s]     annotate symbol.                   \t(%s)\n", name?: "NULL");
 	fprintf(stdout, "\t[S]     stop annotation.\n");
 
-	if (evsel_list->nr_entries > 1)
-		fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0);
+	if (top.evlist->nr_entries > 1)
+		fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", top.display_weighted ? 1 : 0);
 
 	fprintf(stdout,
 		"\t[K]     hide kernel_symbols symbols.     \t(%s)\n",
-		hide_kernel_symbols ? "yes" : "no");
+		top.hide_kernel_symbols ? "yes" : "no");
 	fprintf(stdout,
 		"\t[U]     hide user symbols.               \t(%s)\n",
-		hide_user_symbols ? "yes" : "no");
-	fprintf(stdout, "\t[z]     toggle sample zeroing.             \t(%d)\n", zero ? 1 : 0);
+		top.hide_user_symbols ? "yes" : "no");
+	fprintf(stdout, "\t[z]     toggle sample zeroing.             \t(%d)\n", top.zero ? 1 : 0);
 	fprintf(stdout, "\t[qQ]    quit.\n");
 }
 
@@ -791,7 +583,7 @@ static int key_mapped(int c)
 			return 1;
 		case 'E':
 		case 'w':
-			return evsel_list->nr_entries > 1 ? 1 : 0;
+			return top.evlist->nr_entries > 1 ? 1 : 0;
 		default:
 			break;
 	}
@@ -826,47 +618,47 @@ static void handle_keypress(struct perf_session *session, int c)
 
 	switch (c) {
 		case 'd':
-			prompt_integer(&delay_secs, "Enter display delay");
-			if (delay_secs < 1)
-				delay_secs = 1;
+			prompt_integer(&top.delay_secs, "Enter display delay");
+			if (top.delay_secs < 1)
+				top.delay_secs = 1;
 			break;
 		case 'e':
-			prompt_integer(&print_entries, "Enter display entries (lines)");
-			if (print_entries == 0) {
+			prompt_integer(&top.print_entries, "Enter display entries (lines)");
+			if (top.print_entries == 0) {
 				sig_winch_handler(SIGWINCH);
 				signal(SIGWINCH, sig_winch_handler);
 			} else
 				signal(SIGWINCH, SIG_DFL);
 			break;
 		case 'E':
-			if (evsel_list->nr_entries > 1) {
+			if (top.evlist->nr_entries > 1) {
 				fprintf(stderr, "\nAvailable events:");
 
-				list_for_each_entry(sym_evsel, &evsel_list->entries, node)
-					fprintf(stderr, "\n\t%d %s", sym_evsel->idx, event_name(sym_evsel));
+				list_for_each_entry(top.sym_evsel, &top.evlist->entries, node)
+					fprintf(stderr, "\n\t%d %s", top.sym_evsel->idx, event_name(top.sym_evsel));
 
-				prompt_integer(&sym_counter, "Enter details event counter");
+				prompt_integer(&top.sym_counter, "Enter details event counter");
 
-				if (sym_counter >= evsel_list->nr_entries) {
-					sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
-					sym_counter = 0;
-					fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(sym_evsel));
+				if (top.sym_counter >= top.evlist->nr_entries) {
+					top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
+					top.sym_counter = 0;
+					fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top.sym_evsel));
 					sleep(1);
 					break;
 				}
-				list_for_each_entry(sym_evsel, &evsel_list->entries, node)
-					if (sym_evsel->idx == sym_counter)
+				list_for_each_entry(top.sym_evsel, &top.evlist->entries, node)
+					if (top.sym_evsel->idx == top.sym_counter)
 						break;
-			} else sym_counter = 0;
+			} else top.sym_counter = 0;
 			break;
 		case 'f':
-			prompt_integer(&count_filter, "Enter display event count filter");
+			prompt_integer(&top.count_filter, "Enter display event count filter");
 			break;
 		case 'F':
 			prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)");
 			break;
 		case 'K':
-			hide_kernel_symbols = !hide_kernel_symbols;
+			top.hide_kernel_symbols = !top.hide_kernel_symbols;
 			break;
 		case 'q':
 		case 'Q':
@@ -890,13 +682,13 @@ static void handle_keypress(struct perf_session *session, int c)
 			}
 			break;
 		case 'U':
-			hide_user_symbols = !hide_user_symbols;
+			top.hide_user_symbols = !top.hide_user_symbols;
 			break;
 		case 'w':
-			display_weighted = ~display_weighted;
+			top.display_weighted = ~top.display_weighted;
 			break;
 		case 'z':
-			zero = !zero;
+			top.zero = !top.zero;
 			break;
 		default:
 			break;
@@ -917,7 +709,7 @@ static void *display_thread(void *arg __used)
 	tc.c_cc[VTIME] = 0;
 
 repeat:
-	delay_msecs = delay_secs * 1000;
+	delay_msecs = top.delay_secs * 1000;
 	tcsetattr(0, TCSANOW, &tc);
 	/* trash return*/
 	getc(stdin);
@@ -1005,27 +797,27 @@ static void perf_event__process_sample(const union perf_event *event,
 	struct machine *machine;
 	u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
-	++samples;
+	++top.samples;
 
 	switch (origin) {
 	case PERF_RECORD_MISC_USER:
-		++us_samples;
-		if (hide_user_symbols)
+		++top.us_samples;
+		if (top.hide_user_symbols)
 			return;
 		machine = perf_session__find_host_machine(session);
 		break;
 	case PERF_RECORD_MISC_KERNEL:
-		++kernel_samples;
-		if (hide_kernel_symbols)
+		++top.kernel_samples;
+		if (top.hide_kernel_symbols)
 			return;
 		machine = perf_session__find_host_machine(session);
 		break;
 	case PERF_RECORD_MISC_GUEST_KERNEL:
-		++guest_kernel_samples;
+		++top.guest_kernel_samples;
 		machine = perf_session__find_machine(session, event->ip.pid);
 		break;
 	case PERF_RECORD_MISC_GUEST_USER:
-		++guest_us_samples;
+		++top.guest_us_samples;
 		/*
 		 * TODO: we don't process guest user from host side
 		 * except simple counting.
@@ -1042,7 +834,7 @@ static void perf_event__process_sample(const union perf_event *event,
 	}
 
 	if (event->header.misc & PERF_RECORD_MISC_EXACT_IP)
-		exact_samples++;
+		top.exact_samples++;
 
 	if (perf_event__preprocess_sample(event, session, &al, sample,
 					  symbol_filter) < 0 ||
@@ -1093,14 +885,14 @@ static void perf_event__process_sample(const union perf_event *event,
 		struct perf_evsel *evsel;
 
 		syme->origin = origin;
-		evsel = perf_evlist__id2evsel(evsel_list, sample->id);
+		evsel = perf_evlist__id2evsel(top.evlist, sample->id);
 		assert(evsel != NULL);
 		syme->count[evsel->idx]++;
 		record_precise_ip(syme, evsel->idx, ip);
-		pthread_mutex_lock(&active_symbols_lock);
+		pthread_mutex_lock(&top.active_symbols_lock);
 		if (list_empty(&syme->node) || !syme->node.next)
 			__list_insert_active_sym(syme);
-		pthread_mutex_unlock(&active_symbols_lock);
+		pthread_mutex_unlock(&top.active_symbols_lock);
 	}
 }
 
@@ -1109,7 +901,7 @@ static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
 	struct perf_sample sample;
 	union perf_event *event;
 
-	while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) {
+	while ((event = perf_evlist__read_on_cpu(top.evlist, cpu)) != NULL) {
 		perf_session__parse_sample(self, event, &sample);
 
 		if (event->header.type == PERF_RECORD_SAMPLE)
@@ -1123,7 +915,7 @@ static void perf_session__mmap_read(struct perf_session *self)
 {
 	int i;
 
-	for (i = 0; i < evsel_list->cpus->nr; i++)
+	for (i = 0; i < top.evlist->cpus->nr; i++)
 		perf_session__mmap_read_cpu(self, i);
 }
 
@@ -1136,10 +928,10 @@ static void start_counters(struct perf_evlist *evlist)
 
 		attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
-		if (freq) {
+		if (top.freq) {
 			attr->sample_type |= PERF_SAMPLE_PERIOD;
 			attr->freq	  = 1;
-			attr->sample_freq = freq;
+			attr->sample_freq = top.freq;
 		}
 
 		if (evlist->nr_entries > 1) {
@@ -1149,8 +941,8 @@ static void start_counters(struct perf_evlist *evlist)
 
 		attr->mmap = 1;
 try_again:
-		if (perf_evsel__open(counter, evsel_list->cpus,
-				     evsel_list->threads, group, inherit) < 0) {
+		if (perf_evsel__open(counter, top.evlist->cpus,
+				     top.evlist->threads, group, inherit) < 0) {
 			int err = errno;
 
 			if (err == EPERM || err == EACCES)
@@ -1198,18 +990,18 @@ static int __cmd_top(void)
 	if (session == NULL)
 		return -ENOMEM;
 
-	if (target_tid != -1)
-		perf_event__synthesize_thread(target_tid, perf_event__process,
+	if (top.target_tid != -1)
+		perf_event__synthesize_thread(top.target_tid, perf_event__process,
 					      session);
 	else
 		perf_event__synthesize_threads(perf_event__process, session);
 
-	start_counters(evsel_list);
-	first = list_entry(evsel_list->entries.next, struct perf_evsel, node);
+	start_counters(top.evlist);
+	first = list_entry(top.evlist->entries.next, struct perf_evsel, node);
 	perf_session__set_sample_type(session, first->attr.sample_type);
 
 	/* Wait for a minimal set of events before starting the snapshot */
-	poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
+	poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
 
 	perf_session__mmap_read(session);
 
@@ -1229,12 +1021,12 @@ static int __cmd_top(void)
 	}
 
 	while (1) {
-		int hits = samples;
+		u64 hits = top.samples;
 
 		perf_session__mmap_read(session);
 
-		if (hits == samples)
-			ret = poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
+		if (hits == top.samples)
+			ret = poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
 	}
 
 	return 0;
@@ -1246,31 +1038,31 @@ static const char * const top_usage[] = {
 };
 
 static const struct option options[] = {
-	OPT_CALLBACK('e', "event", &evsel_list, "event",
+	OPT_CALLBACK('e', "event", &top.evlist, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events),
 	OPT_INTEGER('c', "count", &default_interval,
 		    "event period to sample"),
-	OPT_INTEGER('p', "pid", &target_pid,
+	OPT_INTEGER('p', "pid", &top.target_pid,
 		    "profile events on existing process id"),
-	OPT_INTEGER('t', "tid", &target_tid,
+	OPT_INTEGER('t', "tid", &top.target_tid,
 		    "profile events on existing thread id"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 			    "system-wide collection from all CPUs"),
-	OPT_STRING('C', "cpu", &cpu_list, "cpu",
+	OPT_STRING('C', "cpu", &top.cpu_list, "cpu",
 		    "list of cpus to monitor"),
 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
 		   "file", "vmlinux pathname"),
-	OPT_BOOLEAN('K', "hide_kernel_symbols", &hide_kernel_symbols,
+	OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols,
 		    "hide kernel symbols"),
 	OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
 		    "collect data with this RT SCHED_FIFO priority"),
-	OPT_INTEGER('d', "delay", &delay_secs,
+	OPT_INTEGER('d', "delay", &top.delay_secs,
 		    "number of seconds to delay between refreshes"),
 	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
 			    "dump the symbol table used for profiling"),
-	OPT_INTEGER('f', "count-filter", &count_filter,
+	OPT_INTEGER('f', "count-filter", &top.count_filter,
 		    "only display functions with more events than this"),
 	OPT_BOOLEAN('g', "group", &group,
 			    "put the counters into a counter group"),
@@ -1278,13 +1070,13 @@ static const struct option options[] = {
 		    "child tasks inherit counters"),
 	OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name",
 		    "symbol to annotate"),
-	OPT_BOOLEAN('z', "zero", &zero,
+	OPT_BOOLEAN('z', "zero", &top.zero,
 		    "zero history across updates"),
-	OPT_INTEGER('F', "freq", &freq,
+	OPT_INTEGER('F', "freq", &top.freq,
 		    "profile at this frequency"),
-	OPT_INTEGER('E', "entries", &print_entries,
+	OPT_INTEGER('E', "entries", &top.print_entries,
 		    "display this many functions"),
-	OPT_BOOLEAN('U', "hide_user_symbols", &hide_user_symbols,
+	OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols,
 		    "hide user symbols"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
@@ -1296,8 +1088,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	struct perf_evsel *pos;
 	int status = -ENOMEM;
 
-	evsel_list = perf_evlist__new(NULL, NULL);
-	if (evsel_list == NULL)
+	top.evlist = perf_evlist__new(NULL, NULL);
+	if (top.evlist == NULL)
 		return -ENOMEM;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
@@ -1307,43 +1099,43 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(top_usage, options);
 
 	/* CPU and PID are mutually exclusive */
-	if (target_tid > 0 && cpu_list) {
+	if (top.target_tid > 0 && top.cpu_list) {
 		printf("WARNING: PID switch overriding CPU\n");
 		sleep(1);
-		cpu_list = NULL;
+		top.cpu_list = NULL;
 	}
 
-	if (target_pid != -1)
-		target_tid = target_pid;
+	if (top.target_pid != -1)
+		top.target_tid = top.target_pid;
 
-	if (perf_evlist__create_maps(evsel_list, target_pid,
-				     target_tid, cpu_list) < 0)
+	if (perf_evlist__create_maps(top.evlist, top.target_pid,
+				     top.target_tid, top.cpu_list) < 0)
 		usage_with_options(top_usage, options);
 
-	if (!evsel_list->nr_entries &&
-	    perf_evlist__add_default(evsel_list) < 0) {
+	if (!top.evlist->nr_entries &&
+	    perf_evlist__add_default(top.evlist) < 0) {
 		pr_err("Not enough memory for event selector list\n");
 		return -ENOMEM;
 	}
 
-	if (delay_secs < 1)
-		delay_secs = 1;
+	if (top.delay_secs < 1)
+		top.delay_secs = 1;
 
 	/*
 	 * User specified count overrides default frequency.
 	 */
 	if (default_interval)
-		freq = 0;
-	else if (freq) {
-		default_interval = freq;
+		top.freq = 0;
+	else if (top.freq) {
+		default_interval = top.freq;
 	} else {
 		fprintf(stderr, "frequency and count are zero, aborting\n");
 		exit(EXIT_FAILURE);
 	}
 
-	list_for_each_entry(pos, &evsel_list->entries, node) {
-		if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
-					 evsel_list->threads->nr) < 0)
+	list_for_each_entry(pos, &top.evlist->entries, node) {
+		if (perf_evsel__alloc_fd(pos, top.evlist->cpus->nr,
+					 top.evlist->threads->nr) < 0)
 			goto out_free_fd;
 		/*
 		 * Fill in the ones not specifically initialized via -c:
@@ -1354,28 +1146,28 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		pos->attr.sample_period = default_interval;
 	}
 
-	if (perf_evlist__alloc_pollfd(evsel_list) < 0 ||
-	    perf_evlist__alloc_mmap(evsel_list) < 0)
+	if (perf_evlist__alloc_pollfd(top.evlist) < 0 ||
+	    perf_evlist__alloc_mmap(top.evlist) < 0)
 		goto out_free_fd;
 
-	sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
+	top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
 
 	symbol_conf.priv_size = (sizeof(struct sym_entry) +
-				 (evsel_list->nr_entries + 1) * sizeof(unsigned long));
+				 (top.evlist->nr_entries + 1) * sizeof(unsigned long));
 
 	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
 	if (symbol__init() < 0)
 		return -1;
 
 	get_term_dimensions(&winsize);
-	if (print_entries == 0) {
+	if (top.print_entries == 0) {
 		update_print_entries(&winsize);
 		signal(SIGWINCH, sig_winch_handler);
 	}
 
 	status = __cmd_top();
 out_free_fd:
-	perf_evlist__delete(evsel_list);
+	perf_evlist__delete(top.evlist);
 
 	return status;
 }
diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
new file mode 100644
index 0000000..c06cc53
--- /dev/null
+++ b/tools/perf/util/top.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Refactored from builtin-top.c, see that files for further copyright notes.
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "cpumap.h"
+#include "event.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "parse-events.h"
+#include "symbol.h"
+#include "top.h"
+#include <inttypes.h>
+
+/*
+ * Ordering weight: count-1 * count-2 * ... / count-n
+ */
+static double sym_weight(const struct sym_entry *sym, struct perf_top *top)
+{
+	double weight = sym->snap_count;
+	int counter;
+
+	if (!top->display_weighted)
+		return weight;
+
+	for (counter = 1; counter < top->evlist->nr_entries - 1; counter++)
+		weight *= sym->count[counter];
+
+	weight /= (sym->count[counter] + 1);
+
+	return weight;
+}
+
+static void perf_top__remove_active_sym(struct perf_top *top, struct sym_entry *syme)
+{
+	pthread_mutex_lock(&top->active_symbols_lock);
+	list_del_init(&syme->node);
+	pthread_mutex_unlock(&top->active_symbols_lock);
+}
+
+static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
+{
+	struct rb_node **p = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct sym_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct sym_entry, rb_node);
+
+		if (se->weight > iter->weight)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&se->rb_node, parent, p);
+	rb_insert_color(&se->rb_node, tree);
+}
+
+size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
+{
+	struct perf_evsel *counter;
+	float samples_per_sec = top->samples / top->delay_secs;
+	float ksamples_per_sec = top->kernel_samples / top->delay_secs;
+	float esamples_percent = (100.0 * top->exact_samples) / top->samples;
+	size_t ret = 0;
+
+	if (!perf_guest) {
+		ret = snprintf(bf, size,
+			       "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%%"
+			       "  exact: %4.1f%% [", samples_per_sec,
+			       100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) /
+					samples_per_sec)),
+				esamples_percent);
+	} else {
+		float us_samples_per_sec = top->us_samples / top->delay_secs;
+		float guest_kernel_samples_per_sec = top->guest_kernel_samples / top->delay_secs;
+		float guest_us_samples_per_sec = top->guest_us_samples / top->delay_secs;
+
+		ret = snprintf(bf, size,
+			       "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% us:%4.1f%%"
+			       " guest kernel:%4.1f%% guest us:%4.1f%%"
+			       " exact: %4.1f%% [", samples_per_sec,
+			       100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) /
+						 samples_per_sec)),
+			       100.0 - (100.0 * ((samples_per_sec - us_samples_per_sec) /
+						 samples_per_sec)),
+			       100.0 - (100.0 * ((samples_per_sec -
+						  guest_kernel_samples_per_sec) /
+						 samples_per_sec)),
+			       100.0 - (100.0 * ((samples_per_sec -
+						  guest_us_samples_per_sec) /
+						 samples_per_sec)),
+			       esamples_percent);
+	}
+
+	if (top->evlist->nr_entries == 1 || !top->display_weighted) {
+		struct perf_evsel *first;
+		first = list_entry(top->evlist->entries.next, struct perf_evsel, node);
+		ret += snprintf(bf + ret, size - ret, "%" PRIu64 "%s ",
+				(uint64_t)first->attr.sample_period,
+				top->freq ? "Hz" : "");
+	}
+
+	if (!top->display_weighted) {
+		ret += snprintf(bf + ret, size - ret, "%s",
+				event_name(top->sym_evsel));
+	} else list_for_each_entry(counter, &top->evlist->entries, node) {
+		ret += snprintf(bf + ret, size - ret, "%s%s",
+				counter->idx ? "/" : "", event_name(counter));
+	}
+
+	ret += snprintf(bf + ret, size - ret, "], ");
+
+	if (top->target_pid != -1)
+		ret += snprintf(bf + ret, size - ret, " (target_pid: %d",
+				top->target_pid);
+	else if (top->target_tid != -1)
+		ret += snprintf(bf + ret, size - ret, " (target_tid: %d",
+				top->target_tid);
+	else
+		ret += snprintf(bf + ret, size - ret, " (all");
+
+	if (top->cpu_list)
+		ret += snprintf(bf + ret, size - ret, ", CPU%s: %s)",
+				top->evlist->cpus->nr > 1 ? "s" : "", top->cpu_list);
+	else {
+		if (top->target_tid != -1)
+			ret += snprintf(bf + ret, size - ret, ")");
+		else
+			ret += snprintf(bf + ret, size - ret, ", %d CPU%s)",
+					top->evlist->cpus->nr,
+					top->evlist->cpus->nr > 1 ? "s" : "");
+	}
+
+	return ret;
+}
+
+void perf_top__reset_sample_counters(struct perf_top *top)
+{
+	top->samples = top->us_samples = top->kernel_samples =
+	top->exact_samples = top->guest_kernel_samples =
+	top->guest_us_samples = 0;
+}
+
+float perf_top__decay_samples(struct perf_top *top, struct rb_root *root)
+{
+	struct sym_entry *syme, *n;
+	float sum_ksamples = 0.0;
+	int snap = !top->display_weighted ? top->sym_counter : 0, j;
+
+	/* Sort the active symbols */
+	pthread_mutex_lock(&top->active_symbols_lock);
+	syme = list_entry(top->active_symbols.next, struct sym_entry, node);
+	pthread_mutex_unlock(&top->active_symbols_lock);
+
+	list_for_each_entry_safe_from(syme, n, &top->active_symbols, node) {
+		syme->snap_count = syme->count[snap];
+		if (syme->snap_count != 0) {
+
+			if ((top->hide_user_symbols &&
+			     syme->origin == PERF_RECORD_MISC_USER) ||
+			    (top->hide_kernel_symbols &&
+			     syme->origin == PERF_RECORD_MISC_KERNEL)) {
+				perf_top__remove_active_sym(top, syme);
+				continue;
+			}
+			syme->weight = sym_weight(syme, top);
+			rb_insert_active_sym(root, syme);
+			sum_ksamples += syme->snap_count;
+
+			for (j = 0; j < top->evlist->nr_entries; j++)
+				syme->count[j] = top->zero ? 0 : syme->count[j] * 7 / 8;
+		} else
+			perf_top__remove_active_sym(top, syme);
+	}
+
+	return sum_ksamples;
+}
+
+/*
+ * Find the longest symbol name that will be displayed
+ */
+void perf_top__find_widths(struct perf_top *top, struct rb_root *root,
+			   int *dso_width, int *dso_short_width, int *sym_width)
+{
+	struct rb_node *nd;
+	int printed = 0;
+
+	*sym_width = *dso_width = *dso_short_width = 0;
+
+	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
+		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
+
+		if (++printed > top->print_entries ||
+		    (int)syme->snap_count < top->count_filter)
+			continue;
+
+		if (syme->map->dso->long_name_len > *dso_width)
+			*dso_width = syme->map->dso->long_name_len;
+
+		if (syme->map->dso->short_name_len > *dso_short_width)
+			*dso_short_width = syme->map->dso->short_name_len;
+
+		if (syme->name_len > *sym_width)
+			*sym_width = syme->name_len;
+	}
+}
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
new file mode 100644
index 0000000..0467b26
--- /dev/null
+++ b/tools/perf/util/top.h
@@ -0,0 +1,67 @@
+#ifndef __PERF_TOP_H
+#define __PERF_TOP_H 1
+
+#include "types.h"
+#include "../perf.h"
+#include <stddef.h>
+#include <pthread.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+struct perf_evlist;
+struct perf_evsel;
+
+struct source_line {
+	u64			eip;
+	unsigned long		count[MAX_COUNTERS]; /* FIXME */
+	char			*line;
+	struct source_line	*next;
+};
+
+struct sym_entry_source {
+	struct source_line	*source;
+	struct source_line	*lines;
+	struct source_line	**lines_tail;
+	pthread_mutex_t		lock;
+};
+
+struct sym_entry {
+	struct rb_node		rb_node;
+	struct list_head	node;
+	unsigned long		snap_count;
+	double			weight;
+	int			skip;
+	u16			name_len;
+	u8			origin;
+	struct map		*map;
+	struct sym_entry_source	*src;
+	unsigned long		count[0];
+};
+
+struct perf_top {
+	struct perf_evlist *evlist;
+	/*
+	 * Symbols will be added here in perf_event__process_sample and will
+	 * get out after decayed.
+	 */
+	struct list_head   active_symbols;
+	pthread_mutex_t	   active_symbols_lock;
+	u64		   samples;
+	u64		   kernel_samples, us_samples;
+	u64		   exact_samples;
+	u64		   guest_us_samples, guest_kernel_samples;
+	int		   print_entries, count_filter, delay_secs;
+	int		   display_weighted, freq;
+	int		   sym_counter, target_pid, target_tid;
+	bool		   hide_kernel_symbols, hide_user_symbols, zero;
+	const char	   *cpu_list;
+	struct perf_evsel  *sym_evsel;
+};
+
+size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size);
+void perf_top__reset_sample_counters(struct perf_top *top);
+float perf_top__decay_samples(struct perf_top *top, struct rb_root *root);
+void perf_top__find_widths(struct perf_top *top, struct rb_root *root,
+			   int *dso_width, int *dso_short_width, int *sym_width);
+
+#endif /* __PERF_TOP_H */
-- 
cgit v0.10.2


From eff9073790e1286aa12bf1c65814d3e0132b12e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 31 Jan 2011 16:59:05 +0100
Subject: x86: Rename incorrectly named parameter of numa_cpu_node()

numa_cpu_node() prototype in numa_32.h has wrongly named
parameter @apicid when it actually takes the CPU number.

Change it to @cpu.

Reported-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <20110131155905.GM7459@htj.dyndns.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index bc66031..c6beed1 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -6,7 +6,7 @@ extern int numa_off;
 extern int pxm_to_nid(int pxm);
 
 #ifdef CONFIG_NUMA
-extern int __cpuinit numa_cpu_node(int apicid);
+extern int __cpuinit numa_cpu_node(int cpu);
 #else	/* CONFIG_NUMA */
 static inline int numa_cpu_node(int cpu)		{ return NUMA_NO_NODE; }
 #endif	/* CONFIG_NUMA */
-- 
cgit v0.10.2


From e2830b5c1b2b2217894370a3b95af87d4a958401 Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 16:00:32 +0100
Subject: time: Make do_timer() and xtime_lock local to kernel/time/

All callers of do_timer() are converted to xtime_update(). The only
users of xtime_lock are in kernel/time/. Make both local to
kernel/time/ and remove them from the global header files.

[ tglx: Reuse tick-internal.h instead of creating another local header
  	file. Massaged changelog ]

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: yong.zhang0@gmail.com
Cc: hch@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9d9a078..cdef640 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2049,7 +2049,6 @@ extern void release_uids(struct user_namespace *ns);
 
 #include <asm/current.h>
 
-extern void do_timer(unsigned long ticks);
 extern void xtime_update(unsigned long ticks);
 
 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
diff --git a/include/linux/time.h b/include/linux/time.h
index ce29c86..38c5206 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -113,8 +113,6 @@ static inline struct timespec timespec_sub(struct timespec lhs,
 #define timespec_valid(ts) \
 	(((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC))
 
-extern seqlock_t xtime_lock;
-
 extern void read_persistent_clock(struct timespec *ts);
 extern void read_boot_clock(struct timespec *ts);
 extern int update_persistent_clock(struct timespec now);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fd..0d74b9b 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
 #include <linux/notifier.h>
 #include <linux/smp.h>
 #include <linux/sysdev.h>
-#include <linux/tick.h>
 
 #include "tick-internal.h"
 
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 2fbc207..b2fa506 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -25,6 +25,8 @@
 #include <linux/module.h>
 #include <linux/init.h>
 
+#include "tick-internal.h"
+
 /* The Jiffies based clocksource is the lowest common
  * denominator clock source which should function on
  * all systems. It has the same coarse resolution as
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5c00242..ed8cfdf 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,6 +16,8 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 
+#include "tick-internal.h"
+
 /*
  * NTP timekeeping variables:
  */
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761..92ef9a5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
 #include <linux/percpu.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
-#include <linux/tick.h>
 
 #include "tick-internal.h"
 
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 051bc80..0e98fac 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
 #include <linux/percpu.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
-#include <linux/tick.h>
 
 #include <asm/irq_regs.h>
 
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefb..28c5785 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,8 @@
 /*
  * tick internal variable and functions used by low/high res code
  */
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
 
 #define TICK_DO_TIMER_NONE	-1
 #define TICK_DO_TIMER_BOOT	-2
@@ -132,3 +134,6 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
 {
 	return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
 }
+
+extern void do_timer(unsigned long ticks);
+extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5cbc101..2d04411 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
 #include <linux/percpu.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
-#include <linux/tick.h>
 
 #include "tick-internal.h"
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c55ea24..d5097c4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
 #include <linux/percpu.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
-#include <linux/tick.h>
 #include <linux/module.h>
 
 #include <asm/irq_regs.h>
-- 
cgit v0.10.2


From 229ade9ba36341f7369ecb4f134bcec9133520bf Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 31 Jan 2011 18:08:39 -0200
Subject: perf tools: Don't fallback to setup_pager unconditionally

Because in tools like 'top' we don't want the pager.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 7006786..cd9dec4 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -452,7 +452,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
 	else if (use_tui)
 		use_browser = 1;
 
-	setup_browser();
+	setup_browser(true);
 
 	symbol_conf.priv_size = sizeof(struct sym_priv);
 	symbol_conf.try_vmlinux_path = true;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index a6a4e54..080937c 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -499,7 +499,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 		use_browser = 1;
 
 	if (strcmp(input_name, "-") != 0)
-		setup_browser();
+		setup_browser(true);
 	else
 		use_browser = 0;
 	/*
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index a772979..fc5e5a0 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -34,13 +34,14 @@ extern int pager_use_color;
 extern int use_browser;
 
 #ifdef NO_NEWT_SUPPORT
-static inline void setup_browser(void)
+static inline void setup_browser(bool fallback_to_pager)
 {
-	setup_pager();
+	if (fallback_to_pager)
+		setup_pager();
 }
 static inline void exit_browser(bool wait_for_ok __used) {}
 #else
-void setup_browser(void);
+void setup_browser(bool fallback_to_pager);
 void exit_browser(bool wait_for_ok);
 #endif
 
diff --git a/tools/perf/util/ui/setup.c b/tools/perf/util/ui/setup.c
index 6620850..fbf1a14 100644
--- a/tools/perf/util/ui/setup.c
+++ b/tools/perf/util/ui/setup.c
@@ -14,11 +14,12 @@ static void newt_suspend(void *d __used)
 	newtResume();
 }
 
-void setup_browser(void)
+void setup_browser(bool fallback_to_pager)
 {
 	if (!isatty(1) || !use_browser || dump_trace) {
 		use_browser = 0;
-		setup_pager();
+		if (fallback_to_pager)
+			setup_pager();
 		return;
 	}
 
-- 
cgit v0.10.2


From c0443df1b69b59675fc6790e0ddce87c8ca00abf Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 31 Jan 2011 18:19:33 -0200
Subject: perf top: Introduce slang based TUI

Disabled by default as there are features found in the stdio based one
that aren't implemented, like live annotation, filtering knobs data
entry.

Annotation hopefully will get somehow merged with the 'perf annotate'
code.

To use it:

perf top --tui

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index edc660e..67a9f4d 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -621,6 +621,7 @@ else
 		LIB_OBJS += $(OUTPUT)util/ui/browsers/annotate.o
 		LIB_OBJS += $(OUTPUT)util/ui/browsers/hists.o
 		LIB_OBJS += $(OUTPUT)util/ui/browsers/map.o
+		LIB_OBJS += $(OUTPUT)util/ui/browsers/top.o
 		LIB_OBJS += $(OUTPUT)util/ui/helpline.o
 		LIB_OBJS += $(OUTPUT)util/ui/progress.o
 		LIB_OBJS += $(OUTPUT)util/ui/util.o
@@ -1050,6 +1051,9 @@ $(OUTPUT)util/ui/browser.o: util/ui/browser.c $(OUTPUT)PERF-CFLAGS
 $(OUTPUT)util/ui/browsers/annotate.o: util/ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
 
+$(OUTPUT)util/ui/browsers/top.o: util/ui/browsers/top.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
 $(OUTPUT)util/ui/browsers/hists.o: util/ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 3c9ba94..104de9a 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -20,6 +20,7 @@
 
 #include "perf.h"
 
+#include "util/cache.h"
 #include "util/color.h"
 #include "util/evlist.h"
 #include "util/evsel.h"
@@ -75,6 +76,8 @@ static struct perf_top top = {
 
 static bool			system_wide			=  false;
 
+static bool			use_tui, use_stdio;
+
 static int			default_interval		=      0;
 
 static bool			inherit				=  false;
@@ -96,11 +99,6 @@ static int			sym_pcnt_filter			=      5;
  * Source functions
  */
 
-static inline struct symbol *sym_entry__symbol(struct sym_entry *self)
-{
-       return ((void *)self) + symbol_conf.priv_size;
-}
-
 void get_term_dimensions(struct winsize *ws)
 {
 	char *s = getenv("LINES");
@@ -695,6 +693,14 @@ static void handle_keypress(struct perf_session *session, int c)
 	}
 }
 
+static void *display_thread_tui(void *arg __used)
+{
+	perf_top__tui_browser(&top);
+	exit_browser(0);
+	exit(0);
+	return NULL;
+}
+
 static void *display_thread(void *arg __used)
 {
 	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
@@ -1005,7 +1011,8 @@ static int __cmd_top(void)
 
 	perf_session__mmap_read(session);
 
-	if (pthread_create(&thread, NULL, display_thread, session)) {
+	if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui :
+							     display_thread), session)) {
 		printf("Could not create display thread.\n");
 		exit(-1);
 	}
@@ -1078,6 +1085,8 @@ static const struct option options[] = {
 		    "display this many functions"),
 	OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols,
 		    "hide user symbols"),
+	OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
+	OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
@@ -1098,6 +1107,20 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	if (argc)
 		usage_with_options(top_usage, options);
 
+	/*
+ 	 * XXX For now start disabled, only using TUI if explicitely asked for.
+ 	 * Change that when handle_keys equivalent gets written, live annotation
+ 	 * done, etc.
+ 	 */
+	use_browser = 0;
+
+	if (use_stdio)
+		use_browser = 0;
+	else if (use_tui)
+		use_browser = 1;
+
+	setup_browser(false);
+
 	/* CPU and PID are mutually exclusive */
 	if (top.target_tid > 0 && top.cpu_list) {
 		printf("WARNING: PID switch overriding CPU\n");
diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index c06cc53..1d2e265 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -158,6 +158,7 @@ float perf_top__decay_samples(struct perf_top *top, struct rb_root *root)
 	syme = list_entry(top->active_symbols.next, struct sym_entry, node);
 	pthread_mutex_unlock(&top->active_symbols_lock);
 
+	top->rb_entries = 0;
 	list_for_each_entry_safe_from(syme, n, &top->active_symbols, node) {
 		syme->snap_count = syme->count[snap];
 		if (syme->snap_count != 0) {
@@ -170,7 +171,11 @@ float perf_top__decay_samples(struct perf_top *top, struct rb_root *root)
 				continue;
 			}
 			syme->weight = sym_weight(syme, top);
-			rb_insert_active_sym(root, syme);
+
+			if ((int)syme->snap_count >= top->count_filter) {
+				rb_insert_active_sym(root, syme);
+				++top->rb_entries;
+			}
 			sum_ksamples += syme->snap_count;
 
 			for (j = 0; j < top->evlist->nr_entries; j++)
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index 0467b26..611370f 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -38,6 +38,11 @@ struct sym_entry {
 	unsigned long		count[0];
 };
 
+static inline struct symbol *sym_entry__symbol(struct sym_entry *self)
+{
+       return ((void *)self) + symbol_conf.priv_size;
+}
+
 struct perf_top {
 	struct perf_evlist *evlist;
 	/*
@@ -51,7 +56,7 @@ struct perf_top {
 	u64		   exact_samples;
 	u64		   guest_us_samples, guest_kernel_samples;
 	int		   print_entries, count_filter, delay_secs;
-	int		   display_weighted, freq;
+	int		   display_weighted, freq, rb_entries;
 	int		   sym_counter, target_pid, target_tid;
 	bool		   hide_kernel_symbols, hide_user_symbols, zero;
 	const char	   *cpu_list;
@@ -64,4 +69,12 @@ float perf_top__decay_samples(struct perf_top *top, struct rb_root *root);
 void perf_top__find_widths(struct perf_top *top, struct rb_root *root,
 			   int *dso_width, int *dso_short_width, int *sym_width);
 
+#ifdef NO_NEWT_SUPPORT
+static inline int perf_top__tui_browser(struct perf_top *top __used)
+{
+	return 0;
+}
+#else
+int perf_top__tui_browser(struct perf_top *top);
+#endif
 #endif /* __PERF_TOP_H */
diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c
new file mode 100644
index 0000000..ca60624
--- /dev/null
+++ b/tools/perf/util/ui/browsers/top.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Parts came from builtin-{top,stat,record}.c, see those files for further
+ * copyright notes.
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+#include "../browser.h"
+#include "../helpline.h"
+#include "../libslang.h"
+#include "../../evlist.h"
+#include "../../hist.h"
+#include "../../sort.h"
+#include "../../symbol.h"
+#include "../../top.h"
+
+struct perf_top_browser {
+	struct ui_browser b;
+	struct rb_root	  root;
+	float		  sum_ksamples;
+	int		  dso_width;
+	int		  dso_short_width;
+	int		  sym_width;
+};
+
+static void perf_top_browser__write(struct ui_browser *browser, void *entry, int row)
+{
+	struct perf_top_browser *top_browser = container_of(browser, struct perf_top_browser, b);
+	struct sym_entry *syme = rb_entry(entry, struct sym_entry, rb_node);
+	bool current_entry = ui_browser__is_current_entry(browser, row);
+	struct symbol *symbol = sym_entry__symbol(syme);
+	struct perf_top *top = browser->priv;
+	int width = browser->width;
+	double pcnt;
+
+	pcnt = 100.0 - (100.0 * ((top_browser->sum_ksamples - syme->snap_count) /
+				 top_browser->sum_ksamples));
+	ui_browser__set_percent_color(browser, pcnt, current_entry);
+
+	if (top->evlist->nr_entries == 1 || !top->display_weighted) {
+		slsmg_printf("%20.2f ", syme->weight);
+		width -= 24;
+	} else {
+		slsmg_printf("%9.1f %10ld ", syme->weight, syme->snap_count);
+		width -= 23;
+	}
+
+	slsmg_printf("%4.1f%%", pcnt);
+	width -= 7;
+
+	if (verbose) {
+		slsmg_printf(" %016" PRIx64, symbol->start);
+		width -= 17;
+	}
+
+	slsmg_printf(" %-*.*s ", top_browser->sym_width, top_browser->sym_width,
+		     symbol->name);
+	width -= top_browser->sym_width;
+	slsmg_write_nstring(width >= syme->map->dso->long_name_len ?
+				syme->map->dso->long_name :
+				syme->map->dso->short_name, width);
+}
+
+static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser)
+{
+	struct perf_top *top = browser->b.priv;
+
+	browser->root = RB_ROOT;
+	browser->b.top = NULL;
+	browser->sum_ksamples = perf_top__decay_samples(top, &browser->root);
+	perf_top__find_widths(top, &browser->root, &browser->dso_width,
+			      &browser->dso_short_width,
+                              &browser->sym_width);
+	if (browser->sym_width + browser->dso_width > browser->b.width - 29) {
+		browser->dso_width = browser->dso_short_width;
+		if (browser->sym_width + browser->dso_width > browser->b.width - 29)
+			browser->sym_width = browser->b.width - browser->dso_width - 29;
+	}
+	browser->b.nr_entries = top->rb_entries;
+}
+
+static int perf_top_browser__run(struct perf_top_browser *browser)
+{
+	int key;
+	char title[160];
+	struct perf_top *top = browser->b.priv;
+	int delay_msecs = top->delay_secs * 1000;
+
+	perf_top_browser__update_rb_tree(browser);
+        perf_top__header_snprintf(top, title, sizeof(title));
+        perf_top__reset_sample_counters(top);
+
+	if (ui_browser__show(&browser->b, title, "ESC: exit") < 0)
+		return -1;
+
+	newtFormSetTimer(browser->b.form, delay_msecs);
+
+	while (1) {
+		key = ui_browser__run(&browser->b);
+
+		switch (key) {
+		case -1:
+			/* FIXME we need to check if it was es.reason == NEWT_EXIT_TIMER */
+			perf_top_browser__update_rb_tree(browser);
+			perf_top__header_snprintf(top, title, sizeof(title));
+			perf_top__reset_sample_counters(top);
+			ui_browser__set_color(&browser->b, NEWT_COLORSET_ROOT);
+			SLsmg_gotorc(0, 0);
+			slsmg_write_nstring(title, browser->b.width);
+			break;
+		case NEWT_KEY_TAB:
+		default:
+			goto out;
+		}
+	}
+out:
+	ui_browser__hide(&browser->b);
+	return key;
+}
+
+int perf_top__tui_browser(struct perf_top *top)
+{
+	struct perf_top_browser browser = {
+		.b = {
+			.entries = &browser.root,
+			.refresh = ui_browser__rb_tree_refresh,
+			.seek	 = ui_browser__rb_tree_seek,
+			.write	 = perf_top_browser__write,
+			.priv	 = top,
+		},
+	};
+
+	ui_helpline__push("Press <- or ESC to exit");
+	return perf_top_browser__run(&browser);
+}
-- 
cgit v0.10.2


From 823c7164a92a6347d46bb64aaae728b6d08f3bb8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 31 Jan 2011 19:45:38 -0200
Subject: perf probe: Use %td for pointer arithmetic result
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

%td is for ptrdiff_t, avoiding this warning on 32-bit:

cc1: warnings being treated as errors
builtin-probe.c: In function ‘opt_set_filter’:
builtin-probe.c:176:4: error: format ‘%ld’ expects type ‘long int’, but
argument 3 has type ‘int’

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index fcde003..2c0e64d 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -173,7 +173,7 @@ static int opt_set_filter(const struct option *opt __used,
 			strfilter__delete(params.filter);
 		params.filter = strfilter__new(str, &err);
 		if (!params.filter) {
-			pr_err("Filter parse error at %ld.\n", err - str + 1);
+			pr_err("Filter parse error at %td.\n", err - str + 1);
 			pr_err("Source: \"%s\"\n", str);
 			pr_err("         %*c\n", (int)(err - str + 1), '^');
 			return -EINVAL;
-- 
cgit v0.10.2


From f6bbc1daac964da551130dbf01809d3fbd178b2d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 31 Jan 2011 20:56:27 -0200
Subject: perf python: Fix build on 32-bit

Where there are lots of errors related to python methods receiving
'char *' for things like file open mode, which break the build, also
disable strict aliasing and fixup some other warnings. Now builds on
both 32-bit and 64-bit fedora systems.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index d2d5217..5317ef2 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -15,6 +15,8 @@ struct throttle_event {
 	u64			 stream_id;
 };
 
+PyMODINIT_FUNC initperf(void);
+
 #define member_def(type, member, ptype, help) \
 	{ #member, ptype, \
 	  offsetof(struct pyrf_event, event) + offsetof(struct type, member), \
@@ -31,17 +33,15 @@ struct pyrf_event {
 	union perf_event   event;
 };
 
-#define T_ULONG_LONG T_ULONG
-
 #define sample_members \
-	sample_member_def(sample_ip, ip, T_ULONG_LONG, "event type"),			 \
+	sample_member_def(sample_ip, ip, T_ULONGLONG, "event type"),			 \
 	sample_member_def(sample_pid, pid, T_INT, "event pid"),			 \
 	sample_member_def(sample_tid, tid, T_INT, "event tid"),			 \
-	sample_member_def(sample_time, time, T_ULONG_LONG, "event timestamp"),		 \
-	sample_member_def(sample_addr, addr, T_ULONG_LONG, "event addr"),		 \
-	sample_member_def(sample_id, id, T_ULONG_LONG, "event id"),			 \
-	sample_member_def(sample_stream_id, stream_id, T_ULONG_LONG, "event stream id"), \
-	sample_member_def(sample_period, period, T_ULONG_LONG, "event period"),		 \
+	sample_member_def(sample_time, time, T_ULONGLONG, "event timestamp"),		 \
+	sample_member_def(sample_addr, addr, T_ULONGLONG, "event addr"),		 \
+	sample_member_def(sample_id, id, T_ULONGLONG, "event id"),			 \
+	sample_member_def(sample_stream_id, stream_id, T_ULONGLONG, "event stream id"), \
+	sample_member_def(sample_period, period, T_ULONGLONG, "event period"),		 \
 	sample_member_def(sample_cpu, cpu, T_UINT, "event cpu"),
 
 static char pyrf_mmap_event__doc[] = PyDoc_STR("perf mmap event object.");
@@ -51,11 +51,11 @@ static PyMemberDef pyrf_mmap_event__members[] = {
 	member_def(perf_event_header, type, T_UINT, "event type"),
 	member_def(mmap_event, pid, T_UINT, "event pid"),
 	member_def(mmap_event, tid, T_UINT, "event tid"),
-	member_def(mmap_event, start, T_ULONG_LONG, "start of the map"),
-	member_def(mmap_event, len, T_ULONG_LONG, "map length"),
-	member_def(mmap_event, pgoff, T_ULONG_LONG, "page offset"),
+	member_def(mmap_event, start, T_ULONGLONG, "start of the map"),
+	member_def(mmap_event, len, T_ULONGLONG, "map length"),
+	member_def(mmap_event, pgoff, T_ULONGLONG, "page offset"),
 	member_def(mmap_event, filename, T_STRING_INPLACE, "backing store"),
-	{ NULL, },
+	{ .name = NULL, },
 };
 
 static PyObject *pyrf_mmap_event__repr(struct pyrf_event *pevent)
@@ -96,8 +96,8 @@ static PyMemberDef pyrf_task_event__members[] = {
 	member_def(fork_event, ppid, T_UINT, "event ppid"),
 	member_def(fork_event, tid, T_UINT, "event tid"),
 	member_def(fork_event, ptid, T_UINT, "event ptid"),
-	member_def(fork_event, time, T_ULONG_LONG, "timestamp"),
-	{ NULL, },
+	member_def(fork_event, time, T_ULONGLONG, "timestamp"),
+	{ .name = NULL, },
 };
 
 static PyObject *pyrf_task_event__repr(struct pyrf_event *pevent)
@@ -130,7 +130,7 @@ static PyMemberDef pyrf_comm_event__members[] = {
 	member_def(comm_event, pid, T_UINT, "event pid"),
 	member_def(comm_event, tid, T_UINT, "event tid"),
 	member_def(comm_event, comm, T_STRING_INPLACE, "process name"),
-	{ NULL, },
+	{ .name = NULL, },
 };
 
 static PyObject *pyrf_comm_event__repr(struct pyrf_event *pevent)
@@ -156,10 +156,10 @@ static char pyrf_throttle_event__doc[] = PyDoc_STR("perf throttle event object."
 static PyMemberDef pyrf_throttle_event__members[] = {
 	sample_members
 	member_def(perf_event_header, type, T_UINT, "event type"),
-	member_def(throttle_event, time, T_ULONG_LONG, "timestamp"),
-	member_def(throttle_event, id, T_ULONG_LONG, "event id"),
-	member_def(throttle_event, stream_id, T_ULONG_LONG, "event stream id"),
-	{ NULL, },
+	member_def(throttle_event, time, T_ULONGLONG, "timestamp"),
+	member_def(throttle_event, id, T_ULONGLONG, "event id"),
+	member_def(throttle_event, stream_id, T_ULONGLONG, "event stream id"),
+	{ .name = NULL, },
 };
 
 static PyObject *pyrf_throttle_event__repr(struct pyrf_event *pevent)
@@ -522,7 +522,7 @@ static PyMethodDef pyrf_evsel__methods[] = {
 		.ml_flags = METH_VARARGS | METH_KEYWORDS,
 		.ml_doc	  = PyDoc_STR("open the event selector file descriptor table.")
 	},
-	{ NULL, }
+	{ .ml_name = NULL, }
 };
 
 static char pyrf_evsel__doc[] = PyDoc_STR("perf event selector list object.");
@@ -551,7 +551,7 @@ struct pyrf_evlist {
 };
 
 static int pyrf_evlist__init(struct pyrf_evlist *pevlist,
-			     PyObject *args, PyObject *kwargs)
+			     PyObject *args, PyObject *kwargs __used)
 {
 	PyObject *pcpus = NULL, *pthreads = NULL;
 	struct cpu_map *cpus;
@@ -613,7 +613,7 @@ static PyObject *pyrf_evlist__poll(struct pyrf_evlist *pevlist,
 }
 
 static PyObject *pyrf_evlist__get_pollfd(struct pyrf_evlist *pevlist,
-					 PyObject *args, PyObject *kwargs)
+					 PyObject *args __used, PyObject *kwargs __used)
 {
 	struct perf_evlist *evlist = &pevlist->evlist;
         PyObject *list = PyList_New(0);
@@ -645,7 +645,7 @@ free_list:
 
 
 static PyObject *pyrf_evlist__add(struct pyrf_evlist *pevlist,
-				  PyObject *args, PyObject *kwargs)
+				  PyObject *args, PyObject *kwargs __used)
 {
 	struct perf_evlist *evlist = &pevlist->evlist;
 	PyObject *pevsel;
@@ -724,7 +724,7 @@ static PyMethodDef pyrf_evlist__methods[] = {
 		.ml_flags = METH_VARARGS | METH_KEYWORDS,
 		.ml_doc	  = PyDoc_STR("reads an event.")
 	},
-	{ NULL, }
+	{ .ml_name = NULL, }
 };
 
 static Py_ssize_t pyrf_evlist__length(PyObject *obj)
@@ -840,11 +840,11 @@ static struct {
 	{ "RECORD_FORK",       PERF_RECORD_FORK },
 	{ "RECORD_READ",       PERF_RECORD_READ },
 	{ "RECORD_SAMPLE",     PERF_RECORD_SAMPLE },
-	{ NULL, },
+	{ .name = NULL, },
 };
 
 static PyMethodDef perf__methods[] = {
-	{ NULL, NULL }
+	{ .ml_name = NULL, }
 };
 
 PyMODINIT_FUNC initperf(void)
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
index 496d7f4..1947b04 100644
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -6,7 +6,8 @@ perf = Extension('perf',
 		  sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c',
 			     'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c',
 			     'util/util.c', 'util/xyarray.c'],
-		  include_dirs = ['util/include'])
+		  include_dirs = ['util/include'],
+		  extra_compile_args = ['-fno-strict-aliasing', '-Wno-write-strings'])
 
 setup(name='perf',
       version='0.1',
-- 
cgit v0.10.2


From 7cf37e87dd2cfa17a64f28ea7f31eed4525f79e4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 09:34:58 +0100
Subject: time: Fix legacy arch fallout

The xtime/dotimer cleanup broke architectures which do not implement
clockevents. Time to send out another __do_IRQ threat.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Torben Hohn <torbenh@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: johnstul@us.ibm.com
Cc: yong.zhang0@gmail.com
Cc: hch@infradead.org
LKML-Reference: <20110127145905.23248.30458.stgit@localhost>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 28c5785..f77b93d 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
+
 #define TICK_DO_TIMER_NONE	-1
 #define TICK_DO_TIMER_BOOT	-2
 
@@ -135,5 +137,7 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
 	return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
 }
 
+#endif
+
 extern void do_timer(unsigned long ticks);
 extern seqlock_t xtime_lock;
-- 
cgit v0.10.2


From 067187fc9f1d09738fc833392e117f125cb6bbad Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 1 Feb 2011 14:57:02 -0200
Subject: perf tools: Remove verbose build messages for the python binding

Also now it builds it in a well known location:

[acme@felicio linux]$ rm -rf ../build/perf/
[acme@felicio linux]$ mkdir ../build/perf
[acme@felicio linux]$ make -j2 O=~acme/git/build/perf -C tools/perf/
<SNIP>
[acme@felicio linux]$ ls -la ../build/perf/python/
total 152
-rwxrwxr-x 1 acme acme 147957 Feb  1 14:56 perf.so
drwxrwxr-x 3 acme acme     17 Feb  1 14:56 temp
[acme@felicio linux]$

[root@felicio ~]# strip ~acme/git/build/perf/python/perf.so
[root@felicio ~]# ls -la ~acme/git/build/perf/python/perf.so
-rwxrwxr-x 1 acme acme 46264 Feb  1 14:58 /home/acme/git/build/perf/python/perf.so

[root@felicio ~]# export PYTHONPATH=~acme/git/build/perf/python/
[root@felicio ~]# ~acme/git/linux/tools/perf/python/twatch.py
cpu:  0, pid: 7751, tid: 7751 { type: exit, pid: 7751, ppid: 7751, tid: 7751, ptid: 7751, time: 54562393512356}
cpu:  0, pid: 13700, tid: 13700 { type: fork, pid: 7756, ppid: 13700, tid: 7756, ptid: 13700, time: 54562393746739}
cpu:  1, pid: 7756, tid: 7756 { type: fork, pid: 7757, ppid: 7756, tid: 7757, ptid: 7756, time: 54562394246152}
cpu:  1, pid: 7757, tid: 7757 { type: comm, pid: 7757, tid: 7757, comm: awk }
cpu:  1, pid: 7757, tid: 7757 { type: exit, pid: 7757, ppid: 7757, tid: 7757, ptid: 7757, time: 54562395456813}

Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 67a9f4d..d1984ee 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -325,9 +325,9 @@ SCRIPT_SH += perf-archive.sh
 grep-libs = $(filter -l%,$(1))
 strip-libs = $(filter-out -l%,$(1))
 
-pyrf: $(PYRF_OBJS)
-	python util/setup.py build --build-base='$(OUTPUT)'
-
+$(OUTPUT)python/perf.so: $(PYRF_OBJS)
+	@python util/setup.py --quiet  build_ext --build-lib='$(OUTPUT)python' \
+						--build-temp='$(OUTPUT)python/temp'
 #
 # No Perl scripts right now:
 #
@@ -348,12 +348,14 @@ PROGRAMS += $(EXTRA_PROGRAMS)
 #
 PROGRAMS += $(OUTPUT)perf
 
+LANG_BINDINGS = $(OUTPUT)python/perf.so
+
 # List built-in command $C whose implementation cmd_$C() is not in
 # builtin-$C.o but is linked in as part of some other command.
 #
 
 # what 'all' will build and 'install' will install, in perfexecdir
-ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) pyrf
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) $(LANG_BINDINGS)
 
 # what 'all' will build but not install in perfexecdir
 OTHER_PROGRAMS = $(OUTPUT)perf$X
@@ -1298,6 +1300,8 @@ clean:
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(MAKE) -C Documentation/ clean
 	$(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)PERF-BUILD-OPTIONS
+	@python util/setup.py clean --build-lib='$(OUTPUT)python' \
+				   --build-temp='$(OUTPUT)python/temp'
 
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
-- 
cgit v0.10.2


From 568bb7b8e856b9efb98a3f63259c717adc1b96b8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 1 Feb 2011 15:05:00 -0200
Subject: perf tools: Fix up 'make clean' target

It wasn't using $(OUTPUT) to rm *.o and there were some funny looking
automake files that never get created but were being deleted anyway.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index d1984ee..85f6549 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -1289,12 +1289,10 @@ distclean: clean
 #	$(RM) configure
 
 clean:
-	$(RM) *.o */*.o */*/*.o */*/*/*.o $(LIB_FILE)
+	$(RM) $(OUTPUT){*.o,*/*.o,*/*/*.o,*/*/*/*.o,$(LIB_FILE),perf-archive}
 	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
 	$(RM) $(TEST_PROGRAMS)
 	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope*
-	$(RM) -r autom4te.cache
-	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache
 	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
 	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
-- 
cgit v0.10.2


From 0015e2e101f5fd3256ab8b5a374c0e8806098871 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 1 Feb 2011 16:18:10 -0200
Subject: perf stat: Fix up resource release order

That was causing a SEGV on selected old distros.

Problem introduced in 7e2ed09.

Reported-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index e0f9575..806a999 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -748,8 +748,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 out_free_fd:
 	list_for_each_entry(pos, &evsel_list->entries, node)
 		perf_evsel__free_stat_priv(pos);
-	perf_evlist__delete(evsel_list);
-out:
 	perf_evlist__delete_maps(evsel_list);
+out:
+	perf_evlist__delete(evsel_list);
 	return status;
 }
-- 
cgit v0.10.2


From 978f626c4e5b9524d1898788d8e34d86dfa00795 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 1 Feb 2011 16:40:51 -0200
Subject: perf tools: Don't try to build python bindings if Python.h not
 available

Just leverage the test done for python support in 'python script',
emitting a warning about losing those features if python-dev[el] is not
installed.

Reported-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 85f6549..4c9499c 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -348,14 +348,14 @@ PROGRAMS += $(EXTRA_PROGRAMS)
 #
 PROGRAMS += $(OUTPUT)perf
 
-LANG_BINDINGS = $(OUTPUT)python/perf.so
+LANG_BINDINGS =
 
 # List built-in command $C whose implementation cmd_$C() is not in
 # builtin-$C.o but is linked in as part of some other command.
 #
 
 # what 'all' will build and 'install' will install, in perfexecdir
-ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) $(LANG_BINDINGS)
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
 
 # what 'all' will build but not install in perfexecdir
 OTHER_PROGRAMS = $(OUTPUT)perf$X
@@ -664,12 +664,14 @@ else
 	PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null`
 	FLAGS_PYTHON_EMBED=$(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
 	ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y)
+		msg := $(warning No Python.h found, install python-dev[el] to have python support in 'perf script' and to build the python bindings)
 		BASIC_CFLAGS += -DNO_LIBPYTHON
 	else
                ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS)
                EXTLIBS += $(PYTHON_EMBED_LIBADD)
 		LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
 		LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
+		LANG_BINDINGS += $(OUTPUT)python/perf.so
 	endif
 endif
 
@@ -956,7 +958,7 @@ export TAR INSTALL DESTDIR SHELL_PATH
 
 SHELL = $(SHELL_PATH)
 
-all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS
+all:: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
 endif
-- 
cgit v0.10.2


From cdb0861c85c03fe80f4da033aab69df949579dc6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 1 Feb 2011 10:51:23 -0800
Subject: perf top: Fix TUI compilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

> +	slsmg_write_nstring(width >= syme->map->dso->long_name_len ?
> +				syme->map->dso->long_name :
> +				syme->map->dso->short_name, width);

need update macro for that calling

util/ui/browsers/top.c: In function ‘perf_top_browser__write’:
util/ui/browsers/top.c:60:2: error: cast to pointer from integer of different size
util/ui/browsers/top.c:60:2: error: comparison between pointer and integer
util/ui/browsers/top.c:60:2: error: passing argument 1 of ‘SLsmg_write_nstring’ discards qualifiers from pointer target type
/usr/include/slang.h:1728:16: note: expected ‘char *’ but argument is of type ‘const char *’
make: *** [util/ui/browsers/top.o] Error 1

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4D48562B.20006@kernel.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/ui/libslang.h b/tools/perf/util/ui/libslang.h
index 5623da8..2b63e1c 100644
--- a/tools/perf/util/ui/libslang.h
+++ b/tools/perf/util/ui/libslang.h
@@ -13,11 +13,11 @@
 
 #if SLANG_VERSION < 20104
 #define slsmg_printf(msg, args...) \
-	SLsmg_printf((char *)msg, ##args)
+	SLsmg_printf((char *)(msg), ##args)
 #define slsmg_write_nstring(msg, len) \
-	SLsmg_write_nstring((char *)msg, len)
+	SLsmg_write_nstring((char *)(msg), len)
 #define sltt_set_color(obj, name, fg, bg) \
-	SLtt_set_color(obj,(char *)name, (char *)fg, (char *)bg)
+	SLtt_set_color(obj,(char *)(name), (char *)(fg), (char *)(bg))
 #else
 #define slsmg_printf SLsmg_printf
 #define slsmg_write_nstring SLsmg_write_nstring
-- 
cgit v0.10.2


From 1e6d767924c74929c0cfe839ae8f37bcee9e544e Mon Sep 17 00:00:00 2001
From: Richard Cochran <richard.cochran@omicron.at>
Date: Tue, 1 Feb 2011 13:50:58 +0000
Subject: time: Correct the *settime* parameters

Both settimeofday() and clock_settime() promise with a 'const'
attribute not to alter the arguments passed in. This patch adds the
missing 'const' attribute into the various kernel functions
implementing these calls.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Acked-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20110201134417.545698637@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index e6d7562..ecd0082 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -487,7 +487,7 @@ static int sgi_clock_get(clockid_t clockid, struct timespec *tp)
 	return 0;
 };
 
-static int sgi_clock_set(clockid_t clockid, struct timespec *tp)
+static int sgi_clock_set(const clockid_t clockid, const struct timespec *tp)
 {
 
 	u64 nsec;
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 3e23844a..b2c14cb 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -69,7 +69,8 @@ struct k_itimer {
 struct k_clock {
 	int res;		/* in nanoseconds */
 	int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
-	int (*clock_set) (const clockid_t which_clock, struct timespec * tp);
+	int (*clock_set) (const clockid_t which_clock,
+			  const struct timespec *tp);
 	int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
 	int (*timer_create) (struct k_itimer *timer);
 	int (*nsleep) (const clockid_t which_clock, int flags,
@@ -89,7 +90,7 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 /* error handlers for timer_create, nanosleep and settime */
 int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *,
 			       struct timespec __user *);
-int do_posix_clock_nosettime(const clockid_t, struct timespec *tp);
+int do_posix_clock_nosettime(const clockid_t, const struct timespec *tp);
 
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
diff --git a/include/linux/security.h b/include/linux/security.h
index c642bb8..c096aa6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -53,7 +53,7 @@ struct audit_krule;
  */
 extern int cap_capable(struct task_struct *tsk, const struct cred *cred,
 		       int cap, int audit);
-extern int cap_settime(struct timespec *ts, struct timezone *tz);
+extern int cap_settime(const struct timespec *ts, const struct timezone *tz);
 extern int cap_ptrace_access_check(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
 extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
@@ -1387,7 +1387,7 @@ struct security_operations {
 	int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
 	int (*quota_on) (struct dentry *dentry);
 	int (*syslog) (int type);
-	int (*settime) (struct timespec *ts, struct timezone *tz);
+	int (*settime) (const struct timespec *ts, const struct timezone *tz);
 	int (*vm_enough_memory) (struct mm_struct *mm, long pages);
 
 	int (*bprm_set_creds) (struct linux_binprm *bprm);
@@ -1669,7 +1669,7 @@ int security_sysctl(struct ctl_table *table, int op);
 int security_quotactl(int cmds, int type, int id, struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
 int security_syslog(int type);
-int security_settime(struct timespec *ts, struct timezone *tz);
+int security_settime(const struct timespec *ts, const struct timezone *tz);
 int security_vm_enough_memory(long pages);
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_vm_enough_memory_kern(long pages);
@@ -1904,7 +1904,8 @@ static inline int security_syslog(int type)
 	return 0;
 }
 
-static inline int security_settime(struct timespec *ts, struct timezone *tz)
+static inline int security_settime(const struct timespec *ts,
+				   const struct timezone *tz)
 {
 	return cap_settime(ts, tz);
 }
diff --git a/include/linux/time.h b/include/linux/time.h
index 38c5206..7c44e77 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -145,8 +145,9 @@ static inline u32 arch_gettimeoffset(void) { return 0; }
 #endif
 
 extern void do_gettimeofday(struct timeval *tv);
-extern int do_settimeofday(struct timespec *tv);
-extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
+extern int do_settimeofday(const struct timespec *tv);
+extern int do_sys_settimeofday(const struct timespec *tv,
+			       const struct timezone *tz);
 #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
 extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags);
 struct itimerval;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb..21b7ca20 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -192,7 +192,7 @@ static int common_clock_get(clockid_t which_clock, struct timespec *tp)
 }
 
 static inline int common_clock_set(const clockid_t which_clock,
-				   struct timespec *tp)
+				   const struct timespec *tp)
 {
 	return do_sys_settimeofday(tp, NULL);
 }
@@ -928,7 +928,7 @@ void exit_itimers(struct signal_struct *sig)
 }
 
 /* Not available / possible... functions */
-int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
+int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp)
 {
 	return -EINVAL;
 }
diff --git a/kernel/time.c b/kernel/time.c
index a31b512..5cb8053 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
  * various programs will get confused when the clock gets warped.
  */
 
-int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
+int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
 {
 	static int firsttime = 1;
 	int error = 0;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 02c13a3..4f9f65b 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
  *
  * Sets the time of day to the new time and update NTP and notify hrtimers
  */
-int do_settimeofday(struct timespec *tv)
+int do_settimeofday(const struct timespec *tv)
 {
 	struct timespec ts_delta;
 	unsigned long flags;
diff --git a/security/commoncap.c b/security/commoncap.c
index 64c2ed9c..dbfdaed 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -93,7 +93,7 @@ int cap_capable(struct task_struct *tsk, const struct cred *cred, int cap,
  * Determine whether the current process may set the system clock and timezone
  * information, returning 0 if permission granted, -ve if denied.
  */
-int cap_settime(struct timespec *ts, struct timezone *tz)
+int cap_settime(const struct timespec *ts, const struct timezone *tz)
 {
 	if (!capable(CAP_SYS_TIME))
 		return -EPERM;
diff --git a/security/security.c b/security/security.c
index 739e403..b995428 100644
--- a/security/security.c
+++ b/security/security.c
@@ -202,7 +202,7 @@ int security_syslog(int type)
 	return security_ops->syslog(type);
 }
 
-int security_settime(struct timespec *ts, struct timezone *tz)
+int security_settime(const struct timespec *ts, const struct timezone *tz)
 {
 	return security_ops->settime(ts, tz);
 }
-- 
cgit v0.10.2


From 65da528d7cc94966cf24d2a1e0837b689159b543 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:01 +0000
Subject: posix-timers: Define nanosleep not supported error separate

Define the conditional nanosleep not supported error value outside of
do_posix_clock_nonanosleep(). Preparatory patch for further cleanups.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134417.643486574@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 21b7ca20..89bff37 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -81,6 +81,14 @@ static DEFINE_SPINLOCK(idr_lock);
 #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
 #endif
 
+/*
+ * parisc wants ENOTSUP instead of EOPNOTSUPP
+ */
+#ifndef ENOTSUP
+# define ENANOSLEEP_NOTSUP EOPNOTSUPP
+#else
+# define ENANOSLEEP_NOTSUP ENOTSUP
+#endif
 
 /*
  * The timer ID is turned into a timer address by idr_find().
@@ -937,11 +945,7 @@ EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
 int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
 			       struct timespec *t, struct timespec __user *r)
 {
-#ifndef ENOTSUP
-	return -EOPNOTSUPP;	/* aka ENOTSUP in userland for POSIX */
-#else  /*  parisc does define it separately.  */
-	return -ENOTSUP;
-#endif
+	return -ENANOSLEEP_NOTSUP;
 }
 EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
 
-- 
cgit v0.10.2


From 2fd1f04089cb657c5d6c484b280ec4d3398aa157 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:03 +0000
Subject: posix-timers: Cleanup struct initializers

Cosmetic. No functional change

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134417.745627057@linutronix.de>

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index ecd0082..fd51cd8 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -765,13 +765,13 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
 
 static struct k_clock sgi_clock = {
 	.res = 0,
-	.clock_set = sgi_clock_set,
-	.clock_get = sgi_clock_get,
-	.timer_create = sgi_timer_create,
-	.nsleep = do_posix_clock_nonanosleep,
-	.timer_set = sgi_timer_set,
-	.timer_del = sgi_timer_del,
-	.timer_get = sgi_timer_get
+	.clock_set	= sgi_clock_set,
+	.clock_get	= sgi_clock_get,
+	.timer_create	= sgi_timer_create,
+	.nsleep		= do_posix_clock_nonanosleep,
+	.timer_set	= sgi_timer_set,
+	.timer_del	= sgi_timer_del,
+	.timer_get	= sgi_timer_get
 };
 
 /**
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb717..11b91dc 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1607,20 +1607,20 @@ static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
 static __init int init_posix_cpu_timers(void)
 {
 	struct k_clock process = {
-		.clock_getres = process_cpu_clock_getres,
-		.clock_get = process_cpu_clock_get,
-		.clock_set = do_posix_clock_nosettime,
-		.timer_create = process_cpu_timer_create,
-		.nsleep = process_cpu_nsleep,
-		.nsleep_restart = process_cpu_nsleep_restart,
+		.clock_getres	= process_cpu_clock_getres,
+		.clock_get	= process_cpu_clock_get,
+		.clock_set	= do_posix_clock_nosettime,
+		.timer_create	= process_cpu_timer_create,
+		.nsleep		= process_cpu_nsleep,
+		.nsleep_restart	= process_cpu_nsleep_restart,
 	};
 	struct k_clock thread = {
-		.clock_getres = thread_cpu_clock_getres,
-		.clock_get = thread_cpu_clock_get,
-		.clock_set = do_posix_clock_nosettime,
-		.timer_create = thread_cpu_timer_create,
-		.nsleep = thread_cpu_nsleep,
-		.nsleep_restart = thread_cpu_nsleep_restart,
+		.clock_getres	= thread_cpu_clock_getres,
+		.clock_get	= thread_cpu_clock_get,
+		.clock_set	= do_posix_clock_nosettime,
+		.timer_create	= thread_cpu_timer_create,
+		.nsleep		= thread_cpu_nsleep,
+		.nsleep_restart	= thread_cpu_nsleep_restart,
 	};
 	struct timespec ts;
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 89bff37..e7d26af 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -281,33 +281,33 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
 static __init int init_posix_timers(void)
 {
 	struct k_clock clock_realtime = {
-		.clock_getres = hrtimer_get_res,
+		.clock_getres	= hrtimer_get_res,
 	};
 	struct k_clock clock_monotonic = {
-		.clock_getres = hrtimer_get_res,
-		.clock_get = posix_ktime_get_ts,
-		.clock_set = do_posix_clock_nosettime,
+		.clock_getres	= hrtimer_get_res,
+		.clock_get	= posix_ktime_get_ts,
+		.clock_set	= do_posix_clock_nosettime,
 	};
 	struct k_clock clock_monotonic_raw = {
-		.clock_getres = hrtimer_get_res,
-		.clock_get = posix_get_monotonic_raw,
-		.clock_set = do_posix_clock_nosettime,
-		.timer_create = no_timer_create,
-		.nsleep = no_nsleep,
+		.clock_getres	= hrtimer_get_res,
+		.clock_get	= posix_get_monotonic_raw,
+		.clock_set	= do_posix_clock_nosettime,
+		.timer_create	= no_timer_create,
+		.nsleep		= no_nsleep,
 	};
 	struct k_clock clock_realtime_coarse = {
-		.clock_getres = posix_get_coarse_res,
-		.clock_get = posix_get_realtime_coarse,
-		.clock_set = do_posix_clock_nosettime,
-		.timer_create = no_timer_create,
-		.nsleep = no_nsleep,
+		.clock_getres	= posix_get_coarse_res,
+		.clock_get	= posix_get_realtime_coarse,
+		.clock_set	= do_posix_clock_nosettime,
+		.timer_create	= no_timer_create,
+		.nsleep		= no_nsleep,
 	};
 	struct k_clock clock_monotonic_coarse = {
-		.clock_getres = posix_get_coarse_res,
-		.clock_get = posix_get_monotonic_coarse,
-		.clock_set = do_posix_clock_nosettime,
-		.timer_create = no_timer_create,
-		.nsleep = no_nsleep,
+		.clock_getres	= posix_get_coarse_res,
+		.clock_get	= posix_get_monotonic_coarse,
+		.clock_set	= do_posix_clock_nosettime,
+		.timer_create	= no_timer_create,
+		.nsleep		= no_nsleep,
 	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
-- 
cgit v0.10.2


From 1976945eeaab5fa461735a6225a82c3cf1e65d62 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:06 +0000
Subject: posix-timers: Introduce clock_posix_cpu

The CLOCK_DISPATCH() macro is a horrible magic. We call common
functions if a function pointer is not set. That's just backwards.

To support dynamic file decriptor based clocks we need to cleanup that
dispatch logic.

Create a k_clock struct clock_posix_cpu which has all the
posix-cpu-timer functions filled in. After the cleanup the functions
can be made static.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134417.841974553@linutronix.de>

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index b2c14cb..1330ff3 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -85,6 +85,8 @@ struct k_clock {
 			   struct itimerspec * cur_setting);
 };
 
+extern struct k_clock clock_posix_cpu;
+
 void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 
 /* error handlers for timer_create, nanosleep and settime */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 11b91dc..816cd49 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1604,6 +1604,18 @@ static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
 	return -EINVAL;
 }
 
+struct k_clock clock_posix_cpu = {
+	.clock_getres	= posix_cpu_clock_getres,
+	.clock_set	= posix_cpu_clock_set,
+	.clock_get	= posix_cpu_clock_get,
+	.timer_create	= posix_cpu_timer_create,
+	.nsleep		= posix_cpu_nsleep,
+	.nsleep_restart	= posix_cpu_nsleep_restart,
+	.timer_set	= posix_cpu_timer_set,
+	.timer_del	= posix_cpu_timer_del,
+	.timer_get	= posix_cpu_timer_get,
+};
+
 static __init int init_posix_cpu_timers(void)
 {
 	struct k_clock process = {
-- 
cgit v0.10.2


From cc785ac22b17ed53e8ff5c1501e422be6d10be3c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:09 +0000
Subject: posix-timers: Introduce clockid_to_kclock()

New function to find the kclock for a given clockid.

Returns a pointer to clock_posix_cpu if clockid < 0. If clockid >=
MAXCLOCK or if the clock_getres pointer is not set it returns
NULL. For valid clocks it returns a pointer to the matching
posix_clock.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <johnstul@us.ibm.com>
Acked-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134417.938447839@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e7d26af..14b0a70 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -531,6 +531,16 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
+static struct k_clock *clockid_to_kclock(const clockid_t id)
+{
+	if (id < 0)
+		return &clock_posix_cpu;
+
+	if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
+		return NULL;
+	return &posix_clocks[id];
+}
+
 /* Create a POSIX.1b interval timer. */
 
 SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
-- 
cgit v0.10.2


From a5cd2880106cb2c79b3fe24f1c53dadba6a542a0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:11 +0000
Subject: posix-timers: Convert clock_nanosleep to clockid_to_kclock()

Use the new kclock decoding function in clock_nanosleep and cleanup all
kclocks which use the default functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.034175556@linutronix.de>

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index fd51cd8..262d104 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -768,7 +768,6 @@ static struct k_clock sgi_clock = {
 	.clock_set	= sgi_clock_set,
 	.clock_get	= sgi_clock_get,
 	.timer_create	= sgi_timer_create,
-	.nsleep		= do_posix_clock_nonanosleep,
 	.timer_set	= sgi_timer_set,
 	.timer_del	= sgi_timer_del,
 	.timer_get	= sgi_timer_get
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 1330ff3..cd6da06 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -90,8 +90,6 @@ extern struct k_clock clock_posix_cpu;
 void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 
 /* error handlers for timer_create, nanosleep and settime */
-int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *,
-			       struct timespec __user *);
 int do_posix_clock_nosettime(const clockid_t, const struct timespec *tp);
 
 /* function to call to trigger timer event */
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 14b0a70..ee69b21 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -216,12 +216,6 @@ static int no_timer_create(struct k_itimer *new_timer)
 	return -EOPNOTSUPP;
 }
 
-static int no_nsleep(const clockid_t which_clock, int flags,
-		     struct timespec *tsave, struct timespec __user *rmtp)
-{
-	return -EOPNOTSUPP;
-}
-
 /*
  * Return nonzero if we know a priori this clockid_t value is bogus.
  */
@@ -282,32 +276,31 @@ static __init int init_posix_timers(void)
 {
 	struct k_clock clock_realtime = {
 		.clock_getres	= hrtimer_get_res,
+		.nsleep		= common_nsleep,
 	};
 	struct k_clock clock_monotonic = {
 		.clock_getres	= hrtimer_get_res,
 		.clock_get	= posix_ktime_get_ts,
 		.clock_set	= do_posix_clock_nosettime,
+		.nsleep		= common_nsleep,
 	};
 	struct k_clock clock_monotonic_raw = {
 		.clock_getres	= hrtimer_get_res,
 		.clock_get	= posix_get_monotonic_raw,
 		.clock_set	= do_posix_clock_nosettime,
 		.timer_create	= no_timer_create,
-		.nsleep		= no_nsleep,
 	};
 	struct k_clock clock_realtime_coarse = {
 		.clock_getres	= posix_get_coarse_res,
 		.clock_get	= posix_get_realtime_coarse,
 		.clock_set	= do_posix_clock_nosettime,
 		.timer_create	= no_timer_create,
-		.nsleep		= no_nsleep,
 	};
 	struct k_clock clock_monotonic_coarse = {
 		.clock_getres	= posix_get_coarse_res,
 		.clock_get	= posix_get_monotonic_coarse,
 		.clock_set	= do_posix_clock_nosettime,
 		.timer_create	= no_timer_create,
-		.nsleep		= no_nsleep,
 	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
@@ -952,13 +945,6 @@ int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp)
 }
 EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
 
-int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
-			       struct timespec *t, struct timespec __user *r)
-{
-	return -ENANOSLEEP_NOTSUP;
-}
-EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
-
 SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		const struct timespec __user *, tp)
 {
@@ -1023,10 +1009,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		const struct timespec __user *, rqtp,
 		struct timespec __user *, rmtp)
 {
+	struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec t;
 
-	if (invalid_clockid(which_clock))
+	if (!kc)
 		return -EINVAL;
+	if (!kc->nsleep)
+		return -ENANOSLEEP_NOTSUP;
 
 	if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
 		return -EFAULT;
@@ -1034,8 +1023,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 	if (!timespec_valid(&t))
 		return -EINVAL;
 
-	return CLOCK_DISPATCH(which_clock, nsleep,
-			      (which_clock, flags, &t, rmtp));
+	return kc->nsleep(which_clock, flags, &t, rmtp);
 }
 
 /*
-- 
cgit v0.10.2


From 59bd5bc24aa69f6c62da1e242c16f09f667def96 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:17 +0000
Subject: posix-timers: Convert clock_nanosleep_restart to clockid_to_kclock()

Use the new kclock decoding function in clock_nanosleep_restart.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.131263211@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ee69b21..4dd86d1 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -277,12 +277,14 @@ static __init int init_posix_timers(void)
 	struct k_clock clock_realtime = {
 		.clock_getres	= hrtimer_get_res,
 		.nsleep		= common_nsleep,
+		.nsleep_restart	= hrtimer_nanosleep_restart,
 	};
 	struct k_clock clock_monotonic = {
 		.clock_getres	= hrtimer_get_res,
 		.clock_get	= posix_ktime_get_ts,
 		.clock_set	= do_posix_clock_nosettime,
 		.nsleep		= common_nsleep,
+		.nsleep_restart	= hrtimer_nanosleep_restart,
 	};
 	struct k_clock clock_monotonic_raw = {
 		.clock_getres	= hrtimer_get_res,
@@ -1027,22 +1029,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 }
 
 /*
- * nanosleep_restart for monotonic and realtime clocks
- */
-static int common_nsleep_restart(struct restart_block *restart_block)
-{
-	return hrtimer_nanosleep_restart(restart_block);
-}
-
-/*
  * This will restart clock_nanosleep. This is required only by
  * compat_clock_nanosleep_restart for now.
  */
-long
-clock_nanosleep_restart(struct restart_block *restart_block)
+long clock_nanosleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->arg0;
+	struct k_clock *kc = clockid_to_kclock(which_clock);
+
+	if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
+		return -EINVAL;
 
-	return CLOCK_DISPATCH(which_clock, nsleep_restart,
-			      (restart_block));
+	return kc->nsleep_restart(restart_block);
 }
-- 
cgit v0.10.2


From 3751f9f29bcbc19bd10e92254a273486f150c245 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:20 +0000
Subject: posix-timers: Cleanup restart_block usage

posix timers still use the legacy arg0-arg3 members of
restart_block. Use restart_block.nanosleep instead

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.232288779@linutronix.de>

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 816cd49..9e617b0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1485,7 +1485,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		     struct timespec *rqtp, struct timespec __user *rmtp)
 {
 	struct restart_block *restart_block =
-	    &current_thread_info()->restart_block;
+		&current_thread_info()->restart_block;
 	struct itimerspec it;
 	int error;
 
@@ -1501,50 +1501,42 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 
 	if (error == -ERESTART_RESTARTBLOCK) {
 
-	       	if (flags & TIMER_ABSTIME)
+		if (flags & TIMER_ABSTIME)
 			return -ERESTARTNOHAND;
 		/*
-	 	 * Report back to the user the time still remaining.
-	 	 */
-		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+		 * Report back to the user the time still remaining.
+		 */
+		if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
 
 		restart_block->fn = posix_cpu_nsleep_restart;
-		restart_block->arg0 = which_clock;
-		restart_block->arg1 = (unsigned long) rmtp;
-		restart_block->arg2 = rqtp->tv_sec;
-		restart_block->arg3 = rqtp->tv_nsec;
+		restart_block->nanosleep.index = which_clock;
+		restart_block->nanosleep.rmtp = rmtp;
+		restart_block->nanosleep.expires = timespec_to_ns(rqtp);
 	}
 	return error;
 }
 
 long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
-	clockid_t which_clock = restart_block->arg0;
-	struct timespec __user *rmtp;
+	clockid_t which_clock = restart_block->nanosleep.index;
 	struct timespec t;
 	struct itimerspec it;
 	int error;
 
-	rmtp = (struct timespec __user *) restart_block->arg1;
-	t.tv_sec = restart_block->arg2;
-	t.tv_nsec = restart_block->arg3;
+	t = ns_to_timespec(restart_block->nanosleep.expires);
 
-	restart_block->fn = do_no_restart_syscall;
 	error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
 
 	if (error == -ERESTART_RESTARTBLOCK) {
+		struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
 		/*
-	 	 * Report back to the user the time still remaining.
-	 	 */
-		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+		 * Report back to the user the time still remaining.
+		 */
+		if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
 
-		restart_block->fn = posix_cpu_nsleep_restart;
-		restart_block->arg0 = which_clock;
-		restart_block->arg1 = (unsigned long) rmtp;
-		restart_block->arg2 = t.tv_sec;
-		restart_block->arg3 = t.tv_nsec;
+		restart_block->nanosleep.expires = timespec_to_ns(&t);
 	}
 	return error;
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4dd86d1..4762986 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1034,7 +1034,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
  */
 long clock_nanosleep_restart(struct restart_block *restart_block)
 {
-	clockid_t which_clock = restart_block->arg0;
+	clockid_t which_clock = restart_block->nanosleep.index;
 	struct k_clock *kc = clockid_to_kclock(which_clock);
 
 	if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
-- 
cgit v0.10.2


From d608c18203a969e5d14572a9861c646d0bb66872 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:43 +0000
Subject: thread_info: Remove legacy arg0-3 from restart_block

posix timers were the last users of the legacy arg0-3 members of
restart_block. Remove the cruft.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.326209775@linutronix.de>

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index c906965..20fc303 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -18,9 +18,6 @@ struct compat_timespec;
 struct restart_block {
 	long (*fn)(struct restart_block *);
 	union {
-		struct {
-			unsigned long arg0, arg1, arg2, arg3;
-		};
 		/* For futex_wait and futex_wait_requeue_pi */
 		struct {
 			u32 __user *uaddr;
-- 
cgit v0.10.2


From 79c9da0d0539fb341a1b48a2a5a23d974726de90 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:45 +0000
Subject: posix-cpu-timers: Remove the stub nanosleep functions

CLOCK_THREAD_CPUTIME_ID implements stub functions for nanosleep and
nanosleep_restart, which return -EINVAL. That return value is
wrong. The correct return value is -ENOTSUP.

Remove the stubs and let the new dispatch code return the correct
error code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.422446502@linutronix.de>

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9e617b0..8dc4cd7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1586,15 +1586,6 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
 	timer->it_clock = THREAD_CLOCK;
 	return posix_cpu_timer_create(timer);
 }
-static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
-			      struct timespec *rqtp, struct timespec __user *rmtp)
-{
-	return -EINVAL;
-}
-static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
-{
-	return -EINVAL;
-}
 
 struct k_clock clock_posix_cpu = {
 	.clock_getres	= posix_cpu_clock_getres,
@@ -1623,8 +1614,6 @@ static __init int init_posix_cpu_timers(void)
 		.clock_get	= thread_cpu_clock_get,
 		.clock_set	= do_posix_clock_nosettime,
 		.timer_create	= thread_cpu_timer_create,
-		.nsleep		= thread_cpu_nsleep,
-		.nsleep_restart	= thread_cpu_nsleep_restart,
 	};
 	struct timespec ts;
 
-- 
cgit v0.10.2


From 26f9a4796af330173d790c8d2b5e2efcc489e755 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:48 +0000
Subject: posix-timers: Convert clock_settime to clockid_to_kclock()

Use the new kclock decoding function in clock_settime and cleanup all
kclocks which use the default functions. Rename the misnomed
common_clock_set() to posix_clock_realtime_set().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.518851246@linutronix.de>

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index cd6da06..4aaf0c5 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -89,9 +89,6 @@ extern struct k_clock clock_posix_cpu;
 
 void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 
-/* error handlers for timer_create, nanosleep and settime */
-int do_posix_clock_nosettime(const clockid_t, const struct timespec *tp);
-
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
 
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8dc4cd7..504fbab 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1604,7 +1604,6 @@ static __init int init_posix_cpu_timers(void)
 	struct k_clock process = {
 		.clock_getres	= process_cpu_clock_getres,
 		.clock_get	= process_cpu_clock_get,
-		.clock_set	= do_posix_clock_nosettime,
 		.timer_create	= process_cpu_timer_create,
 		.nsleep		= process_cpu_nsleep,
 		.nsleep_restart	= process_cpu_nsleep_restart,
@@ -1612,7 +1611,6 @@ static __init int init_posix_cpu_timers(void)
 	struct k_clock thread = {
 		.clock_getres	= thread_cpu_clock_getres,
 		.clock_get	= thread_cpu_clock_get,
-		.clock_set	= do_posix_clock_nosettime,
 		.timer_create	= thread_cpu_timer_create,
 	};
 	struct timespec ts;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4762986..49f358c 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -199,12 +199,6 @@ static int common_clock_get(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
-static inline int common_clock_set(const clockid_t which_clock,
-				   const struct timespec *tp)
-{
-	return do_sys_settimeofday(tp, NULL);
-}
-
 static int common_timer_create(struct k_itimer *new_timer)
 {
 	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
@@ -232,6 +226,13 @@ static inline int invalid_clockid(const clockid_t which_clock)
 	return 1;
 }
 
+/* Set clock_realtime */
+static int posix_clock_realtime_set(const clockid_t which_clock,
+				    const struct timespec *tp)
+{
+	return do_sys_settimeofday(tp, NULL);
+}
+
 /*
  * Get monotonic time for posix timers
  */
@@ -276,32 +277,29 @@ static __init int init_posix_timers(void)
 {
 	struct k_clock clock_realtime = {
 		.clock_getres	= hrtimer_get_res,
+		.clock_set	= posix_clock_realtime_set,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
 	};
 	struct k_clock clock_monotonic = {
 		.clock_getres	= hrtimer_get_res,
 		.clock_get	= posix_ktime_get_ts,
-		.clock_set	= do_posix_clock_nosettime,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
 	};
 	struct k_clock clock_monotonic_raw = {
 		.clock_getres	= hrtimer_get_res,
 		.clock_get	= posix_get_monotonic_raw,
-		.clock_set	= do_posix_clock_nosettime,
 		.timer_create	= no_timer_create,
 	};
 	struct k_clock clock_realtime_coarse = {
 		.clock_getres	= posix_get_coarse_res,
 		.clock_get	= posix_get_realtime_coarse,
-		.clock_set	= do_posix_clock_nosettime,
 		.timer_create	= no_timer_create,
 	};
 	struct k_clock clock_monotonic_coarse = {
 		.clock_getres	= posix_get_coarse_res,
 		.clock_get	= posix_get_monotonic_coarse,
-		.clock_set	= do_posix_clock_nosettime,
 		.timer_create	= no_timer_create,
 	};
 
@@ -940,24 +938,19 @@ void exit_itimers(struct signal_struct *sig)
 	}
 }
 
-/* Not available / possible... functions */
-int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp)
-{
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
-
 SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		const struct timespec __user *, tp)
 {
+	struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec new_tp;
 
-	if (invalid_clockid(which_clock))
+	if (!kc || !kc->clock_set)
 		return -EINVAL;
+
 	if (copy_from_user(&new_tp, tp, sizeof (*tp)))
 		return -EFAULT;
 
-	return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
+	return kc->clock_set(which_clock, &new_tp);
 }
 
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
-- 
cgit v0.10.2


From 42285777631aa0654fbb6442057b3e176445c6c5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:50 +0000
Subject: posix-timers: Convert clock_gettime() to clockid_to_kclock()

Use the new kclock decoding mechanism and rename the misnomed
common_clock_get() to posix_clock_realtime_get().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.611097203@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 49f358c..d9e5edf 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -190,15 +190,6 @@ static inline int common_clock_getres(const clockid_t which_clock,
 	return 0;
 }
 
-/*
- * Get real time for posix timers
- */
-static int common_clock_get(clockid_t which_clock, struct timespec *tp)
-{
-	ktime_get_real_ts(tp);
-	return 0;
-}
-
 static int common_timer_create(struct k_itimer *new_timer)
 {
 	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
@@ -226,6 +217,13 @@ static inline int invalid_clockid(const clockid_t which_clock)
 	return 1;
 }
 
+/* Get clock_realtime */
+static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
+{
+	ktime_get_real_ts(tp);
+	return 0;
+}
+
 /* Set clock_realtime */
 static int posix_clock_realtime_set(const clockid_t which_clock,
 				    const struct timespec *tp)
@@ -277,6 +275,7 @@ static __init int init_posix_timers(void)
 {
 	struct k_clock clock_realtime = {
 		.clock_getres	= hrtimer_get_res,
+		.clock_get	= posix_clock_realtime_get,
 		.clock_set	= posix_clock_realtime_set,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
@@ -956,18 +955,21 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 		struct timespec __user *,tp)
 {
+	struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec kernel_tp;
 	int error;
 
-	if (invalid_clockid(which_clock))
+	if (!kc)
 		return -EINVAL;
-	error = CLOCK_DISPATCH(which_clock, clock_get,
-			       (which_clock, &kernel_tp));
+	if (!kc->clock_get)
+		return -EOPNOTSUPP;
+
+	error = kc->clock_get(which_clock, &kernel_tp);
+
 	if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
 		error = -EFAULT;
 
 	return error;
-
 }
 
 SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
-- 
cgit v0.10.2


From 4359ac0ace1a2a267927390ad27f781a2f8e0ab8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 2 Feb 2011 11:45:23 +0100
Subject: posix-timers: Make clock_getres and clock_get mandatory

Richard said: "I would think that we can require k_clocks to provide
the read function. This could be checked and enforced in
register_posix_clock()."

Add checks for clock_getres and clock_get in the register function.

Suggested-by: Richard Cochran <richardcochran@gmail.com>
Cc: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d9e5edf..7f66143 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -485,7 +485,18 @@ static struct pid *good_sigevent(sigevent_t * event)
 void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
 {
 	if ((unsigned) clock_id >= MAX_CLOCKS) {
-		printk("POSIX clock register failed for clock_id %d\n",
+		printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
+		       clock_id);
+		return;
+	}
+
+	if (!new_clock->clock_get) {
+		printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
+		       clock_id);
+		return;
+	}
+	if (!new_clock->clock_getres) {
+		printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
 		       clock_id);
 		return;
 	}
@@ -961,8 +972,6 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 
 	if (!kc)
 		return -EINVAL;
-	if (!kc->clock_get)
-		return -EOPNOTSUPP;
 
 	error = kc->clock_get(which_clock, &kernel_tp);
 
-- 
cgit v0.10.2


From e5e542eea9075dd008993c2ee80b2cc9f31fc494 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:53 +0000
Subject: posix-timers: Convert clock_getres() to clockid_to_kclock()

Use the new kclock decoding. Fixup the fallout in mmtimer.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.709802797@linutronix.de>

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index 262d104..141ffae 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -53,6 +53,8 @@ MODULE_LICENSE("GPL");
 
 #define RTC_BITS 55 /* 55 bits for this implementation */
 
+static struct k_clock sgi_clock;
+
 extern unsigned long sn_rtc_cycles_per_second;
 
 #define RTC_COUNTER_ADDR        ((long *)LOCAL_MMR_ADDR(SH_RTC))
@@ -763,10 +765,18 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
 	return err;
 }
 
+static int sgi_clock_getres(const clockid_t which_clock, struct timespec *tp)
+{
+	tp->tv_sec = 0;
+	tp->tv_nsec = sgi_clock.res;
+	return 0;
+}
+
 static struct k_clock sgi_clock = {
 	.res = 0,
 	.clock_set	= sgi_clock_set,
 	.clock_get	= sgi_clock_get,
+	.clock_getres	= sgi_clock_getres,
 	.timer_create	= sgi_timer_create,
 	.timer_set	= sgi_timer_set,
 	.timer_del	= sgi_timer_del,
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7f66143..748497f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -182,14 +182,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
  * the function pointer CALL in struct k_clock.
  */
 
-static inline int common_clock_getres(const clockid_t which_clock,
-				      struct timespec *tp)
-{
-	tp->tv_sec = 0;
-	tp->tv_nsec = posix_clocks[which_clock].res;
-	return 0;
-}
-
 static int common_timer_create(struct k_itimer *new_timer)
 {
 	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
@@ -984,18 +976,17 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 		struct timespec __user *, tp)
 {
+	struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec rtn_tp;
 	int error;
 
-	if (invalid_clockid(which_clock))
+	if (!kc)
 		return -EINVAL;
 
-	error = CLOCK_DISPATCH(which_clock, clock_getres,
-			       (which_clock, &rtn_tp));
+	error = kc->clock_getres(which_clock, &rtn_tp);
 
-	if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
+	if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
 		error = -EFAULT;
-	}
 
 	return error;
 }
-- 
cgit v0.10.2


From ebaac757acae0431e2c79c00e09f1debdabbddd7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:56 +0000
Subject: posix-timers: Remove useless res field from k_clock

The res member of kclock is only used by mmtimer.c, but even there it
contains redundant information. Remove the field and fixup mmtimer.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.808714587@linutronix.de>

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index 141ffae..ff41eb3 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -768,12 +768,11 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
 static int sgi_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
 	tp->tv_sec = 0;
-	tp->tv_nsec = sgi_clock.res;
+	tp->tv_nsec = sgi_clock_period;
 	return 0;
 }
 
 static struct k_clock sgi_clock = {
-	.res = 0,
 	.clock_set	= sgi_clock_set,
 	.clock_get	= sgi_clock_get,
 	.clock_getres	= sgi_clock_getres,
@@ -840,7 +839,7 @@ static int __init mmtimer_init(void)
 			(unsigned long) node);
 	}
 
-	sgi_clock_period = sgi_clock.res = NSEC_PER_SEC / sn_rtc_cycles_per_second;
+	sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second;
 	register_posix_clock(CLOCK_SGI_CYCLE, &sgi_clock);
 
 	printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION,
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 4aaf0c5..ef574d1 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -67,7 +67,6 @@ struct k_itimer {
 };
 
 struct k_clock {
-	int res;		/* in nanoseconds */
 	int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
 	int (*clock_set) (const clockid_t which_clock,
 			  const struct timespec *tp);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 748497f..f9142a9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -204,8 +204,6 @@ static inline int invalid_clockid(const clockid_t which_clock)
 		return 1;
 	if (posix_clocks[which_clock].clock_getres != NULL)
 		return 0;
-	if (posix_clocks[which_clock].res != 0)
-		return 0;
 	return 1;
 }
 
-- 
cgit v0.10.2


From 838394fbf989973ec7f5a0ad82cb6ff09e5c39aa Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:51:58 +0000
Subject: posix-timers: Convert timer_create() to clockid_to_kclock()

Setup timer_create for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and
remove the no_timer_create() implementation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134418.903604289@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index f9142a9..4f71382 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -146,6 +146,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
  */
 static int common_nsleep(const clockid_t, int flags, struct timespec *t,
 			 struct timespec __user *rmtp);
+static int common_timer_create(struct k_itimer *new_timer);
 static void common_timer_get(struct k_itimer *, struct itimerspec *);
 static int common_timer_set(struct k_itimer *, int,
 			    struct itimerspec *, struct itimerspec *);
@@ -175,25 +176,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
  	  ? (*posix_clocks[clock].call) arglist : common_##call arglist))
 
 /*
- * Default clock hook functions when the struct k_clock passed
- * to register_posix_clock leaves a function pointer null.
- *
- * The function common_CALL is the default implementation for
- * the function pointer CALL in struct k_clock.
- */
-
-static int common_timer_create(struct k_itimer *new_timer)
-{
-	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
-	return 0;
-}
-
-static int no_timer_create(struct k_itimer *new_timer)
-{
-	return -EOPNOTSUPP;
-}
-
-/*
  * Return nonzero if we know a priori this clockid_t value is bogus.
  */
 static inline int invalid_clockid(const clockid_t which_clock)
@@ -269,27 +251,26 @@ static __init int init_posix_timers(void)
 		.clock_set	= posix_clock_realtime_set,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
+		.timer_create	= common_timer_create,
 	};
 	struct k_clock clock_monotonic = {
 		.clock_getres	= hrtimer_get_res,
 		.clock_get	= posix_ktime_get_ts,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
+		.timer_create	= common_timer_create,
 	};
 	struct k_clock clock_monotonic_raw = {
 		.clock_getres	= hrtimer_get_res,
 		.clock_get	= posix_get_monotonic_raw,
-		.timer_create	= no_timer_create,
 	};
 	struct k_clock clock_realtime_coarse = {
 		.clock_getres	= posix_get_coarse_res,
 		.clock_get	= posix_get_realtime_coarse,
-		.timer_create	= no_timer_create,
 	};
 	struct k_clock clock_monotonic_coarse = {
 		.clock_getres	= posix_get_coarse_res,
 		.clock_get	= posix_get_monotonic_coarse,
-		.timer_create	= no_timer_create,
 	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
@@ -534,19 +515,28 @@ static struct k_clock *clockid_to_kclock(const clockid_t id)
 	return &posix_clocks[id];
 }
 
+static int common_timer_create(struct k_itimer *new_timer)
+{
+	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
+	return 0;
+}
+
 /* Create a POSIX.1b interval timer. */
 
 SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
 		struct sigevent __user *, timer_event_spec,
 		timer_t __user *, created_timer_id)
 {
+	struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct k_itimer *new_timer;
 	int error, new_timer_id;
 	sigevent_t event;
 	int it_id_set = IT_ID_NOT_SET;
 
-	if (invalid_clockid(which_clock))
+	if (!kc)
 		return -EINVAL;
+	if (!kc->timer_create)
+		return -EOPNOTSUPP;
 
 	new_timer = alloc_posix_timer();
 	if (unlikely(!new_timer))
@@ -608,7 +598,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
 		goto out;
 	}
 
-	error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
+	error = kc->timer_create(new_timer);
 	if (error)
 		goto out;
 
@@ -618,7 +608,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
 	spin_unlock_irq(&current->sighand->siglock);
 
 	return 0;
- 	/*
+	/*
 	 * In the case of the timer belonging to another task, after
 	 * the task is unlocked, the timer is owned by the other task
 	 * and may cease to exist at any time.  Don't use or modify
-- 
cgit v0.10.2


From 27722df16ef143017db55ac7baac1703a68017ff Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:52:01 +0000
Subject: posix-timers: Convert timer_settime() to clockid_to_kclock()

Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks
and use the new decoding function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134419.001863714@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4f71382..a4dbfe7 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -252,6 +252,7 @@ static __init int init_posix_timers(void)
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
 		.timer_create	= common_timer_create,
+		.timer_set	= common_timer_set,
 	};
 	struct k_clock clock_monotonic = {
 		.clock_getres	= hrtimer_get_res,
@@ -259,6 +260,7 @@ static __init int init_posix_timers(void)
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
 		.timer_create	= common_timer_create,
+		.timer_set	= common_timer_set,
 	};
 	struct k_clock clock_monotonic_raw = {
 		.clock_getres	= hrtimer_get_res,
@@ -814,6 +816,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 	int error = 0;
 	unsigned long flag;
 	struct itimerspec *rtn = old_setting ? &old_spec : NULL;
+	struct k_clock *kc;
 
 	if (!new_setting)
 		return -EINVAL;
@@ -829,8 +832,11 @@ retry:
 	if (!timr)
 		return -EINVAL;
 
-	error = CLOCK_DISPATCH(timr->it_clock, timer_set,
-			       (timr, flags, &new_spec, rtn));
+	kc = clockid_to_kclock(timr->it_clock);
+	if (WARN_ON_ONCE(!kc || !kc->timer_set))
+		error = -EINVAL;
+	else
+		error = kc->timer_set(timr, flags, &new_spec, rtn);
 
 	unlock_timer(timr, flag);
 	if (error == TIMER_RETRY) {
-- 
cgit v0.10.2


From a7319fa253a549c4c6528fb550ae6e72a9c83811 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:52:04 +0000
Subject: posix-timers: Convert timer_gettime() to clockid_to_kclock()

Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks
and use the new decoding function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134419.101243181@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a4dbfe7..c1e2636 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -253,6 +253,7 @@ static __init int init_posix_timers(void)
 		.nsleep_restart	= hrtimer_nanosleep_restart,
 		.timer_create	= common_timer_create,
 		.timer_set	= common_timer_set,
+		.timer_get	= common_timer_get,
 	};
 	struct k_clock clock_monotonic = {
 		.clock_getres	= hrtimer_get_res,
@@ -261,6 +262,7 @@ static __init int init_posix_timers(void)
 		.nsleep_restart	= hrtimer_nanosleep_restart,
 		.timer_create	= common_timer_create,
 		.timer_set	= common_timer_set,
+		.timer_get	= common_timer_get,
 	};
 	struct k_clock clock_monotonic_raw = {
 		.clock_getres	= hrtimer_get_res,
@@ -712,22 +714,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 		struct itimerspec __user *, setting)
 {
-	struct k_itimer *timr;
 	struct itimerspec cur_setting;
+	struct k_itimer *timr;
+	struct k_clock *kc;
 	unsigned long flags;
+	int ret = 0;
 
 	timr = lock_timer(timer_id, &flags);
 	if (!timr)
 		return -EINVAL;
 
-	CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting));
+	kc = clockid_to_kclock(timr->it_clock);
+	if (WARN_ON_ONCE(!kc || !kc->timer_get))
+		ret = -EINVAL;
+	else
+		kc->timer_get(timr, &cur_setting);
 
 	unlock_timer(timr, flags);
 
-	if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+	if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
 		return -EFAULT;
 
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v0.10.2


From 6761c6702e2c647582e1829abe8cf90794f61d9d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:52:07 +0000
Subject: posix-timers: Convert timer_delete() to clockid_to_kclock()

Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks
and use the new decoding function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134419.198999420@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index c1e2636..ade7dec 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -254,6 +254,7 @@ static __init int init_posix_timers(void)
 		.timer_create	= common_timer_create,
 		.timer_set	= common_timer_set,
 		.timer_get	= common_timer_get,
+		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic = {
 		.clock_getres	= hrtimer_get_res,
@@ -263,6 +264,7 @@ static __init int init_posix_timers(void)
 		.timer_create	= common_timer_create,
 		.timer_set	= common_timer_set,
 		.timer_get	= common_timer_get,
+		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic_raw = {
 		.clock_getres	= hrtimer_get_res,
@@ -859,7 +861,7 @@ retry:
 	return error;
 }
 
-static inline int common_timer_del(struct k_itimer *timer)
+static int common_timer_del(struct k_itimer *timer)
 {
 	timer->it.real.interval.tv64 = 0;
 
@@ -870,7 +872,11 @@ static inline int common_timer_del(struct k_itimer *timer)
 
 static inline int timer_delete_hook(struct k_itimer *timer)
 {
-	return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer));
+	struct k_clock *kc = clockid_to_kclock(timer->it_clock);
+
+	if (WARN_ON_ONCE(!kc || !kc->timer_del))
+		return -EINVAL;
+	return kc->timer_del(timer);
 }
 
 /* Delete a POSIX.1b interval timer. */
-- 
cgit v0.10.2


From 0aa3975f02ce78f27be3076fbfa3d94ae5a659d5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:52:09 +0000
Subject: posix-timers: Remove CLOCK_DISPATCH leftovers

All users gone. Remove the cruft.

Huge thanks to Richard Cochran who tackled that maze first.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134419.294620613@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ade7dec..ad154df 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -167,28 +167,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
 	spin_unlock_irqrestore(&timr->it_lock, flags);
 }
 
-/*
- * Call the k_clock hook function if non-null, or the default function.
- */
-#define CLOCK_DISPATCH(clock, call, arglist) \
- 	((clock) < 0 ? posix_cpu_##call arglist : \
- 	 (posix_clocks[clock].call != NULL \
- 	  ? (*posix_clocks[clock].call) arglist : common_##call arglist))
-
-/*
- * Return nonzero if we know a priori this clockid_t value is bogus.
- */
-static inline int invalid_clockid(const clockid_t which_clock)
-{
-	if (which_clock < 0)	/* CPU clock, posix_cpu_* will check it */
-		return 0;
-	if ((unsigned) which_clock >= MAX_CLOCKS)
-		return 1;
-	if (posix_clocks[which_clock].clock_getres != NULL)
-		return 0;
-	return 1;
-}
-
 /* Get clock_realtime */
 static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
 {
-- 
cgit v0.10.2


From bc2c8ea483d73e95fc88f1fc9e7755180f89b892 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Feb 2011 13:52:12 +0000
Subject: posix-timers: Make posix-cpu-timers functions static

All functions are accessed via clock_posix_cpu now. So make them static.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134419.389755466@linutronix.de>

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index ef574d1..8206255 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -91,18 +91,6 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
 
-int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *ts);
-int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *ts);
-int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *ts);
-int posix_cpu_timer_create(struct k_itimer *timer);
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-		     struct timespec *rqtp, struct timespec __user *rmtp);
-long posix_cpu_nsleep_restart(struct restart_block *restart_block);
-int posix_cpu_timer_set(struct k_itimer *timer, int flags,
-			struct itimerspec *new, struct itimerspec *old);
-int posix_cpu_timer_del(struct k_itimer *timer);
-void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp);
-
 void posix_cpu_timer_schedule(struct k_itimer *timer);
 
 void run_posix_cpu_timers(struct task_struct *task);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 504fbab..609e187 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
 	return p->utime;
 }
 
-int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int
+posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
 	int error = check_clock(which_clock);
 	if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 	return error;
 }
 
-int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+static int
+posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
 {
 	/*
 	 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 }
 
 
-int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 {
 	const pid_t pid = CPUCLOCK_PID(which_clock);
 	int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
  * This is called from sys_timer_create() and do_cpu_nanosleep() with the
  * new timer already all-zeros initialized.
  */
-int posix_cpu_timer_create(struct k_itimer *new_timer)
+static int posix_cpu_timer_create(struct k_itimer *new_timer)
 {
 	int ret = 0;
 	const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
  * If we return TIMER_RETRY, it's necessary to release the timer's lock
  * and try again.  (This happens when the timer is in the middle of firing.)
  */
-int posix_cpu_timer_del(struct k_itimer *timer)
+static int posix_cpu_timer_del(struct k_itimer *timer)
 {
 	struct task_struct *p = timer->it.cpu.task;
 	int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
  * If we return TIMER_RETRY, it's necessary to release the timer's lock
  * and try again.  (This happens when the timer is in the middle of firing.)
  */
-int posix_cpu_timer_set(struct k_itimer *timer, int flags,
-			struct itimerspec *new, struct itimerspec *old)
+static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
+			       struct itimerspec *new, struct itimerspec *old)
 {
 	struct task_struct *p = timer->it.cpu.task;
 	union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	return ret;
 }
 
-void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 {
 	union cpu_time_count now;
 	struct task_struct *p = timer->it.cpu.task;
@@ -1481,8 +1483,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 	return error;
 }
 
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-		     struct timespec *rqtp, struct timespec __user *rmtp)
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
+
+static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+			    struct timespec *rqtp, struct timespec __user *rmtp)
 {
 	struct restart_block *restart_block =
 		&current_thread_info()->restart_block;
@@ -1517,7 +1521,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 	return error;
 }
 
-long posix_cpu_nsleep_restart(struct restart_block *restart_block)
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->nanosleep.index;
 	struct timespec t;
@@ -1542,7 +1546,6 @@ long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 
 }
 
-
 #define PROCESS_CLOCK	MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
 #define THREAD_CLOCK	MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
 
-- 
cgit v0.10.2


From 0061748dd2400d0bcd4d49d258db5d7b5d106ca0 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 1 Feb 2011 13:52:15 +0000
Subject: posix-timer: Update comment

Pick the cleanup to the comment in posix-timers.c from Richards all in
one conversion patch.

Originally-from: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20110201134419.487708516@linutronix.de>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ad154df..a3fdfd4 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -102,11 +102,7 @@ static DEFINE_SPINLOCK(idr_lock);
 /*
  * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
  *	    to implement others.  This structure defines the various
- *	    clocks and allows the possibility of adding others.	 We
- *	    provide an interface to add clocks to the table and expect
- *	    the "arch" code to add at least one clock that is high
- *	    resolution.	 Here we define the standard CLOCK_REALTIME as a
- *	    1/HZ resolution clock.
+ *	    clocks.
  *
  * RESOLUTION: Clock resolution is used to round up timer and interval
  *	    times, NOT to report clock times, which are reported with as
@@ -116,20 +112,13 @@ static DEFINE_SPINLOCK(idr_lock);
  *	    necessary code is written.	The standard says we should say
  *	    something about this issue in the documentation...
  *
- * FUNCTIONS: The CLOCKs structure defines possible functions to handle
- *	    various clock functions.  For clocks that use the standard
- *	    system timer code these entries should be NULL.  This will
- *	    allow dispatch without the overhead of indirect function
- *	    calls.  CLOCKS that depend on other sources (e.g. WWV or GPS)
- *	    must supply functions here, even if the function just returns
- *	    ENOSYS.  The standard POSIX timer management code assumes the
- *	    following: 1.) The k_itimer struct (sched.h) is used for the
- *	    timer.  2.) The list, it_lock, it_clock, it_id and it_pid
- *	    fields are not modified by timer code.
+ * FUNCTIONS: The CLOCKs structure defines possible functions to
+ *	    handle various clock functions.
  *
- *          At this time all functions EXCEPT clock_nanosleep can be
- *          redirected by the CLOCKS structure.  Clock_nanosleep is in
- *          there, but the code ignores it.
+ *	    The standard POSIX timer management code assumes the
+ *	    following: 1.) The k_itimer struct (sched.h) is used for
+ *	    the timer.  2.) The list, it_lock, it_clock, it_id and
+ *	    it_pid fields are not modified by timer code.
  *
  * Permissions: It is assumed that the clock_settime() function defined
  *	    for each clock will take care of permission checks.	 Some
-- 
cgit v0.10.2


From c528f7c6c208f1fae6b4025957173dec045e5f21 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 1 Feb 2011 13:52:17 +0000
Subject: time: Introduce timekeeping_inject_offset

This adds a kernel-internal timekeeping interface to add or subtract
a fixed amount from CLOCK_REALTIME. This makes it so kernel users or
interfaces trying to do so do not have to read the time, then add an
offset and then call settimeofday(), which adds some extra error in
comparision to just simply adding the offset in the kernel timekeeping
core.

Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
LKML-Reference: <20110201134419.584311693@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/time.h b/include/linux/time.h
index 7c44e77..379b903 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -166,6 +166,7 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 extern int timekeeping_valid_for_hres(void);
 extern u64 timekeeping_max_deferment(void);
 extern void timekeeping_leap_insert(int leapsecond);
+extern int timekeeping_inject_offset(struct timespec *ts);
 
 struct tms;
 extern void do_sys_times(struct tms *);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4f9f65b..6262c1d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -387,6 +387,42 @@ int do_settimeofday(const struct timespec *tv)
 
 EXPORT_SYMBOL(do_settimeofday);
 
+
+/**
+ * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @tv:		pointer to the timespec variable containing the offset
+ *
+ * Adds or subtracts an offset value from the current time.
+ */
+int timekeeping_inject_offset(struct timespec *ts)
+{
+	unsigned long flags;
+
+	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	timekeeping_forward_now();
+
+	xtime = timespec_add(xtime, *ts);
+	wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
+
+	timekeeper.ntp_error = 0;
+	ntp_clear();
+
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+				timekeeper.mult);
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	/* signal hrtimers about time change */
+	clock_was_set();
+
+	return 0;
+}
+EXPORT_SYMBOL(timekeeping_inject_offset);
+
 /**
  * change_clocksource - Swaps clocksources if a new one is available
  *
-- 
cgit v0.10.2


From 094aa1881fdc1b8889b442eb3511b31f3ec2b762 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 1 Feb 2011 13:52:20 +0000
Subject: ntp: Add ADJ_SETOFFSET mode bit

This patch adds a new mode bit into the timex structure. When set, the bit
instructs the kernel to add the given time value to the current time.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Acked-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20110201134320.688829863@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/timex.h b/include/linux/timex.h
index d23999f..aa60fe7 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -73,7 +73,7 @@ struct timex {
 	long tolerance;		/* clock frequency tolerance (ppm)
 				 * (read only)
 				 */
-	struct timeval time;	/* (read only) */
+	struct timeval time;	/* (read only, except for ADJ_SETOFFSET) */
 	long tick;		/* (modified) usecs between clock ticks */
 
 	long ppsfreq;           /* pps frequency (scaled ppm) (ro) */
@@ -102,6 +102,7 @@ struct timex {
 #define ADJ_STATUS		0x0010	/* clock status */
 #define ADJ_TIMECONST		0x0020	/* pll time constant */
 #define ADJ_TAI			0x0080	/* set TAI offset */
+#define ADJ_SETOFFSET		0x0100  /* add 'time' to current time */
 #define ADJ_MICRO		0x1000	/* select microsecond resolution */
 #define ADJ_NANO		0x2000	/* select nanosecond resolution */
 #define ADJ_TICK		0x4000	/* tick value */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index ed8cfdf..5ac5932 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -648,6 +648,17 @@ int do_adjtimex(struct timex *txc)
 			hrtimer_cancel(&leap_timer);
 	}
 
+	if (txc->modes & ADJ_SETOFFSET) {
+		struct timespec delta;
+		if ((unsigned long)txc->time.tv_usec >= NSEC_PER_SEC)
+			return -EINVAL;
+		delta.tv_sec  = txc->time.tv_sec;
+		delta.tv_nsec = txc->time.tv_usec;
+		if (!(txc->modes & ADJ_NANO))
+			delta.tv_nsec *= 1000;
+		timekeeping_inject_offset(&delta);
+	}
+
 	getnstimeofday(&ts);
 
 	write_seqlock_irq(&xtime_lock);
-- 
cgit v0.10.2


From 65f5d80bdf83ec0d7f3887db10153bf3f36ed73c Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 1 Feb 2011 13:52:23 +0000
Subject: time: Splitout compat timex accessors

Split out the compat timex accessors into separate
functions. Preparatory patch for a new syscall.

[ tglx: Split that patch from Richards "posix-timers: Introduce a
  	syscall for clock tuning.". Keeps the changes strictly
  	separate ]

Originally-from: Richard Cochran <richardcochran@gmail.com>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <20110201134419.772343089@linutronix.de>

diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0..449e853 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
 		put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
 }
 
+static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
+{
+	memset(txc, 0, sizeof(struct timex));
+
+	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+			__get_user(txc->modes, &utp->modes) ||
+			__get_user(txc->offset, &utp->offset) ||
+			__get_user(txc->freq, &utp->freq) ||
+			__get_user(txc->maxerror, &utp->maxerror) ||
+			__get_user(txc->esterror, &utp->esterror) ||
+			__get_user(txc->status, &utp->status) ||
+			__get_user(txc->constant, &utp->constant) ||
+			__get_user(txc->precision, &utp->precision) ||
+			__get_user(txc->tolerance, &utp->tolerance) ||
+			__get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+			__get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+			__get_user(txc->tick, &utp->tick) ||
+			__get_user(txc->ppsfreq, &utp->ppsfreq) ||
+			__get_user(txc->jitter, &utp->jitter) ||
+			__get_user(txc->shift, &utp->shift) ||
+			__get_user(txc->stabil, &utp->stabil) ||
+			__get_user(txc->jitcnt, &utp->jitcnt) ||
+			__get_user(txc->calcnt, &utp->calcnt) ||
+			__get_user(txc->errcnt, &utp->errcnt) ||
+			__get_user(txc->stbcnt, &utp->stbcnt))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
+{
+	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+			__put_user(txc->modes, &utp->modes) ||
+			__put_user(txc->offset, &utp->offset) ||
+			__put_user(txc->freq, &utp->freq) ||
+			__put_user(txc->maxerror, &utp->maxerror) ||
+			__put_user(txc->esterror, &utp->esterror) ||
+			__put_user(txc->status, &utp->status) ||
+			__put_user(txc->constant, &utp->constant) ||
+			__put_user(txc->precision, &utp->precision) ||
+			__put_user(txc->tolerance, &utp->tolerance) ||
+			__put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+			__put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+			__put_user(txc->tick, &utp->tick) ||
+			__put_user(txc->ppsfreq, &utp->ppsfreq) ||
+			__put_user(txc->jitter, &utp->jitter) ||
+			__put_user(txc->shift, &utp->shift) ||
+			__put_user(txc->stabil, &utp->stabil) ||
+			__put_user(txc->jitcnt, &utp->jitcnt) ||
+			__put_user(txc->calcnt, &utp->calcnt) ||
+			__put_user(txc->errcnt, &utp->errcnt) ||
+			__put_user(txc->stbcnt, &utp->stbcnt) ||
+			__put_user(txc->tai, &utp->tai))
+		return -EFAULT;
+	return 0;
+}
+
 asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
 		struct timezone __user *tz)
 {
@@ -951,58 +1009,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 {
 	struct timex txc;
-	int ret;
+	int err, ret;
 
-	memset(&txc, 0, sizeof(struct timex));
-
-	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
-			__get_user(txc.modes, &utp->modes) ||
-			__get_user(txc.offset, &utp->offset) ||
-			__get_user(txc.freq, &utp->freq) ||
-			__get_user(txc.maxerror, &utp->maxerror) ||
-			__get_user(txc.esterror, &utp->esterror) ||
-			__get_user(txc.status, &utp->status) ||
-			__get_user(txc.constant, &utp->constant) ||
-			__get_user(txc.precision, &utp->precision) ||
-			__get_user(txc.tolerance, &utp->tolerance) ||
-			__get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-			__get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-			__get_user(txc.tick, &utp->tick) ||
-			__get_user(txc.ppsfreq, &utp->ppsfreq) ||
-			__get_user(txc.jitter, &utp->jitter) ||
-			__get_user(txc.shift, &utp->shift) ||
-			__get_user(txc.stabil, &utp->stabil) ||
-			__get_user(txc.jitcnt, &utp->jitcnt) ||
-			__get_user(txc.calcnt, &utp->calcnt) ||
-			__get_user(txc.errcnt, &utp->errcnt) ||
-			__get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
+	err = compat_get_timex(&txc, utp);
+	if (err)
+		return err;
 
 	ret = do_adjtimex(&txc);
 
-	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
-			__put_user(txc.modes, &utp->modes) ||
-			__put_user(txc.offset, &utp->offset) ||
-			__put_user(txc.freq, &utp->freq) ||
-			__put_user(txc.maxerror, &utp->maxerror) ||
-			__put_user(txc.esterror, &utp->esterror) ||
-			__put_user(txc.status, &utp->status) ||
-			__put_user(txc.constant, &utp->constant) ||
-			__put_user(txc.precision, &utp->precision) ||
-			__put_user(txc.tolerance, &utp->tolerance) ||
-			__put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-			__put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-			__put_user(txc.tick, &utp->tick) ||
-			__put_user(txc.ppsfreq, &utp->ppsfreq) ||
-			__put_user(txc.jitter, &utp->jitter) ||
-			__put_user(txc.shift, &utp->shift) ||
-			__put_user(txc.stabil, &utp->stabil) ||
-			__put_user(txc.jitcnt, &utp->jitcnt) ||
-			__put_user(txc.calcnt, &utp->calcnt) ||
-			__put_user(txc.errcnt, &utp->errcnt) ||
-			__put_user(txc.stbcnt, &utp->stbcnt) ||
-			__put_user(txc.tai, &utp->tai))
-		ret = -EFAULT;
+	err = compat_put_timex(utp, &txc);
+	if (err)
+		return err;
 
 	return ret;
 }
-- 
cgit v0.10.2


From f1f1d5ebd10ffa4242bce7a90a56a222d6b7bc77 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 1 Feb 2011 13:52:26 +0000
Subject: posix-timers: Introduce a syscall for clock tuning.

A new syscall is introduced that allows tuning of a POSIX clock. The
new call, clock_adjtime, takes two parameters, the clock ID and a
pointer to a struct timex. Any ADJTIMEX(2) operation may be requested
via this system call, but various POSIX clocks may or may not support
tuning.

[ tglx: Adapted to the posix-timer cleanup series. Avoid copy_to_user
  	in the error case ]

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Acked-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20110201134419.869804645@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 8206255..79a1cea 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/timex.h>
 
 union cpu_time_count {
 	cputime_t cpu;
@@ -71,6 +72,7 @@ struct k_clock {
 	int (*clock_set) (const clockid_t which_clock,
 			  const struct timespec *tp);
 	int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
+	int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
 	int (*timer_create) (struct k_itimer *timer);
 	int (*nsleep) (const clockid_t which_clock, int flags,
 		       struct timespec *, struct timespec __user *);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 18cd068..bfacab9 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -311,6 +311,8 @@ asmlinkage long sys_clock_settime(clockid_t which_clock,
 				const struct timespec __user *tp);
 asmlinkage long sys_clock_gettime(clockid_t which_clock,
 				struct timespec __user *tp);
+asmlinkage long sys_clock_adjtime(clockid_t which_clock,
+				struct timex __user *tx);
 asmlinkage long sys_clock_getres(clockid_t which_clock,
 				struct timespec __user *tp);
 asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
diff --git a/kernel/compat.c b/kernel/compat.c
index 449e853..38b1d2c 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -675,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
 	return err;
 }
 
+long compat_sys_clock_adjtime(clockid_t which_clock,
+		struct compat_timex __user *utp)
+{
+	struct timex txc;
+	mm_segment_t oldfs;
+	int err, ret;
+
+	err = compat_get_timex(&txc, utp);
+	if (err)
+		return err;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
+	set_fs(oldfs);
+
+	err = compat_put_timex(utp, &txc);
+	if (err)
+		return err;
+
+	return ret;
+}
+
 long compat_sys_clock_getres(clockid_t which_clock,
 		struct compat_timespec __user *tp)
 {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a3fdfd4..5a5a4f1 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -170,6 +170,12 @@ static int posix_clock_realtime_set(const clockid_t which_clock,
 	return do_sys_settimeofday(tp, NULL);
 }
 
+static int posix_clock_realtime_adj(const clockid_t which_clock,
+				    struct timex *t)
+{
+	return do_adjtimex(t);
+}
+
 /*
  * Get monotonic time for posix timers
  */
@@ -216,6 +222,7 @@ static __init int init_posix_timers(void)
 		.clock_getres	= hrtimer_get_res,
 		.clock_get	= posix_clock_realtime_get,
 		.clock_set	= posix_clock_realtime_set,
+		.clock_adj	= posix_clock_realtime_adj,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
 		.timer_create	= common_timer_create,
@@ -948,6 +955,29 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 	return error;
 }
 
+SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
+		struct timex __user *, utx)
+{
+	struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timex ktx;
+	int err;
+
+	if (!kc)
+		return -EINVAL;
+	if (!kc->clock_adj)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&ktx, utx, sizeof(ktx)))
+		return -EFAULT;
+
+	err = kc->clock_adj(which_clock, &ktx);
+
+	if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
+		return -EFAULT;
+
+	return err;
+}
+
 SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 		struct timespec __user *, tp)
 {
-- 
cgit v0.10.2


From ce26efdefa5e8f22d933df72d7f7482725091d6d Mon Sep 17 00:00:00 2001
From: Richard Cochran <richard.cochran@omicron.at>
Date: Tue, 1 Feb 2011 13:52:30 +0000
Subject: x86: Add clock_adjtime for x86

This patch adds the clock_adjtime system call to the x86 architecture.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Acked-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20110201134419.968905083@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99..0ed7896 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -851,4 +851,5 @@ ia32_sys_call_table:
 	.quad sys_fanotify_init
 	.quad sys32_fanotify_mark
 	.quad sys_prlimit64		/* 340 */
+	.quad compat_sys_clock_adjtime
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index b766a5e..b6f73f1 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -346,10 +346,11 @@
 #define __NR_fanotify_init	338
 #define __NR_fanotify_mark	339
 #define __NR_prlimit64		340
+#define __NR_clock_adjtime	341
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 341
+#define NR_syscalls 342
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 363e9b8..5ee3085 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -669,6 +669,8 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
 __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
 #define __NR_prlimit64				302
 __SYSCALL(__NR_prlimit64, sys_prlimit64)
+#define __NR_clock_adjtime			303
+__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index b35786d..68c7b9a 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,3 +340,4 @@ ENTRY(sys_call_table)
 	.long sys_fanotify_init
 	.long sys_fanotify_mark
 	.long sys_prlimit64		/* 340 */
+	.long sys_clock_adjtime
-- 
cgit v0.10.2


From 81e294cba2596f5f10848bbe19d98b344c2a2d5c Mon Sep 17 00:00:00 2001
From: Richard Cochran <richard.cochran@omicron.at>
Date: Tue, 1 Feb 2011 13:52:32 +0000
Subject: posix-timers: Add support for fd based clocks

Extend the negative clockids which are currently used by posix cpu
timers to encode the PID with a file descriptor based type which
encodes the fd in the upper bits.

Originally-from: Richard Cochran <richard.cochran@omicron.at>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20110201134420.062860200@linutronix.de>

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 79a1cea..88b9256 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -18,6 +18,17 @@ struct cpu_timer_list {
 	int firing;
 };
 
+/*
+ * Bit fields within a clockid:
+ *
+ * The most significant 29 bits hold either a pid or a file descriptor.
+ *
+ * Bit 2 indicates whether a cpu clock refers to a thread or a process.
+ *
+ * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
+ *
+ * A clockid is invalid if bits 2, 1, and 0 are all set.
+ */
 #define CPUCLOCK_PID(clock)		((pid_t) ~((clock) >> 3))
 #define CPUCLOCK_PERTHREAD(clock) \
 	(((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
@@ -29,6 +40,8 @@ struct cpu_timer_list {
 #define CPUCLOCK_VIRT		1
 #define CPUCLOCK_SCHED		2
 #define CPUCLOCK_MAX		3
+#define CLOCKFD			CPUCLOCK_MAX
+#define CLOCKFD_MASK		(CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
 
 #define MAKE_PROCESS_CPUCLOCK(pid, clock) \
 	((~(clockid_t) (pid) << 3) | (clockid_t) (clock))
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5a5a4f1..df629d8 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -488,7 +488,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 static struct k_clock *clockid_to_kclock(const clockid_t id)
 {
 	if (id < 0)
-		return &clock_posix_cpu;
+		return (id & CLOCKFD_MASK) == CLOCKFD ? NULL : &clock_posix_cpu;
 
 	if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
 		return NULL;
-- 
cgit v0.10.2


From 527087374faa488776a789375a7d6ea74fda6f71 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 2 Feb 2011 12:10:09 +0100
Subject: posix-timers: Cleanup namespace

Rename register_posix_clock() to posix_timers_register_clock(). That's
what the function really does. As a side effect this cleans up the
posix_clock namespace for the upcoming dynamic posix_clock
infrastructure.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Richard Cochran <richard.cochran@omicron.at>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <alpine.LFD.2.00.1102021222240.31804@localhost6.localdomain6>

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index ff41eb3..33dc229 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -840,7 +840,7 @@ static int __init mmtimer_init(void)
 	}
 
 	sgi_clock_period = NSEC_PER_SEC / sn_rtc_cycles_per_second;
-	register_posix_clock(CLOCK_SGI_CYCLE, &sgi_clock);
+	posix_timers_register_clock(CLOCK_SGI_CYCLE, &sgi_clock);
 
 	printk(KERN_INFO "%s: v%s, %ld MHz\n", MMTIMER_DESC, MMTIMER_VERSION,
 	       sn_rtc_cycles_per_second/(unsigned long)1E6);
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 88b9256..9d6ffe2 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -101,7 +101,7 @@ struct k_clock {
 
 extern struct k_clock clock_posix_cpu;
 
-void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
+void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock);
 
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 609e187..67fea9d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1618,8 +1618,8 @@ static __init int init_posix_cpu_timers(void)
 	};
 	struct timespec ts;
 
-	register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
-	register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+	posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
+	posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
 
 	cputime_to_timespec(cputime_one_jiffy, &ts);
 	onecputick = ts.tv_nsec;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index df629d8..af936fd 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -253,11 +253,11 @@ static __init int init_posix_timers(void)
 		.clock_get	= posix_get_monotonic_coarse,
 	};
 
-	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
-	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
-	register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
-	register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
-	register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
+	posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
+	posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
+	posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+	posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+	posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -433,7 +433,8 @@ static struct pid *good_sigevent(sigevent_t * event)
 	return task_pid(rtn);
 }
 
-void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
+void posix_timers_register_clock(const clockid_t clock_id,
+				 struct k_clock *new_clock)
 {
 	if ((unsigned) clock_id >= MAX_CLOCKS) {
 		printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
@@ -454,7 +455,7 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
 
 	posix_clocks[clock_id] = *new_clock;
 }
-EXPORT_SYMBOL_GPL(register_posix_clock);
+EXPORT_SYMBOL_GPL(posix_timers_register_clock);
 
 static struct k_itimer * alloc_posix_timer(void)
 {
-- 
cgit v0.10.2


From 0606f422b453f76c31ab2b1bd52943ff06a2dcf2 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 1 Feb 2011 13:52:35 +0000
Subject: posix clocks: Introduce dynamic clocks

This patch adds support for adding and removing posix clocks. The
clock lifetime cycle is patterned after usb devices. Each clock is
represented by a standard character device. In addition, the driver
may optionally implement custom character device operations.

The posix clock and timer system calls listed below now work with
dynamic posix clocks, as well as the traditional static clocks.
The following system calls are affected:

   - clock_adjtime (brand new syscall)
   - clock_gettime
   - clock_getres
   - clock_settime
   - timer_create
   - timer_delete
   - timer_gettime
   - timer_settime

[ tglx: Adapted to the posix-timer cleanup. Moved clock_posix_dynamic
  	to posix-clock.c and made all referenced functions static ]

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Acked-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20110201134420.164172635@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h
new file mode 100644
index 0000000..369e19d
--- /dev/null
+++ b/include/linux/posix-clock.h
@@ -0,0 +1,150 @@
+/*
+ * posix-clock.h - support for dynamic clock devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _LINUX_POSIX_CLOCK_H_
+#define _LINUX_POSIX_CLOCK_H_
+
+#include <linux/cdev.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/posix-timers.h>
+
+struct posix_clock;
+
+/**
+ * struct posix_clock_operations - functional interface to the clock
+ *
+ * Every posix clock is represented by a character device. Drivers may
+ * optionally offer extended capabilities by implementing the
+ * character device methods. The character device file operations are
+ * first handled by the clock device layer, then passed on to the
+ * driver by calling these functions.
+ *
+ * @owner:          The clock driver should set to THIS_MODULE
+ * @clock_adjtime:  Adjust the clock
+ * @clock_gettime:  Read the current time
+ * @clock_getres:   Get the clock resolution
+ * @clock_settime:  Set the current time value
+ * @timer_create:   Create a new timer
+ * @timer_delete:   Remove a previously created timer
+ * @timer_gettime:  Get remaining time and interval of a timer
+ * @timer_setttime: Set a timer's initial expiration and interval
+ * @fasync:         Optional character device fasync method
+ * @mmap:           Optional character device mmap method
+ * @open:           Optional character device open method
+ * @release:        Optional character device release method
+ * @ioctl:          Optional character device ioctl method
+ * @read:           Optional character device read method
+ * @poll:           Optional character device poll method
+ */
+struct posix_clock_operations {
+	struct module *owner;
+
+	int  (*clock_adjtime)(struct posix_clock *pc, struct timex *tx);
+
+	int  (*clock_gettime)(struct posix_clock *pc, struct timespec *ts);
+
+	int  (*clock_getres) (struct posix_clock *pc, struct timespec *ts);
+
+	int  (*clock_settime)(struct posix_clock *pc,
+			      const struct timespec *ts);
+
+	int  (*timer_create) (struct posix_clock *pc, struct k_itimer *kit);
+
+	int  (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit);
+
+	void (*timer_gettime)(struct posix_clock *pc,
+			      struct k_itimer *kit, struct itimerspec *tsp);
+
+	int  (*timer_settime)(struct posix_clock *pc,
+			      struct k_itimer *kit, int flags,
+			      struct itimerspec *tsp, struct itimerspec *old);
+	/*
+	 * Optional character device methods:
+	 */
+	int     (*fasync)  (struct posix_clock *pc,
+			    int fd, struct file *file, int on);
+
+	long    (*ioctl)   (struct posix_clock *pc,
+			    unsigned int cmd, unsigned long arg);
+
+	int     (*mmap)    (struct posix_clock *pc,
+			    struct vm_area_struct *vma);
+
+	int     (*open)    (struct posix_clock *pc, fmode_t f_mode);
+
+	uint    (*poll)    (struct posix_clock *pc,
+			    struct file *file, poll_table *wait);
+
+	int     (*release) (struct posix_clock *pc);
+
+	ssize_t (*read)    (struct posix_clock *pc,
+			    uint flags, char __user *buf, size_t cnt);
+};
+
+/**
+ * struct posix_clock - represents a dynamic posix clock
+ *
+ * @ops:     Functional interface to the clock
+ * @cdev:    Character device instance for this clock
+ * @kref:    Reference count.
+ * @mutex:   Protects the 'zombie' field from concurrent access.
+ * @zombie:  If 'zombie' is true, then the hardware has disappeared.
+ * @release: A function to free the structure when the reference count reaches
+ *           zero. May be NULL if structure is statically allocated.
+ *
+ * Drivers should embed their struct posix_clock within a private
+ * structure, obtaining a reference to it during callbacks using
+ * container_of().
+ */
+struct posix_clock {
+	struct posix_clock_operations ops;
+	struct cdev cdev;
+	struct kref kref;
+	struct mutex mutex;
+	bool zombie;
+	void (*release)(struct posix_clock *clk);
+};
+
+/**
+ * posix_clock_register() - register a new clock
+ * @clk:   Pointer to the clock. Caller must provide 'ops' and 'release'
+ * @devid: Allocated device id
+ *
+ * A clock driver calls this function to register itself with the
+ * clock device subsystem. If 'clk' points to dynamically allocated
+ * memory, then the caller must provide a 'release' function to free
+ * that memory.
+ *
+ * Returns zero on success, non-zero otherwise.
+ */
+int posix_clock_register(struct posix_clock *clk, dev_t devid);
+
+/**
+ * posix_clock_unregister() - unregister a clock
+ * @clk: Clock instance previously registered via posix_clock_register()
+ *
+ * A clock driver calls this function to remove itself from the clock
+ * device subsystem. The posix_clock itself will remain (in an
+ * inactive state) until its reference count drops to zero, at which
+ * point it will be deallocated with its 'release' method.
+ */
+void posix_clock_unregister(struct posix_clock *clk);
+
+#endif
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 9d6ffe2..d51243a 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -32,7 +32,7 @@ struct cpu_timer_list {
 #define CPUCLOCK_PID(clock)		((pid_t) ~((clock) >> 3))
 #define CPUCLOCK_PERTHREAD(clock) \
 	(((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
-#define CPUCLOCK_PID_MASK	7
+
 #define CPUCLOCK_PERTHREAD_MASK	4
 #define CPUCLOCK_WHICH(clock)	((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
 #define CPUCLOCK_CLOCK_MASK	3
@@ -48,6 +48,9 @@ struct cpu_timer_list {
 #define MAKE_THREAD_CPUCLOCK(tid, clock) \
 	MAKE_PROCESS_CPUCLOCK((tid), (clock) | CPUCLOCK_PERTHREAD_MASK)
 
+#define FD_TO_CLOCKID(fd)	((~(clockid_t) (fd) << 3) | CLOCKFD)
+#define CLOCKID_TO_FD(clk)	((unsigned int) ~((clk) >> 3))
+
 /* POSIX.1b interval timer structure. */
 struct k_itimer {
 	struct list_head list;		/* free/ allocate list */
@@ -100,6 +103,7 @@ struct k_clock {
 };
 
 extern struct k_clock clock_posix_cpu;
+extern struct k_clock clock_posix_dynamic;
 
 void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock);
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index af936fd..44fcff1 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/idr.h>
+#include <linux/posix-clock.h>
 #include <linux/posix-timers.h>
 #include <linux/syscalls.h>
 #include <linux/wait.h>
@@ -489,7 +490,8 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 static struct k_clock *clockid_to_kclock(const clockid_t id)
 {
 	if (id < 0)
-		return (id & CLOCKFD_MASK) == CLOCKFD ? NULL : &clock_posix_cpu;
+		return (id & CLOCKFD_MASK) == CLOCKFD ?
+			&clock_posix_dynamic : &clock_posix_cpu;
 
 	if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
 		return NULL;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee26662..b042599 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
-obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
+obj-y += timeconv.o posix-clock.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)		+= clockevents.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS)		+= tick-common.o
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 0000000..04498cb
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,441 @@
+/*
+ * posix-clock.c - support for dynamic clock devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/posix-clock.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+static void delete_clock(struct kref *kref);
+
+/*
+ * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
+ */
+static struct posix_clock *get_posix_clock(struct file *fp)
+{
+	struct posix_clock *clk = fp->private_data;
+
+	mutex_lock(&clk->mutex);
+
+	if (!clk->zombie)
+		return clk;
+
+	mutex_unlock(&clk->mutex);
+
+	return NULL;
+}
+
+static void put_posix_clock(struct posix_clock *clk)
+{
+	mutex_unlock(&clk->mutex);
+}
+
+static ssize_t posix_clock_read(struct file *fp, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int err = -EINVAL;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.read)
+		err = clk->ops.read(clk, fp->f_flags, buf, count);
+
+	put_posix_clock(clk);
+
+	return err;
+}
+
+static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int result = 0;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.poll)
+		result = clk->ops.poll(clk, fp, wait);
+
+	put_posix_clock(clk);
+
+	return result;
+}
+
+static int posix_clock_fasync(int fd, struct file *fp, int on)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int err = 0;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.fasync)
+		err = clk->ops.fasync(clk, fd, fp, on);
+
+	put_posix_clock(clk);
+
+	return err;
+}
+
+static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int err = -ENODEV;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.mmap)
+		err = clk->ops.mmap(clk, vma);
+
+	put_posix_clock(clk);
+
+	return err;
+}
+
+static long posix_clock_ioctl(struct file *fp,
+			      unsigned int cmd, unsigned long arg)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int err = -ENOTTY;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.ioctl)
+		err = clk->ops.ioctl(clk, cmd, arg);
+
+	put_posix_clock(clk);
+
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+static long posix_clock_compat_ioctl(struct file *fp,
+				     unsigned int cmd, unsigned long arg)
+{
+	struct posix_clock *clk = get_posix_clock(fp);
+	int err = -ENOTTY;
+
+	if (!clk)
+		return -ENODEV;
+
+	if (clk->ops.ioctl)
+		err = clk->ops.ioctl(clk, cmd, arg);
+
+	put_posix_clock(clk);
+
+	return err;
+}
+#endif
+
+static int posix_clock_open(struct inode *inode, struct file *fp)
+{
+	int err;
+	struct posix_clock *clk =
+		container_of(inode->i_cdev, struct posix_clock, cdev);
+
+	mutex_lock(&clk->mutex);
+
+	if (clk->zombie) {
+		err = -ENODEV;
+		goto out;
+	}
+	if (clk->ops.open)
+		err = clk->ops.open(clk, fp->f_mode);
+	else
+		err = 0;
+
+	if (!err) {
+		kref_get(&clk->kref);
+		fp->private_data = clk;
+	}
+out:
+	mutex_unlock(&clk->mutex);
+	return err;
+}
+
+static int posix_clock_release(struct inode *inode, struct file *fp)
+{
+	struct posix_clock *clk = fp->private_data;
+	int err = 0;
+
+	if (clk->ops.release)
+		err = clk->ops.release(clk);
+
+	kref_put(&clk->kref, delete_clock);
+
+	fp->private_data = NULL;
+
+	return err;
+}
+
+static const struct file_operations posix_clock_file_operations = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.read		= posix_clock_read,
+	.poll		= posix_clock_poll,
+	.unlocked_ioctl	= posix_clock_ioctl,
+	.open		= posix_clock_open,
+	.release	= posix_clock_release,
+	.fasync		= posix_clock_fasync,
+	.mmap		= posix_clock_mmap,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= posix_clock_compat_ioctl,
+#endif
+};
+
+int posix_clock_register(struct posix_clock *clk, dev_t devid)
+{
+	int err;
+
+	kref_init(&clk->kref);
+	mutex_init(&clk->mutex);
+
+	cdev_init(&clk->cdev, &posix_clock_file_operations);
+	clk->cdev.owner = clk->ops.owner;
+	err = cdev_add(&clk->cdev, devid, 1);
+	if (err)
+		goto no_cdev;
+
+	return err;
+no_cdev:
+	mutex_destroy(&clk->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(posix_clock_register);
+
+static void delete_clock(struct kref *kref)
+{
+	struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
+	mutex_destroy(&clk->mutex);
+	if (clk->release)
+		clk->release(clk);
+}
+
+void posix_clock_unregister(struct posix_clock *clk)
+{
+	cdev_del(&clk->cdev);
+
+	mutex_lock(&clk->mutex);
+	clk->zombie = true;
+	mutex_unlock(&clk->mutex);
+
+	kref_put(&clk->kref, delete_clock);
+}
+EXPORT_SYMBOL_GPL(posix_clock_unregister);
+
+struct posix_clock_desc {
+	struct file *fp;
+	struct posix_clock *clk;
+};
+
+static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
+{
+	struct file *fp = fget(CLOCKID_TO_FD(id));
+	int err = -EINVAL;
+
+	if (!fp)
+		return err;
+
+	if (fp->f_op->open != posix_clock_open || !fp->private_data)
+		goto out;
+
+	cd->fp = fp;
+	cd->clk = get_posix_clock(fp);
+
+	err = cd->clk ? 0 : -ENODEV;
+out:
+	if (err)
+		fput(fp);
+	return err;
+}
+
+static void put_clock_desc(struct posix_clock_desc *cd)
+{
+	put_posix_clock(cd->clk);
+	fput(cd->fp);
+}
+
+static int pc_clock_adjtime(clockid_t id, struct timex *tx)
+{
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.clock_adjtime)
+		err = cd.clk->ops.clock_adjtime(cd.clk, tx);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+{
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.clock_gettime)
+		err = cd.clk->ops.clock_gettime(cd.clk, ts);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+static int pc_clock_getres(clockid_t id, struct timespec *ts)
+{
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.clock_getres)
+		err = cd.clk->ops.clock_getres(cd.clk, ts);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+{
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.clock_settime)
+		err = cd.clk->ops.clock_settime(cd.clk, ts);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+static int pc_timer_create(struct k_itimer *kit)
+{
+	clockid_t id = kit->it_clock;
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.timer_create)
+		err = cd.clk->ops.timer_create(cd.clk, kit);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+static int pc_timer_delete(struct k_itimer *kit)
+{
+	clockid_t id = kit->it_clock;
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.timer_delete)
+		err = cd.clk->ops.timer_delete(cd.clk, kit);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+{
+	clockid_t id = kit->it_clock;
+	struct posix_clock_desc cd;
+
+	if (get_clock_desc(id, &cd))
+		return;
+
+	if (cd.clk->ops.timer_gettime)
+		cd.clk->ops.timer_gettime(cd.clk, kit, ts);
+
+	put_clock_desc(&cd);
+}
+
+static int pc_timer_settime(struct k_itimer *kit, int flags,
+			    struct itimerspec *ts, struct itimerspec *old)
+{
+	clockid_t id = kit->it_clock;
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.timer_settime)
+		err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+
+struct k_clock clock_posix_dynamic = {
+	.clock_getres	= pc_clock_getres,
+	.clock_set	= pc_clock_settime,
+	.clock_get	= pc_clock_gettime,
+	.clock_adj	= pc_clock_adjtime,
+	.timer_create	= pc_timer_create,
+	.timer_set	= pc_timer_settime,
+	.timer_del	= pc_timer_delete,
+	.timer_get	= pc_timer_gettime,
+};
-- 
cgit v0.10.2


From fe4b04fa31a6dcf4358aa84cf81e5a7fd079469b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 2 Feb 2011 13:19:09 +0100
Subject: perf: Cure task_oncpu_function_call() races

Oleg reported that on architectures with
__ARCH_WANT_INTERRUPTS_ON_CTXSW the IPI from
task_oncpu_function_call() can land before perf_event_task_sched_in()
and cause interesting situations for eg. perf_install_in_context().

This patch reworks the task_oncpu_function_call() interface to give a
more usable primitive as well as rework all its users to hopefully be
more obvious as well as remove the races.

While looking at the code I also found a number of races against
perf_event_task_sched_out() which can flip contexts between tasks so
plug those too.

Reported-and-reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d747f94..0b40ee3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2578,13 +2578,6 @@ static inline void inc_syscw(struct task_struct *tsk)
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
-/*
- * Call the function if the target task is executing on a CPU right now:
- */
-extern void task_oncpu_function_call(struct task_struct *p,
-				     void (*func) (void *info), void *info);
-
-
 #ifdef CONFIG_MM_OWNER
 extern void mm_update_next_owner(struct mm_struct *mm);
 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 126a302..7d3faa2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,6 +38,79 @@
 
 #include <asm/irq_regs.h>
 
+struct remote_function_call {
+	struct task_struct *p;
+	int (*func)(void *info);
+	void *info;
+	int ret;
+};
+
+static void remote_function(void *data)
+{
+	struct remote_function_call *tfc = data;
+	struct task_struct *p = tfc->p;
+
+	if (p) {
+		tfc->ret = -EAGAIN;
+		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+			return;
+	}
+
+	tfc->ret = tfc->func(tfc->info);
+}
+
+/**
+ * task_function_call - call a function on the cpu on which a task runs
+ * @p:		the task to evaluate
+ * @func:	the function to be called
+ * @info:	the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ *
+ * returns: @func return value, or
+ *	    -ESRCH  - when the process isn't running
+ *	    -EAGAIN - when the process moved away
+ */
+static int
+task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+{
+	struct remote_function_call data = {
+		.p = p,
+		.func = func,
+		.info = info,
+		.ret = -ESRCH, /* No such (running) process */
+	};
+
+	if (task_curr(p))
+		smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+
+	return data.ret;
+}
+
+/**
+ * cpu_function_call - call a function on the cpu
+ * @func:	the function to be called
+ * @info:	the function call argument
+ *
+ * Calls the function @func on the remote cpu.
+ *
+ * returns: @func return value or -ENXIO when the cpu is offline
+ */
+static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+{
+	struct remote_function_call data = {
+		.p = NULL,
+		.func = func,
+		.info = info,
+		.ret = -ENXIO, /* No such CPU */
+	};
+
+	smp_call_function_single(cpu, remote_function, &data, 1);
+
+	return data.ret;
+}
+
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
@@ -254,7 +327,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 	raw_spin_lock_irqsave(&ctx->lock, flags);
 	--ctx->pin_count;
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
-	put_ctx(ctx);
 }
 
 /*
@@ -618,35 +690,24 @@ __get_cpu_context(struct perf_event_context *ctx)
  * We disable the event on the hardware level first. After that we
  * remove it from the context list.
  */
-static void __perf_event_remove_from_context(void *info)
+static int __perf_remove_from_context(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
-
 	raw_spin_lock(&ctx->lock);
-
 	event_sched_out(event, cpuctx, ctx);
-
 	list_del_event(event, ctx);
-
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
 
 
 /*
  * Remove the event from a task's (or a CPU's) list of events.
  *
- * Must be called with ctx->mutex held.
- *
  * CPU events are removed with a smp call. For task events we only
  * call when the task is on a CPU.
  *
@@ -657,49 +718,48 @@ static void __perf_event_remove_from_context(void *info)
  * When called from perf_event_exit_task, it's OK because the
  * context has been detached from its task.
  */
-static void perf_event_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 	struct task_struct *task = ctx->task;
 
+	lockdep_assert_held(&ctx->mutex);
+
 	if (!task) {
 		/*
 		 * Per cpu events are removed via an smp call and
 		 * the removal is always successful.
 		 */
-		smp_call_function_single(event->cpu,
-					 __perf_event_remove_from_context,
-					 event, 1);
+		cpu_function_call(event->cpu, __perf_remove_from_context, event);
 		return;
 	}
 
 retry:
-	task_oncpu_function_call(task, __perf_event_remove_from_context,
-				 event);
+	if (!task_function_call(task, __perf_remove_from_context, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * If the context is active we need to retry the smp call.
+	 * If we failed to find a running task, but find the context active now
+	 * that we've acquired the ctx->lock, retry.
 	 */
-	if (ctx->nr_active && !list_empty(&event->group_entry)) {
+	if (ctx->is_active) {
 		raw_spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
 
 	/*
-	 * The lock prevents that this context is scheduled in so we
-	 * can remove the event safely, if the call above did not
-	 * succeed.
+	 * Since the task isn't running, its safe to remove the event, us
+	 * holding the ctx->lock ensures the task won't get scheduled in.
 	 */
-	if (!list_empty(&event->group_entry))
-		list_del_event(event, ctx);
+	list_del_event(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
 /*
  * Cross CPU call to disable a performance event
  */
-static void __perf_event_disable(void *info)
+static int __perf_event_disable(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -708,9 +768,12 @@ static void __perf_event_disable(void *info)
 	/*
 	 * If this is a per-task event, need to check whether this
 	 * event's task is the current task on this cpu.
+	 *
+	 * Can trigger due to concurrent perf_event_context_sched_out()
+	 * flipping contexts around.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
+		return -EINVAL;
 
 	raw_spin_lock(&ctx->lock);
 
@@ -729,6 +792,8 @@ static void __perf_event_disable(void *info)
 	}
 
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
 
 /*
@@ -753,13 +818,13 @@ void perf_event_disable(struct perf_event *event)
 		/*
 		 * Disable the event on the cpu that it's on
 		 */
-		smp_call_function_single(event->cpu, __perf_event_disable,
-					 event, 1);
+		cpu_function_call(event->cpu, __perf_event_disable, event);
 		return;
 	}
 
 retry:
-	task_oncpu_function_call(task, __perf_event_disable, event);
+	if (!task_function_call(task, __perf_event_disable, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 	/*
@@ -767,6 +832,11 @@ retry:
 	 */
 	if (event->state == PERF_EVENT_STATE_ACTIVE) {
 		raw_spin_unlock_irq(&ctx->lock);
+		/*
+		 * Reload the task pointer, it might have been changed by
+		 * a concurrent perf_event_context_sched_out().
+		 */
+		task = ctx->task;
 		goto retry;
 	}
 
@@ -778,7 +848,6 @@ retry:
 		update_group_times(event);
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -928,12 +997,14 @@ static void add_event_to_ctx(struct perf_event *event,
 	event->tstamp_stopped = tstamp;
 }
 
+static void perf_event_context_sched_in(struct perf_event_context *ctx);
+
 /*
  * Cross CPU call to install and enable a performance event
  *
  * Must be called with ctx->mutex held
  */
-static void __perf_install_in_context(void *info)
+static int  __perf_install_in_context(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -942,17 +1013,12 @@ static void __perf_install_in_context(void *info)
 	int err;
 
 	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 * Or possibly this is the right context but it isn't
-	 * on this cpu because it had no events.
+	 * In case we're installing a new context to an already running task,
+	 * could also happen before perf_event_task_sched_in() on architectures
+	 * which do context switches with IRQs enabled.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
-			return;
-		cpuctx->task_ctx = ctx;
-	}
+	if (ctx->task && !cpuctx->task_ctx)
+		perf_event_context_sched_in(ctx);
 
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
@@ -997,6 +1063,8 @@ static void __perf_install_in_context(void *info)
 
 unlock:
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
 
 /*
@@ -1008,8 +1076,6 @@ unlock:
  * If the event is attached to a task which is on a CPU we use a smp
  * call to enable it in the task context. The task might have been
  * scheduled away, but we check this in the smp call again.
- *
- * Must be called with ctx->mutex held.
  */
 static void
 perf_install_in_context(struct perf_event_context *ctx,
@@ -1018,6 +1084,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
+	lockdep_assert_held(&ctx->mutex);
+
 	event->ctx = ctx;
 
 	if (!task) {
@@ -1025,31 +1093,29 @@ perf_install_in_context(struct perf_event_context *ctx,
 		 * Per cpu events are installed via an smp call and
 		 * the install is always successful.
 		 */
-		smp_call_function_single(cpu, __perf_install_in_context,
-					 event, 1);
+		cpu_function_call(cpu, __perf_install_in_context, event);
 		return;
 	}
 
 retry:
-	task_oncpu_function_call(task, __perf_install_in_context,
-				 event);
+	if (!task_function_call(task, __perf_install_in_context, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * we need to retry the smp call.
+	 * If we failed to find a running task, but find the context active now
+	 * that we've acquired the ctx->lock, retry.
 	 */
-	if (ctx->is_active && list_empty(&event->group_entry)) {
+	if (ctx->is_active) {
 		raw_spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
 
 	/*
-	 * The lock prevents that this context is scheduled in so we
-	 * can add the event safely, if it the call above did not
-	 * succeed.
+	 * Since the task isn't running, its safe to add the event, us holding
+	 * the ctx->lock ensures the task won't get scheduled in.
 	 */
-	if (list_empty(&event->group_entry))
-		add_event_to_ctx(event, ctx);
+	add_event_to_ctx(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -1078,7 +1144,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 /*
  * Cross CPU call to enable a performance event
  */
-static void __perf_event_enable(void *info)
+static int __perf_event_enable(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -1086,18 +1152,10 @@ static void __perf_event_enable(void *info)
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
-	/*
-	 * If this is a per-task event, need to check whether this
-	 * event's task is the current task on this cpu.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
-			return;
-		cpuctx->task_ctx = ctx;
-	}
+	if (WARN_ON_ONCE(!ctx->is_active))
+		return -EINVAL;
 
 	raw_spin_lock(&ctx->lock);
-	ctx->is_active = 1;
 	update_context_time(ctx);
 
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
@@ -1138,6 +1196,8 @@ static void __perf_event_enable(void *info)
 
 unlock:
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
 
 /*
@@ -1158,8 +1218,7 @@ void perf_event_enable(struct perf_event *event)
 		/*
 		 * Enable the event on the cpu that it's on
 		 */
-		smp_call_function_single(event->cpu, __perf_event_enable,
-					 event, 1);
+		cpu_function_call(event->cpu, __perf_event_enable, event);
 		return;
 	}
 
@@ -1178,8 +1237,15 @@ void perf_event_enable(struct perf_event *event)
 		event->state = PERF_EVENT_STATE_OFF;
 
 retry:
+	if (!ctx->is_active) {
+		__perf_event_mark_enabled(event, ctx);
+		goto out;
+	}
+
 	raw_spin_unlock_irq(&ctx->lock);
-	task_oncpu_function_call(task, __perf_event_enable, event);
+
+	if (!task_function_call(task, __perf_event_enable, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 
@@ -1187,15 +1253,14 @@ retry:
 	 * If the context is active and the event is still off,
 	 * we need to retry the cross-call.
 	 */
-	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
+	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
+		/*
+		 * task could have been flipped by a concurrent
+		 * perf_event_context_sched_out()
+		 */
+		task = ctx->task;
 		goto retry;
-
-	/*
-	 * Since we have the lock this context can't be scheduled
-	 * in, so we can change the state safely.
-	 */
-	if (event->state == PERF_EVENT_STATE_OFF)
-		__perf_event_mark_enabled(event, ctx);
+	}
 
 out:
 	raw_spin_unlock_irq(&ctx->lock);
@@ -1339,8 +1404,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	}
 }
 
-void perf_event_context_sched_out(struct task_struct *task, int ctxn,
-				  struct task_struct *next)
+static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+					 struct task_struct *next)
 {
 	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
@@ -1533,7 +1598,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
 {
 	struct perf_cpu_context *cpuctx;
 
-       	cpuctx = __get_cpu_context(ctx);
+	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
 
@@ -1541,7 +1606,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
 	cpuctx->task_ctx = ctx;
 }
 
-void perf_event_context_sched_in(struct perf_event_context *ctx)
+static void perf_event_context_sched_in(struct perf_event_context *ctx)
 {
 	struct perf_cpu_context *cpuctx;
 
@@ -1627,7 +1692,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
 	 * Reduce accuracy by one bit such that @a and @b converge
 	 * to a similar magnitude.
 	 */
-#define REDUCE_FLS(a, b) 		\
+#define REDUCE_FLS(a, b)		\
 do {					\
 	if (a##_fls > b##_fls) {	\
 		a >>= 1;		\
@@ -2213,6 +2278,9 @@ errout:
 
 }
 
+/*
+ * Returns a matching context with refcount and pincount.
+ */
 static struct perf_event_context *
 find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 {
@@ -2237,6 +2305,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
 		get_ctx(ctx);
+		++ctx->pin_count;
 
 		return ctx;
 	}
@@ -2250,6 +2319,7 @@ retry:
 	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		unclone_ctx(ctx);
+		++ctx->pin_count;
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
@@ -2271,8 +2341,10 @@ retry:
 			err = -ESRCH;
 		else if (task->perf_event_ctxp[ctxn])
 			err = -EAGAIN;
-		else
+		else {
+			++ctx->pin_count;
 			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+		}
 		mutex_unlock(&task->perf_event_mutex);
 
 		if (unlikely(err)) {
@@ -5950,10 +6022,10 @@ SYSCALL_DEFINE5(perf_event_open,
 		struct perf_event_context *gctx = group_leader->ctx;
 
 		mutex_lock(&gctx->mutex);
-		perf_event_remove_from_context(group_leader);
+		perf_remove_from_context(group_leader);
 		list_for_each_entry(sibling, &group_leader->sibling_list,
 				    group_entry) {
-			perf_event_remove_from_context(sibling);
+			perf_remove_from_context(sibling);
 			put_ctx(gctx);
 		}
 		mutex_unlock(&gctx->mutex);
@@ -5976,6 +6048,7 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	perf_install_in_context(ctx, event, cpu);
 	++ctx->generation;
+	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
 	event->owner = current;
@@ -6001,6 +6074,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	return event_fd;
 
 err_context:
+	perf_unpin_context(ctx);
 	put_ctx(ctx);
 err_alloc:
 	free_event(event);
@@ -6051,6 +6125,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
 	++ctx->generation;
+	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
 	return event;
@@ -6104,7 +6179,7 @@ __perf_event_exit_task(struct perf_event *child_event,
 {
 	struct perf_event *parent_event;
 
-	perf_event_remove_from_context(child_event);
+	perf_remove_from_context(child_event);
 
 	parent_event = child_event->parent;
 	/*
@@ -6411,7 +6486,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		return 0;
 	}
 
-       	child_ctx = child->perf_event_ctxp[ctxn];
+	child_ctx = child->perf_event_ctxp[ctxn];
 	if (!child_ctx) {
 		/*
 		 * This is executed from the parent task context, so
@@ -6526,6 +6601,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 	mutex_unlock(&parent_ctx->mutex);
 
 	perf_unpin_context(parent_ctx);
+	put_ctx(parent_ctx);
 
 	return ret;
 }
@@ -6595,9 +6671,9 @@ static void __perf_event_exit_context(void *__info)
 	perf_pmu_rotate_stop(ctx->pmu);
 
 	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-		__perf_event_remove_from_context(event);
+		__perf_remove_from_context(event);
 	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
-		__perf_event_remove_from_context(event);
+		__perf_remove_from_context(event);
 }
 
 static void perf_event_exit_cpu_context(int cpu)
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4..31cb5d5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2265,27 +2265,6 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p:		the task to evaluate
- * @func:	the function to be called
- * @info:	the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
-			      void (*func) (void *info), void *info)
-{
-	int cpu;
-
-	preempt_disable();
-	cpu = task_cpu(p);
-	if (task_curr(p))
-		smp_call_function_single(cpu, func, info, 1);
-	preempt_enable();
-}
-
 #ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2776,9 +2755,12 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
+	sched_info_switch(prev, next);
+	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
+	trace_sched_switch(prev, next);
 }
 
 /**
@@ -2911,7 +2893,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	trace_sched_switch(prev, next);
+
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -3989,9 +3971,6 @@ need_resched_nonpreemptible:
 	rq->skip_clock_update = 0;
 
 	if (likely(prev != next)) {
-		sched_info_switch(prev, next);
-		perf_event_task_sched_out(prev, next);
-
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
-- 
cgit v0.10.2


From 725e7580aaf98e9f7b22b8ccfc640ad0c09e2b08 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 1 Feb 2011 09:47:15 -0500
Subject: sched: Check the right ->nr_running in yield_task_fair()

With CONFIG_FAIR_GROUP_SCHED, each task_group has its own cfs_rq.
Yielding to a task from another cfs_rq may be worthwhile, since
a process calling yield typically cannot use the CPU right now.

Therefor, we want to check the per-cpu nr_running, not the
cgroup local one.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201094715.798c4f86@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 55040f3..4de9905 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,7 +1331,7 @@ static void yield_task_fair(struct rq *rq)
 	/*
 	 * Are we the only task in the tree?
 	 */
-	if (unlikely(cfs_rq->nr_running == 1))
+	if (unlikely(rq->nr_running == 1))
 		return;
 
 	clear_buddies(cfs_rq, se);
-- 
cgit v0.10.2


From 2c13c919d9e9a3db9896143a501f83dcbbe1ced4 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 1 Feb 2011 09:48:37 -0500
Subject: sched: Limit the scope of clear_buddies

The clear_buddies function does not seem to play well with the concept
of hierarchical runqueues.  In the following tree, task groups are
represented by 'G', tasks by 'T', next by 'n' and last by 'l'.

     (nl)
    /    \
   G(nl)  G
   / \     \
 T(l) T(n)  T

This situation can arise when a task is woken up T(n), and the previously
running task T(l) is marked last.

When clear_buddies is called from either T(l) or T(n), the next and last
buddies of the group G(nl) will be cleared.  This is not the desired
result, since we would like to be able to find the other type of buddy
in many cases.

This especially a worry when implementing yield_task_fair through the
buddy system.

The fix is simple: only clear the buddy type that the task itself
is indicated to be.  As an added bonus, we stop walking up the tree
when the buddy has already been cleared or pointed elsewhere.

Signed-off-by: Rik van Riel <riel@redhat.coM>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201094837.6b0962a9@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4de9905..a785e08 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -995,19 +995,35 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		list_add_leaf_cfs_rq(cfs_rq);
 }
 
-static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies_last(struct sched_entity *se)
 {
-	if (!se || cfs_rq->last == se)
-		cfs_rq->last = NULL;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->last == se)
+			cfs_rq->last = NULL;
+		else
+			break;
+	}
+}
 
-	if (!se || cfs_rq->next == se)
-		cfs_rq->next = NULL;
+static void __clear_buddies_next(struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->next == se)
+			cfs_rq->next = NULL;
+		else
+			break;
+	}
 }
 
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	for_each_sched_entity(se)
-		__clear_buddies(cfs_rq_of(se), se);
+	if (cfs_rq->last == se)
+		__clear_buddies_last(se);
+
+	if (cfs_rq->next == se)
+		__clear_buddies_next(se);
 }
 
 static void
-- 
cgit v0.10.2


From ac53db596cc08ecb8040cfb6f71ae40c6f2041c4 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 1 Feb 2011 09:51:03 -0500
Subject: sched: Use a buddy to implement yield_task_fair()

Use the buddy mechanism to implement yield_task_fair.  This
allows us to skip onto the next highest priority se at every
level in the CFS tree, unless doing so would introduce gross
unfairness in CPU time distribution.

We order the buddy selection in pick_next_entity to check
yield first, then last, then next.  We need next to be able
to override yield, because it is possible for the "next" and
"yield" task to be different processen in the same sub-tree
of the CFS tree.  When they are, we need to go into that
sub-tree regardless of the "yield" hint, and pick the correct
entity once we get to the right level.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0542774..4e9fad2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1942,8 +1942,6 @@ int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
-extern unsigned int sysctl_sched_compat_yield;
-
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 477e1bc..ae5e1a1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,7 +324,7 @@ struct cfs_rq {
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last;
+	struct sched_entity *curr, *next, *last, *skip;
 
 	unsigned int nr_spread_over;
 
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8e..7bacd83 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	if (cfs_rq->rb_leftmost)
-		MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+		MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
 	last = __pick_last_entity(cfs_rq);
 	if (last)
 		max_vruntime = last->vruntime;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a785e08..c0fbeb9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8;
 unsigned int sysctl_sched_child_runs_first __read_mostly;
 
 /*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
-/*
  * SCHED_OTHER wake-up granularity.
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *left = cfs_rq->rb_leftmost;
 
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 	return rb_entry(left, struct sched_entity, run_node);
 }
 
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+	struct rb_node *next = rb_next(&se->run_node);
+
+	if (!next)
+		return NULL;
+
+	return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
 static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  * Scheduling class statistics methods:
  */
 
-#ifdef CONFIG_SCHED_DEBUG
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -1017,6 +1019,17 @@ static void __clear_buddies_next(struct sched_entity *se)
 	}
 }
 
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->skip == se)
+			cfs_rq->skip = NULL;
+		else
+			break;
+	}
+}
+
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (cfs_rq->last == se)
@@ -1024,6 +1037,9 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	if (cfs_rq->next == se)
 		__clear_buddies_next(se);
+
+	if (cfs_rq->skip == se)
+		__clear_buddies_skip(se);
 }
 
 static void
@@ -1099,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;
 
 	if (cfs_rq->nr_running > 1) {
-		struct sched_entity *se = __pick_next_entity(cfs_rq);
+		struct sched_entity *se = __pick_first_entity(cfs_rq);
 		s64 delta = curr->vruntime - se->vruntime;
 
 		if (delta < 0)
@@ -1143,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *left = se;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
-		se = cfs_rq->next;
+	/*
+	 * Avoid running the skip buddy, if running something else can
+	 * be done without getting too unfair.
+	 */
+	if (cfs_rq->skip == se) {
+		struct sched_entity *second = __pick_next_entity(se);
+		if (second && wakeup_preempt_entity(second, left) < 1)
+			se = second;
+	}
 
 	/*
 	 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1157,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
 		se = cfs_rq->last;
 
+	/*
+	 * Someone really wants this to run. If it's not unfair, run it.
+	 */
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+		se = cfs_rq->next;
+
 	clear_buddies(cfs_rq, se);
 
 	return se;
@@ -1333,52 +1369,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	hrtick_update(rq);
 }
 
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
-	struct task_struct *curr = rq->curr;
-	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-	struct sched_entity *rightmost, *se = &curr->se;
-
-	/*
-	 * Are we the only task in the tree?
-	 */
-	if (unlikely(rq->nr_running == 1))
-		return;
-
-	clear_buddies(cfs_rq, se);
-
-	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
-		update_rq_clock(rq);
-		/*
-		 * Update run-time statistics of the 'current'.
-		 */
-		update_curr(cfs_rq);
-
-		return;
-	}
-	/*
-	 * Find the rightmost entry in the rbtree:
-	 */
-	rightmost = __pick_last_entity(cfs_rq);
-	/*
-	 * Already in the rightmost position?
-	 */
-	if (unlikely(!rightmost || entity_before(rightmost, se)))
-		return;
-
-	/*
-	 * Minimally necessary key value to be last in the tree:
-	 * Upon rescheduling, sched_class::put_prev_task() will place
-	 * 'current' within the tree based on its new key value.
-	 */
-	se->vruntime = rightmost->vruntime + 1;
-}
-
 #ifdef CONFIG_SMP
 
 static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1849,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
 	}
 }
 
+static void set_skip_buddy(struct sched_entity *se)
+{
+	if (likely(task_of(se)->policy != SCHED_IDLE)) {
+		for_each_sched_entity(se)
+			cfs_rq_of(se)->skip = se;
+	}
+}
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -1947,6 +1945,36 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 	}
 }
 
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	struct sched_entity *se = &curr->se;
+
+	/*
+	 * Are we the only task in the tree?
+	 */
+	if (unlikely(rq->nr_running == 1))
+		return;
+
+	clear_buddies(cfs_rq, se);
+
+	if (curr->policy != SCHED_BATCH) {
+		update_rq_clock(rq);
+		/*
+		 * Update run-time statistics of the 'current'.
+		 */
+		update_curr(cfs_rq);
+	}
+
+	set_skip_buddy(se);
+}
+
 #ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bc86bb3..cbfda7e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -360,13 +360,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rt_handler,
 	},
-	{
-		.procname	= "sched_compat_yield",
-		.data		= &sysctl_sched_compat_yield,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname	= "sched_autogroup_enabled",
-- 
cgit v0.10.2


From d95f412200652694e63e64bfd49f0ae274a54479 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 1 Feb 2011 09:50:51 -0500
Subject: sched: Add yield_to(task, preempt) functionality

Currently only implemented for fair class tasks.

Add a yield_to_task method() to the fair scheduling class. allowing the
caller of yield_to() to accelerate another thread in it's thread group,
task group.

Implemented via a scheduler hint, using cfs_rq->next to encourage the
target being selected.  We can rely on pick_next_entity to keep things
fair, so noone can accelerate a thread that has already used its fair
share of CPU time.

This also means callers should only call yield_to when they really
mean it.  Calling it too often can result in the scheduler just
ignoring the hint.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110201095051.4ddb7738@annuminas.surriel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4e9fad2..c88b3bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1058,6 +1058,7 @@ struct sched_class {
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task) (struct rq *rq);
+	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
 
@@ -1972,6 +1973,7 @@ static inline int rt_mutex_getprio(struct task_struct *p)
 # define rt_mutex_adjust_pi(p)		do { } while (0)
 #endif
 
+extern bool yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
 extern int task_nice(const struct task_struct *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index ae5e1a1..2effcb7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1686,6 +1686,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 		__release(rq2->lock);
 }
 
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	BUG_ON(rq1 != rq2);
+	raw_spin_lock(&rq1->lock);
+	__acquire(rq2->lock);	/* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	BUG_ON(rq1 != rq2);
+	raw_spin_unlock(&rq1->lock);
+	__release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
@@ -5448,6 +5481,58 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);
 
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+	struct task_struct *curr = current;
+	struct rq *rq, *p_rq;
+	unsigned long flags;
+	bool yielded = 0;
+
+	local_irq_save(flags);
+	rq = this_rq();
+
+again:
+	p_rq = task_rq(p);
+	double_rq_lock(rq, p_rq);
+	while (task_rq(p) != p_rq) {
+		double_rq_unlock(rq, p_rq);
+		goto again;
+	}
+
+	if (!curr->sched_class->yield_to_task)
+		goto out;
+
+	if (curr->sched_class != p->sched_class)
+		goto out;
+
+	if (task_running(p_rq, p) || p->state)
+		goto out;
+
+	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+	if (yielded)
+		schedstat_inc(rq, yld_count);
+
+out:
+	double_rq_unlock(rq, p_rq);
+	local_irq_restore(flags);
+
+	if (yielded)
+		schedule();
+
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c0fbeb9..0270246 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1975,6 +1975,25 @@ static void yield_task_fair(struct rq *rq)
 	set_skip_buddy(se);
 }
 
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+	struct sched_entity *se = &p->se;
+
+	if (!se->on_rq)
+		return false;
+
+	/* Tell the scheduler that we'd really like pse to run next. */
+	set_next_buddy(se);
+
+	/* Make p's CPU reschedule; pick_next_entity takes care of fairness. */
+	if (preempt)
+		resched_task(rq->curr);
+
+	yield_task_fair(rq);
+
+	return true;
+}
+
 #ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods:
@@ -4243,6 +4262,7 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
+	.yield_to_task		= yield_to_task_fair,
 
 	.check_preempt_curr	= check_preempt_wakeup,
 
-- 
cgit v0.10.2


From 04bea68b2f0eeebb089ecc67b618795925268b4a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Date: Mon, 24 Jan 2011 09:58:55 +0530
Subject: of/pci: move of_irq_map_pci() into generic code

There is a tiny difference between PPC32 and PPC64. Microblaze uses the
PPC32 variant.

Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
[grant.likely@secretlab.ca: Added comment to #endif, moved documentation
	block to function implementation, fixed for non ppc and microblaze
	compiles]
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>

diff --git a/arch/microblaze/include/asm/pci-bridge.h b/arch/microblaze/include/asm/pci-bridge.h
index 0c68764..1071766 100644
--- a/arch/microblaze/include/asm/pci-bridge.h
+++ b/arch/microblaze/include/asm/pci-bridge.h
@@ -104,11 +104,22 @@ struct pci_controller {
 	int global_number;	/* PCI domain number */
 };
 
+#ifdef CONFIG_PCI
 static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
 {
 	return bus->sysdata;
 }
 
+static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
+{
+	struct pci_controller *host;
+
+	if (bus->self)
+		return pci_device_to_OF_node(bus->self);
+	host = pci_bus_to_host(bus);
+	return host ? host->dn : NULL;
+}
+
 static inline int isa_vaddr_is_ioport(void __iomem *address)
 {
 	/* No specific ISA handling on ppc32 at this stage, it
@@ -116,6 +127,7 @@ static inline int isa_vaddr_is_ioport(void __iomem *address)
 	 */
 	return 0;
 }
+#endif /* CONFIG_PCI */
 
 /* These are used for config access before all the PCI probing
    has been done. */
diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 2e72af0..d0890d3 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -64,21 +64,6 @@ extern void kdump_move_device_tree(void);
 /* CPU OF node matching */
 struct device_node *of_get_cpu_node(int cpu, unsigned int *thread);
 
-/**
- * of_irq_map_pci - Resolve the interrupt for a PCI device
- * @pdev:	the device whose interrupt is to be resolved
- * @out_irq:	structure of_irq filled by this function
- *
- * This function resolves the PCI interrupt for a given PCI device. If a
- * device-node exists for a given pci_dev, it will use normal OF tree
- * walking. If not, it will implement standard swizzling and walk up the
- * PCI tree until an device-node is found, at which point it will finish
- * resolving using the OF tree walking.
- */
-struct pci_dev;
-struct of_irq;
-extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
-
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 
diff --git a/arch/microblaze/kernel/prom_parse.c b/arch/microblaze/kernel/prom_parse.c
index 9ae24f4..47187cc 100644
--- a/arch/microblaze/kernel/prom_parse.c
+++ b/arch/microblaze/kernel/prom_parse.c
@@ -2,88 +2,11 @@
 
 #include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/pci_regs.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/etherdevice.h>
 #include <linux/of_address.h>
 #include <asm/prom.h>
-#include <asm/pci-bridge.h>
-
-#ifdef CONFIG_PCI
-int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
-{
-	struct device_node *dn, *ppnode;
-	struct pci_dev *ppdev;
-	u32 lspec;
-	u32 laddr[3];
-	u8 pin;
-	int rc;
-
-	/* Check if we have a device node, if yes, fallback to standard OF
-	 * parsing
-	 */
-	dn = pci_device_to_OF_node(pdev);
-	if (dn)
-		return of_irq_map_one(dn, 0, out_irq);
-
-	/* Ok, we don't, time to have fun. Let's start by building up an
-	 * interrupt spec.  we assume #interrupt-cells is 1, which is standard
-	 * for PCI. If you do different, then don't use that routine.
-	 */
-	rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
-	if (rc != 0)
-		return rc;
-	/* No pin, exit */
-	if (pin == 0)
-		return -ENODEV;
-
-	/* Now we walk up the PCI tree */
-	lspec = pin;
-	for (;;) {
-		/* Get the pci_dev of our parent */
-		ppdev = pdev->bus->self;
-
-		/* Ouch, it's a host bridge... */
-		if (ppdev == NULL) {
-			struct pci_controller *host;
-			host = pci_bus_to_host(pdev->bus);
-			ppnode = host ? host->dn : NULL;
-			/* No node for host bridge ? give up */
-			if (ppnode == NULL)
-				return -EINVAL;
-		} else
-			/* We found a P2P bridge, check if it has a node */
-			ppnode = pci_device_to_OF_node(ppdev);
-
-		/* Ok, we have found a parent with a device-node, hand over to
-		 * the OF parsing code.
-		 * We build a unit address from the linux device to be used for
-		 * resolution. Note that we use the linux bus number which may
-		 * not match your firmware bus numbering.
-		 * Fortunately, in most cases, interrupt-map-mask doesn't
-		 * include the bus number as part of the matching.
-		 * You should still be careful about that though if you intend
-		 * to rely on this function (you ship  a firmware that doesn't
-		 * create device nodes for all PCI devices).
-		 */
-		if (ppnode)
-			break;
-
-		/* We can only get here if we hit a P2P bridge with no node,
-		 * let's do standard swizzling and try again
-		 */
-		lspec = pci_swizzle_interrupt_pin(pdev, lspec);
-		pdev = ppdev;
-	}
-
-	laddr[0] = (pdev->bus->number << 16)
-		| (pdev->devfn << 8);
-	laddr[1]  = laddr[2] = 0;
-	return of_irq_map_raw(ppnode, &lspec, 1, laddr, out_irq);
-}
-EXPORT_SYMBOL_GPL(of_irq_map_pci);
-#endif /* CONFIG_PCI */
 
 void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
 		unsigned long *busno, unsigned long *phys, unsigned long *size)
diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index e363615..1e01a12 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -29,6 +29,7 @@
 #include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_pci.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 51e9e6f..edeb80f 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -171,6 +171,16 @@ static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus)
 	return bus->sysdata;
 }
 
+static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
+{
+	struct pci_controller *host;
+
+	if (bus->self)
+		return pci_device_to_OF_node(bus->self);
+	host = pci_bus_to_host(bus);
+	return host ? host->dn : NULL;
+}
+
 static inline int isa_vaddr_is_ioport(void __iomem *address)
 {
 	/* No specific ISA handling on ppc32 at this stage, it
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index d727575..c189aa5 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -70,21 +70,6 @@ static inline int of_node_to_nid(struct device_node *device) { return 0; }
 #endif
 #define of_node_to_nid of_node_to_nid
 
-/**
- * of_irq_map_pci - Resolve the interrupt for a PCI device
- * @pdev:	the device whose interrupt is to be resolved
- * @out_irq:	structure of_irq filled by this function
- *
- * This function resolves the PCI interrupt for a given PCI device. If a
- * device-node exists for a given pci_dev, it will use normal OF tree
- * walking. If not, it will implement standard swizzling and walk up the
- * PCI tree until an device-node is found, at which point it will finish
- * resolving using the OF tree walking.
- */
-struct pci_dev;
-struct of_irq;
-extern int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
-
 extern void of_instantiate_rtc(void);
 
 /* These includes are put at the bottom because they may contain things
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 10a44e6..eb341be 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/of_address.h>
+#include <linux/of_pci.h>
 #include <linux/mm.h>
 #include <linux/list.h>
 #include <linux/syscalls.h>
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
index c2b7a07..47187cc 100644
--- a/arch/powerpc/kernel/prom_parse.c
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -2,95 +2,11 @@
 
 #include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/pci_regs.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/etherdevice.h>
 #include <linux/of_address.h>
 #include <asm/prom.h>
-#include <asm/pci-bridge.h>
-
-#ifdef CONFIG_PCI
-int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
-{
-	struct device_node *dn, *ppnode;
-	struct pci_dev *ppdev;
-	u32 lspec;
-	u32 laddr[3];
-	u8 pin;
-	int rc;
-
-	/* Check if we have a device node, if yes, fallback to standard OF
-	 * parsing
-	 */
-	dn = pci_device_to_OF_node(pdev);
-	if (dn) {
-		rc = of_irq_map_one(dn, 0, out_irq);
-		if (!rc)
-			return rc;
-	}
-
-	/* Ok, we don't, time to have fun. Let's start by building up an
-	 * interrupt spec.  we assume #interrupt-cells is 1, which is standard
-	 * for PCI. If you do different, then don't use that routine.
-	 */
-	rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
-	if (rc != 0)
-		return rc;
-	/* No pin, exit */
-	if (pin == 0)
-		return -ENODEV;
-
-	/* Now we walk up the PCI tree */
-	lspec = pin;
-	for (;;) {
-		/* Get the pci_dev of our parent */
-		ppdev = pdev->bus->self;
-
-		/* Ouch, it's a host bridge... */
-		if (ppdev == NULL) {
-#ifdef CONFIG_PPC64
-			ppnode = pci_bus_to_OF_node(pdev->bus);
-#else
-			struct pci_controller *host;
-			host = pci_bus_to_host(pdev->bus);
-			ppnode = host ? host->dn : NULL;
-#endif
-			/* No node for host bridge ? give up */
-			if (ppnode == NULL)
-				return -EINVAL;
-		} else
-			/* We found a P2P bridge, check if it has a node */
-			ppnode = pci_device_to_OF_node(ppdev);
-
-		/* Ok, we have found a parent with a device-node, hand over to
-		 * the OF parsing code.
-		 * We build a unit address from the linux device to be used for
-		 * resolution. Note that we use the linux bus number which may
-		 * not match your firmware bus numbering.
-		 * Fortunately, in most cases, interrupt-map-mask doesn't include
-		 * the bus number as part of the matching.
-		 * You should still be careful about that though if you intend
-		 * to rely on this function (you ship  a firmware that doesn't
-		 * create device nodes for all PCI devices).
-		 */
-		if (ppnode)
-			break;
-
-		/* We can only get here if we hit a P2P bridge with no node,
-		 * let's do standard swizzling and try again
-		 */
-		lspec = pci_swizzle_interrupt_pin(pdev, lspec);
-		pdev = ppdev;
-	}
-
-	laddr[0] = (pdev->bus->number << 16)
-		| (pdev->devfn << 8);
-	laddr[1]  = laddr[2] = 0;
-	return of_irq_map_raw(ppnode, &lspec, 1, laddr, out_irq);
-}
-EXPORT_SYMBOL_GPL(of_irq_map_pci);
-#endif /* CONFIG_PCI */
 
 void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
 		unsigned long *busno, unsigned long *phys, unsigned long *size)
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index 3c6e100..efabbf9 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -69,4 +69,10 @@ config OF_MDIO
 	help
 	  OpenFirmware MDIO bus (Ethernet PHY) accessors
 
+config OF_PCI
+	def_tristate PCI
+	depends on PCI && (PPC || MICROBLAZE)
+	help
+	  OpenFirmware PCI bus accessors
+
 endmenu # OF
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index 3ab21a0..f7861ed 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_OF_I2C)	+= of_i2c.o
 obj-$(CONFIG_OF_NET)	+= of_net.o
 obj-$(CONFIG_OF_SPI)	+= of_spi.o
 obj-$(CONFIG_OF_MDIO)	+= of_mdio.o
+obj-$(CONFIG_OF_PCI)	+= of_pci.o
diff --git a/drivers/of/of_pci.c b/drivers/of/of_pci.c
new file mode 100644
index 0000000..314535f
--- /dev/null
+++ b/drivers/of/of_pci.c
@@ -0,0 +1,91 @@
+#include <linux/kernel.h>
+#include <linux/of_pci.h>
+#include <asm/prom.h>
+
+/**
+ * of_irq_map_pci - Resolve the interrupt for a PCI device
+ * @pdev:       the device whose interrupt is to be resolved
+ * @out_irq:    structure of_irq filled by this function
+ *
+ * This function resolves the PCI interrupt for a given PCI device. If a
+ * device-node exists for a given pci_dev, it will use normal OF tree
+ * walking. If not, it will implement standard swizzling and walk up the
+ * PCI tree until an device-node is found, at which point it will finish
+ * resolving using the OF tree walking.
+ */
+int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
+{
+	struct device_node *dn, *ppnode;
+	struct pci_dev *ppdev;
+	u32 lspec;
+	__be32 lspec_be;
+	__be32 laddr[3];
+	u8 pin;
+	int rc;
+
+	/* Check if we have a device node, if yes, fallback to standard
+	 * device tree parsing
+	 */
+	dn = pci_device_to_OF_node(pdev);
+	if (dn) {
+		rc = of_irq_map_one(dn, 0, out_irq);
+		if (!rc)
+			return rc;
+	}
+
+	/* Ok, we don't, time to have fun. Let's start by building up an
+	 * interrupt spec.  we assume #interrupt-cells is 1, which is standard
+	 * for PCI. If you do different, then don't use that routine.
+	 */
+	rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
+	if (rc != 0)
+		return rc;
+	/* No pin, exit */
+	if (pin == 0)
+		return -ENODEV;
+
+	/* Now we walk up the PCI tree */
+	lspec = pin;
+	for (;;) {
+		/* Get the pci_dev of our parent */
+		ppdev = pdev->bus->self;
+
+		/* Ouch, it's a host bridge... */
+		if (ppdev == NULL) {
+			ppnode = pci_bus_to_OF_node(pdev->bus);
+
+			/* No node for host bridge ? give up */
+			if (ppnode == NULL)
+				return -EINVAL;
+		} else {
+			/* We found a P2P bridge, check if it has a node */
+			ppnode = pci_device_to_OF_node(ppdev);
+		}
+
+		/* Ok, we have found a parent with a device-node, hand over to
+		 * the OF parsing code.
+		 * We build a unit address from the linux device to be used for
+		 * resolution. Note that we use the linux bus number which may
+		 * not match your firmware bus numbering.
+		 * Fortunately, in most cases, interrupt-map-mask doesn't
+		 * include the bus number as part of the matching.
+		 * You should still be careful about that though if you intend
+		 * to rely on this function (you ship  a firmware that doesn't
+		 * create device nodes for all PCI devices).
+		 */
+		if (ppnode)
+			break;
+
+		/* We can only get here if we hit a P2P bridge with no node,
+		 * let's do standard swizzling and try again
+		 */
+		lspec = pci_swizzle_interrupt_pin(pdev, lspec);
+		pdev = ppdev;
+	}
+
+	lspec_be = cpu_to_be32(lspec);
+	laddr[0] = cpu_to_be32((pdev->bus->number << 16) | (pdev->devfn << 8));
+	laddr[1]  = laddr[2] = cpu_to_be32(0);
+	return of_irq_map_raw(ppnode, &lspec_be, 1, laddr, out_irq);
+}
+EXPORT_SYMBOL_GPL(of_irq_map_pci);
diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h
new file mode 100644
index 0000000..85a27b6
--- /dev/null
+++ b/include/linux/of_pci.h
@@ -0,0 +1,9 @@
+#ifndef __OF_PCI_H
+#define __OF_PCI_H
+
+#include <linux/pci.h>
+
+struct pci_dev;
+struct of_irq;
+int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq);
+#endif
-- 
cgit v0.10.2


From 764328d3209dd81b02a55722556b07b6f35e3ca0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 4 Feb 2011 07:33:24 -0200
Subject: perf top: Remove superfluous name_len field

From the sym_entry struct, struct symbol already has this field.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 104de9a..154e088 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -787,9 +787,6 @@ static int symbol_filter(struct map *map, struct symbol *sym)
 		}
 	}
 
-	if (!syme->skip)
-		syme->name_len = strlen(sym->name);
-
 	return 0;
 }
 
diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index 1d2e265..70a9c13 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -200,6 +200,7 @@ void perf_top__find_widths(struct perf_top *top, struct rb_root *root,
 
 	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
 		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
+		struct symbol *sym = sym_entry__symbol(syme);
 
 		if (++printed > top->print_entries ||
 		    (int)syme->snap_count < top->count_filter)
@@ -211,7 +212,7 @@ void perf_top__find_widths(struct perf_top *top, struct rb_root *root,
 		if (syme->map->dso->short_name_len > *dso_short_width)
 			*dso_short_width = syme->map->dso->short_name_len;
 
-		if (syme->name_len > *sym_width)
-			*sym_width = syme->name_len;
+		if (sym->namelen > *sym_width)
+			*sym_width = sym->namelen;
 	}
 }
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index 611370f..5009508 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -31,7 +31,6 @@ struct sym_entry {
 	unsigned long		snap_count;
 	double			weight;
 	int			skip;
-	u16			name_len;
 	u8			origin;
 	struct map		*map;
 	struct sym_entry_source	*src;
-- 
cgit v0.10.2


From 78f7defedbb4da73b9a07635c357c1afcaa55c8f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 4 Feb 2011 09:45:46 -0200
Subject: perf annotate: Move annotate functions to util/

They will be used by perf top, so that we have just one set of routines
to do annotation.

Rename "struct sym_priv" to "struct annotation", etc, to clarify this
code a bit.

Rename "struct sym_ext" to "struct source_line", to give it a meaningful
name, that clarifies that it is a the result of an addr2line call, that
is sorted by percentage one particular source code line appeared in the
annotation.

And since we're moving things around also rename 'sym_hist->ip' to
'sym_hist->addr' as we want to do data structure annotation at some
point.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 4c9499c..be3eb1d 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -401,6 +401,7 @@ LIB_H += util/include/dwarf-regs.h
 LIB_H += util/include/asm/dwarf2.h
 LIB_H += util/include/asm/cpufeature.h
 LIB_H += perf.h
+LIB_H += util/annotate.h
 LIB_H += util/cache.h
 LIB_H += util/callchain.h
 LIB_H += util/build-id.h
@@ -444,6 +445,7 @@ LIB_H += $(ARCH_INCLUDE)
 
 LIB_OBJS += $(OUTPUT)util/abspath.o
 LIB_OBJS += $(OUTPUT)util/alias.o
+LIB_OBJS += $(OUTPUT)util/annotate.o
 LIB_OBJS += $(OUTPUT)util/build-id.o
 LIB_OBJS += $(OUTPUT)util/config.o
 LIB_OBJS += $(OUTPUT)util/ctype.o
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index cd9dec4..9072ef4 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -9,6 +9,7 @@
 
 #include "util/util.h"
 
+#include "util/util.h"
 #include "util/color.h"
 #include <linux/list.h>
 #include "util/cache.h"
@@ -18,6 +19,7 @@
 #include "perf.h"
 #include "util/debug.h"
 
+#include "util/annotate.h"
 #include "util/event.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
@@ -79,245 +81,10 @@ static int process_sample_event(union perf_event *event,
 	return 0;
 }
 
-static int objdump_line__print(struct objdump_line *self,
-			       struct list_head *head,
-			       struct hist_entry *he, u64 len)
-{
-	struct symbol *sym = he->ms.sym;
-	static const char *prev_line;
-	static const char *prev_color;
-
-	if (self->offset != -1) {
-		const char *path = NULL;
-		unsigned int hits = 0;
-		double percent = 0.0;
-		const char *color;
-		struct sym_priv *priv = symbol__priv(sym);
-		struct sym_ext *sym_ext = priv->ext;
-		struct sym_hist *h = priv->hist;
-		s64 offset = self->offset;
-		struct objdump_line *next = objdump__get_next_ip_line(head, self);
-
-		while (offset < (s64)len &&
-		       (next == NULL || offset < next->offset)) {
-			if (sym_ext) {
-				if (path == NULL)
-					path = sym_ext[offset].path;
-				percent += sym_ext[offset].percent;
-			} else
-				hits += h->ip[offset];
-
-			++offset;
-		}
-
-		if (sym_ext == NULL && h->sum)
-			percent = 100.0 * hits / h->sum;
-
-		color = get_percent_color(percent);
-
-		/*
-		 * Also color the filename and line if needed, with
-		 * the same color than the percentage. Don't print it
-		 * twice for close colored ip with the same filename:line
-		 */
-		if (path) {
-			if (!prev_line || strcmp(prev_line, path)
-				       || color != prev_color) {
-				color_fprintf(stdout, color, " %s", path);
-				prev_line = path;
-				prev_color = color;
-			}
-		}
-
-		color_fprintf(stdout, color, " %7.2f", percent);
-		printf(" :	");
-		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", self->line);
-	} else {
-		if (!*self->line)
-			printf("         :\n");
-		else
-			printf("         :	%s\n", self->line);
-	}
-
-	return 0;
-}
-
-static struct rb_root root_sym_ext;
-
-static void insert_source_line(struct sym_ext *sym_ext)
-{
-	struct sym_ext *iter;
-	struct rb_node **p = &root_sym_ext.rb_node;
-	struct rb_node *parent = NULL;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct sym_ext, node);
-
-		if (sym_ext->percent > iter->percent)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&sym_ext->node, parent, p);
-	rb_insert_color(&sym_ext->node, &root_sym_ext);
-}
-
-static void free_source_line(struct hist_entry *he, int len)
-{
-	struct sym_priv *priv = symbol__priv(he->ms.sym);
-	struct sym_ext *sym_ext = priv->ext;
-	int i;
-
-	if (!sym_ext)
-		return;
-
-	for (i = 0; i < len; i++)
-		free(sym_ext[i].path);
-	free(sym_ext);
-
-	priv->ext = NULL;
-	root_sym_ext = RB_ROOT;
-}
-
-/* Get the filename:line for the colored entries */
-static void
-get_source_line(struct hist_entry *he, int len, const char *filename)
-{
-	struct symbol *sym = he->ms.sym;
-	u64 start;
-	int i;
-	char cmd[PATH_MAX * 2];
-	struct sym_ext *sym_ext;
-	struct sym_priv *priv = symbol__priv(sym);
-	struct sym_hist *h = priv->hist;
-
-	if (!h->sum)
-		return;
-
-	sym_ext = priv->ext = calloc(len, sizeof(struct sym_ext));
-	if (!priv->ext)
-		return;
-
-	start = he->ms.map->unmap_ip(he->ms.map, sym->start);
-
-	for (i = 0; i < len; i++) {
-		char *path = NULL;
-		size_t line_len;
-		u64 offset;
-		FILE *fp;
-
-		sym_ext[i].percent = 100.0 * h->ip[i] / h->sum;
-		if (sym_ext[i].percent <= 0.5)
-			continue;
-
-		offset = start + i;
-		sprintf(cmd, "addr2line -e %s %016" PRIx64, filename, offset);
-		fp = popen(cmd, "r");
-		if (!fp)
-			continue;
-
-		if (getline(&path, &line_len, fp) < 0 || !line_len)
-			goto next;
-
-		sym_ext[i].path = malloc(sizeof(char) * line_len + 1);
-		if (!sym_ext[i].path)
-			goto next;
-
-		strcpy(sym_ext[i].path, path);
-		insert_source_line(&sym_ext[i]);
-
-	next:
-		pclose(fp);
-	}
-}
-
-static void print_summary(const char *filename)
-{
-	struct sym_ext *sym_ext;
-	struct rb_node *node;
-
-	printf("\nSorted summary for file %s\n", filename);
-	printf("----------------------------------------------\n\n");
-
-	if (RB_EMPTY_ROOT(&root_sym_ext)) {
-		printf(" Nothing higher than %1.1f%%\n", MIN_GREEN);
-		return;
-	}
-
-	node = rb_first(&root_sym_ext);
-	while (node) {
-		double percent;
-		const char *color;
-		char *path;
-
-		sym_ext = rb_entry(node, struct sym_ext, node);
-		percent = sym_ext->percent;
-		color = get_percent_color(percent);
-		path = sym_ext->path;
-
-		color_fprintf(stdout, color, " %7.2f %s", percent, path);
-		node = rb_next(node);
-	}
-}
-
-static void hist_entry__print_hits(struct hist_entry *self)
-{
-	struct symbol *sym = self->ms.sym;
-	struct sym_priv *priv = symbol__priv(sym);
-	struct sym_hist *h = priv->hist;
-	u64 len = sym->end - sym->start, offset;
-
-	for (offset = 0; offset < len; ++offset)
-		if (h->ip[offset] != 0)
-			printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
-			       sym->start + offset, h->ip[offset]);
-	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum);
-}
-
 static int hist_entry__tty_annotate(struct hist_entry *he)
 {
-	struct map *map = he->ms.map;
-	struct dso *dso = map->dso;
-	struct symbol *sym = he->ms.sym;
-	const char *filename = dso->long_name, *d_filename;
-	u64 len;
-	LIST_HEAD(head);
-	struct objdump_line *pos, *n;
-
-	if (hist_entry__annotate(he, &head, 0) < 0)
-		return -1;
-
-	if (full_paths)
-		d_filename = filename;
-	else
-		d_filename = basename(filename);
-
-	len = sym->end - sym->start;
-
-	if (print_line) {
-		get_source_line(he, len, filename);
-		print_summary(filename);
-	}
-
-	printf("\n\n------------------------------------------------\n");
-	printf(" Percent |	Source code & Disassembly of %s\n", d_filename);
-	printf("------------------------------------------------\n");
-
-	if (verbose)
-		hist_entry__print_hits(he);
-
-	list_for_each_entry_safe(pos, n, &head, node) {
-		objdump_line__print(pos, &head, he, len);
-		list_del(&pos->node);
-		objdump_line__free(pos);
-	}
-
-	if (print_line)
-		free_source_line(he, len);
-
-	return 0;
+	return symbol__tty_annotate(he->ms.sym, he->ms.map,
+				    print_line, full_paths);
 }
 
 static void hists__find_annotations(struct hists *self)
@@ -327,13 +94,13 @@ static void hists__find_annotations(struct hists *self)
 
 	while (nd) {
 		struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
-		struct sym_priv *priv;
+		struct annotation *notes;
 
 		if (he->ms.sym == NULL || he->ms.map->dso->annotate_warned)
 			goto find_next;
 
-		priv = symbol__priv(he->ms.sym);
-		if (priv->hist == NULL) {
+		notes = symbol__annotation(he->ms.sym);
+		if (notes->histogram == NULL) {
 find_next:
 			if (key == KEY_LEFT)
 				nd = rb_prev(nd);
@@ -362,11 +129,11 @@ find_next:
 			nd = rb_next(nd);
 			/*
 			 * Since we have a hist_entry per IP for the same
-			 * symbol, free he->ms.sym->hist to signal we already
+			 * symbol, free he->ms.sym->histogram to signal we already
 			 * processed this symbol.
 			 */
-			free(priv->hist);
-			priv->hist = NULL;
+			free(notes->histogram);
+			notes->histogram = NULL;
 		}
 	}
 }
@@ -454,7 +221,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
 
 	setup_browser(true);
 
-	symbol_conf.priv_size = sizeof(struct sym_priv);
+	symbol_conf.priv_size = sizeof(struct annotation);
 	symbol_conf.try_vmlinux_path = true;
 
 	if (symbol__init() < 0)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 080937c..91e4cdb 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -9,6 +9,7 @@
 
 #include "util/util.h"
 
+#include "util/annotate.h"
 #include "util/color.h"
 #include <linux/list.h>
 #include "util/cache.h"
@@ -508,7 +509,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	 * implementation.
 	 */
 	if (use_browser > 0) {
-		symbol_conf.priv_size = sizeof(struct sym_priv);
+		symbol_conf.priv_size = sizeof(struct annotation);
 		/*
  		 * For searching by name on the "Browse map details".
  		 * providing it only in verbose mode not to bloat too
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
new file mode 100644
index 0000000..9b25575
--- /dev/null
+++ b/tools/perf/util/annotate.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Parts came from builtin-annotate.c, see those files for further
+ * copyright notes.
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "util.h"
+#include "build-id.h"
+#include "color.h"
+#include "cache.h"
+#include "symbol.h"
+#include "debug.h"
+#include "annotate.h"
+
+static int symbol__alloc_hist(struct symbol *sym)
+{
+	struct annotation *notes = symbol__annotation(sym);
+	const int size = (sizeof(*notes->histogram) +
+			  (sym->end - sym->start) * sizeof(u64));
+
+	notes->histogram = zalloc(size);
+	return notes->histogram == NULL ? -1 : 0;
+}
+
+int symbol__inc_addr_samples(struct symbol *sym, struct map *map, u64 addr)
+{
+	unsigned int sym_size, offset;
+	struct annotation *notes;
+	struct sym_hist *h;
+
+	if (!sym || !map)
+		return 0;
+
+	notes = symbol__annotation(sym);
+	if (notes->histogram == NULL && symbol__alloc_hist(sym) < 0)
+		return -ENOMEM;
+
+	sym_size = sym->end - sym->start;
+	offset = addr - sym->start;
+
+	pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map->unmap_ip(map, addr));
+
+	if (offset >= sym_size)
+		return 0;
+
+	h = notes->histogram;
+	h->sum++;
+	h->addr[offset]++;
+
+	pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
+		  "] => %" PRIu64 "\n", sym->start, sym->name,
+		  addr, addr - sym->start, h->addr[offset]);
+	return 0;
+}
+
+static struct objdump_line *objdump_line__new(s64 offset, char *line, size_t privsize)
+{
+	struct objdump_line *self = malloc(sizeof(*self) + privsize);
+
+	if (self != NULL) {
+		self->offset = offset;
+		self->line = line;
+	}
+
+	return self;
+}
+
+void objdump_line__free(struct objdump_line *self)
+{
+	free(self->line);
+	free(self);
+}
+
+static void objdump__add_line(struct list_head *head, struct objdump_line *line)
+{
+	list_add_tail(&line->node, head);
+}
+
+struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
+					       struct objdump_line *pos)
+{
+	list_for_each_entry_continue(pos, head, node)
+		if (pos->offset >= 0)
+			return pos;
+
+	return NULL;
+}
+
+static void objdump_line__print(struct objdump_line *oline,
+				struct list_head *head,
+				struct symbol *sym, u64 len)
+{
+	static const char *prev_line;
+	static const char *prev_color;
+
+	if (oline->offset != -1) {
+		const char *path = NULL;
+		unsigned int hits = 0;
+		double percent = 0.0;
+		const char *color;
+		struct annotation *notes = symbol__annotation(sym);
+		struct source_line *src_line = notes->src_line;
+		struct sym_hist *h = notes->histogram;
+		s64 offset = oline->offset;
+		struct objdump_line *next = objdump__get_next_ip_line(head, oline);
+
+		while (offset < (s64)len &&
+		       (next == NULL || offset < next->offset)) {
+			if (src_line) {
+				if (path == NULL)
+					path = src_line[offset].path;
+				percent += src_line[offset].percent;
+			} else
+				hits += h->addr[offset];
+
+			++offset;
+		}
+
+		if (src_line == NULL && h->sum)
+			percent = 100.0 * hits / h->sum;
+
+		color = get_percent_color(percent);
+
+		/*
+		 * Also color the filename and line if needed, with
+		 * the same color than the percentage. Don't print it
+		 * twice for close colored addr with the same filename:line
+		 */
+		if (path) {
+			if (!prev_line || strcmp(prev_line, path)
+				       || color != prev_color) {
+				color_fprintf(stdout, color, " %s", path);
+				prev_line = path;
+				prev_color = color;
+			}
+		}
+
+		color_fprintf(stdout, color, " %7.2f", percent);
+		printf(" :	");
+		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", oline->line);
+	} else {
+		if (!*oline->line)
+			printf("         :\n");
+		else
+			printf("         :	%s\n", oline->line);
+	}
+}
+
+static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, FILE *file,
+				      struct list_head *head, size_t privsize)
+{
+	struct objdump_line *objdump_line;
+	char *line = NULL, *tmp, *tmp2, *c;
+	size_t line_len;
+	s64 line_ip, offset = -1;
+
+	if (getline(&line, &line_len, file) < 0)
+		return -1;
+
+	if (!line)
+		return -1;
+
+	while (line_len != 0 && isspace(line[line_len - 1]))
+		line[--line_len] = '\0';
+
+	c = strchr(line, '\n');
+	if (c)
+		*c = 0;
+
+	line_ip = -1;
+
+	/*
+	 * Strip leading spaces:
+	 */
+	tmp = line;
+	while (*tmp) {
+		if (*tmp != ' ')
+			break;
+		tmp++;
+	}
+
+	if (*tmp) {
+		/*
+		 * Parse hexa addresses followed by ':'
+		 */
+		line_ip = strtoull(tmp, &tmp2, 16);
+		if (*tmp2 != ':' || tmp == tmp2 || tmp2[1] == '\0')
+			line_ip = -1;
+	}
+
+	if (line_ip != -1) {
+		u64 start = map__rip_2objdump(map, sym->start),
+		    end = map__rip_2objdump(map, sym->end);
+
+		offset = line_ip - start;
+		if (offset < 0 || (u64)line_ip > end)
+			offset = -1;
+	}
+
+	objdump_line = objdump_line__new(offset, line, privsize);
+	if (objdump_line == NULL) {
+		free(line);
+		return -1;
+	}
+	objdump__add_line(head, objdump_line);
+
+	return 0;
+}
+
+int symbol__annotate(struct symbol *sym, struct map *map,
+		     struct list_head *head, size_t privsize)
+{
+	struct dso *dso = map->dso;
+	char *filename = dso__build_id_filename(dso, NULL, 0);
+	bool free_filename = true;
+	char command[PATH_MAX * 2];
+	FILE *file;
+	int err = 0;
+	u64 len;
+	char symfs_filename[PATH_MAX];
+
+	if (filename) {
+		snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
+			 symbol_conf.symfs, filename);
+	}
+
+	if (filename == NULL) {
+		if (dso->has_build_id) {
+			pr_err("Can't annotate %s: not enough memory\n",
+			       sym->name);
+			return -ENOMEM;
+		}
+		goto fallback;
+	} else if (readlink(symfs_filename, command, sizeof(command)) < 0 ||
+		   strstr(command, "[kernel.kallsyms]") ||
+		   access(symfs_filename, R_OK)) {
+		free(filename);
+fallback:
+		/*
+		 * If we don't have build-ids or the build-id file isn't in the
+		 * cache, or is just a kallsyms file, well, lets hope that this
+		 * DSO is the same as when 'perf record' ran.
+		 */
+		filename = dso->long_name;
+		snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
+			 symbol_conf.symfs, filename);
+		free_filename = false;
+	}
+
+	if (dso->origin == DSO__ORIG_KERNEL) {
+		if (dso->annotate_warned)
+			goto out_free_filename;
+		err = -ENOENT;
+		dso->annotate_warned = 1;
+		pr_err("Can't annotate %s: No vmlinux file was found in the "
+		       "path\n", sym->name);
+		goto out_free_filename;
+	}
+
+	pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
+		 filename, sym->name, map->unmap_ip(map, sym->start),
+		 map->unmap_ip(map, sym->end));
+
+	len = sym->end - sym->start;
+
+	pr_debug("annotating [%p] %30s : [%p] %30s\n",
+		 dso, dso->long_name, sym, sym->name);
+
+	snprintf(command, sizeof(command),
+		 "objdump --start-address=0x%016" PRIx64
+		 " --stop-address=0x%016" PRIx64 " -dS -C %s|grep -v %s|expand",
+		 map__rip_2objdump(map, sym->start),
+		 map__rip_2objdump(map, sym->end),
+		 symfs_filename, filename);
+
+	pr_debug("Executing: %s\n", command);
+
+	file = popen(command, "r");
+	if (!file)
+		goto out_free_filename;
+
+	while (!feof(file))
+		if (symbol__parse_objdump_line(sym, map, file, head, privsize) < 0)
+			break;
+
+	pclose(file);
+out_free_filename:
+	if (free_filename)
+		free(filename);
+	return err;
+}
+
+static void insert_source_line(struct rb_root *root, struct source_line *src_line)
+{
+	struct source_line *iter;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct source_line, node);
+
+		if (src_line->percent > iter->percent)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&src_line->node, parent, p);
+	rb_insert_color(&src_line->node, root);
+}
+
+static void symbol__free_source_line(struct symbol *sym, int len)
+{
+	struct annotation *notes = symbol__annotation(sym);
+	struct source_line *src_line = notes->src_line;
+	int i;
+
+	for (i = 0; i < len; i++)
+		free(src_line[i].path);
+
+	free(src_line);
+	notes->src_line = NULL;
+}
+
+/* Get the filename:line for the colored entries */
+static int symbol__get_source_line(struct symbol *sym, struct map *map,
+				   struct rb_root *root, int len,
+				   const char *filename)
+{
+	u64 start;
+	int i;
+	char cmd[PATH_MAX * 2];
+	struct source_line *src_line;
+	struct annotation *notes = symbol__annotation(sym);
+	struct sym_hist *h = notes->histogram;
+
+	if (!h->sum)
+		return 0;
+
+	src_line = notes->src_line = calloc(len, sizeof(struct source_line));
+	if (!notes->src_line)
+		return -1;
+
+	start = map->unmap_ip(map, sym->start);
+
+	for (i = 0; i < len; i++) {
+		char *path = NULL;
+		size_t line_len;
+		u64 offset;
+		FILE *fp;
+
+		src_line[i].percent = 100.0 * h->addr[i] / h->sum;
+		if (src_line[i].percent <= 0.5)
+			continue;
+
+		offset = start + i;
+		sprintf(cmd, "addr2line -e %s %016" PRIx64, filename, offset);
+		fp = popen(cmd, "r");
+		if (!fp)
+			continue;
+
+		if (getline(&path, &line_len, fp) < 0 || !line_len)
+			goto next;
+
+		src_line[i].path = malloc(sizeof(char) * line_len + 1);
+		if (!src_line[i].path)
+			goto next;
+
+		strcpy(src_line[i].path, path);
+		insert_source_line(root, &src_line[i]);
+
+	next:
+		pclose(fp);
+	}
+
+	return 0;
+}
+
+static void print_summary(struct rb_root *root, const char *filename)
+{
+	struct source_line *src_line;
+	struct rb_node *node;
+
+	printf("\nSorted summary for file %s\n", filename);
+	printf("----------------------------------------------\n\n");
+
+	if (RB_EMPTY_ROOT(root)) {
+		printf(" Nothing higher than %1.1f%%\n", MIN_GREEN);
+		return;
+	}
+
+	node = rb_first(root);
+	while (node) {
+		double percent;
+		const char *color;
+		char *path;
+
+		src_line = rb_entry(node, struct source_line, node);
+		percent = src_line->percent;
+		color = get_percent_color(percent);
+		path = src_line->path;
+
+		color_fprintf(stdout, color, " %7.2f %s", percent, path);
+		node = rb_next(node);
+	}
+}
+
+static void symbol__annotate_hits(struct symbol *sym)
+{
+	struct annotation *notes = symbol__annotation(sym);
+	struct sym_hist *h = notes->histogram;
+	u64 len = sym->end - sym->start, offset;
+
+	for (offset = 0; offset < len; ++offset)
+		if (h->addr[offset] != 0)
+			printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
+			       sym->start + offset, h->addr[offset]);
+	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum);
+}
+
+int symbol__tty_annotate(struct symbol *sym, struct map *map, bool print_lines,
+			 bool full_paths)
+{
+	struct dso *dso = map->dso;
+	const char *filename = dso->long_name, *d_filename;
+	struct rb_root source_line = RB_ROOT;
+	struct objdump_line *pos, *n;
+	LIST_HEAD(head);
+	u64 len;
+
+	if (symbol__annotate(sym, map, &head, 0) < 0)
+		return -1;
+
+	if (full_paths)
+		d_filename = filename;
+	else
+		d_filename = basename(filename);
+
+	len = sym->end - sym->start;
+
+	if (print_lines) {
+		symbol__get_source_line(sym, map, &source_line, len, filename);
+		print_summary(&source_line, filename);
+	}
+
+	printf("\n\n------------------------------------------------\n");
+	printf(" Percent |	Source code & Disassembly of %s\n", d_filename);
+	printf("------------------------------------------------\n");
+
+	if (verbose)
+		symbol__annotate_hits(sym);
+
+	list_for_each_entry_safe(pos, n, &head, node) {
+		objdump_line__print(pos, &head, sym, len);
+		list_del(&pos->node);
+		objdump_line__free(pos);
+	}
+
+	if (print_lines)
+		symbol__free_source_line(sym, len);
+
+	return 0;
+}
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
new file mode 100644
index 0000000..6e2fbc2
--- /dev/null
+++ b/tools/perf/util/annotate.h
@@ -0,0 +1,65 @@
+#ifndef __PERF_ANNOTATE_H
+#define __PERF_ANNOTATE_H
+
+#include <stdbool.h>
+#include "types.h"
+#include "symbol.h"
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+struct objdump_line {
+	struct list_head node;
+	s64		 offset;
+	char		 *line;
+};
+
+void objdump_line__free(struct objdump_line *self);
+struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
+					       struct objdump_line *pos);
+
+struct sym_hist {
+	u64		sum;
+	u64		addr[0];
+};
+
+struct source_line {
+	struct rb_node	node;
+	double		percent;
+	char		*path;
+};
+
+struct annotation {
+	struct sym_hist	   *histogram;
+	struct source_line *src_line;
+};
+
+struct sannotation {
+	struct annotation annotation;
+	struct symbol	  symbol;
+};
+
+static inline struct annotation *symbol__annotation(struct symbol *sym)
+{
+	struct sannotation *a = container_of(sym, struct sannotation, symbol);
+	return &a->annotation;
+}
+
+int symbol__inc_addr_samples(struct symbol *sym, struct map *map, u64 addr);
+
+int symbol__annotate(struct symbol *sym, struct map *map,
+		     struct list_head *head, size_t privsize);
+
+int symbol__tty_annotate(struct symbol *sym, struct map *map,
+			 bool print_lines, bool full_paths);
+
+#ifdef NO_NEWT_SUPPORT
+static inline int symbol__tui_annotate(symbol *sym __used,
+				       struct map *map __used)
+{
+	return 0;
+}
+#else
+int symbol__tui_annotate(struct symbol *sym, struct map *map);
+#endif
+
+#endif	/* __PERF_ANNOTATE_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 9588780..6d9c92c 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1,3 +1,4 @@
+#include "annotate.h"
 #include "util.h"
 #include "build-id.h"
 #include "hist.h"
@@ -949,225 +950,15 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread)
 	}
 }
 
-static int symbol__alloc_hist(struct symbol *self)
+int hist_entry__inc_addr_samples(struct hist_entry *he, u64 ip)
 {
-	struct sym_priv *priv = symbol__priv(self);
-	const int size = (sizeof(*priv->hist) +
-			  (self->end - self->start) * sizeof(u64));
-
-	priv->hist = zalloc(size);
-	return priv->hist == NULL ? -1 : 0;
-}
-
-int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip)
-{
-	unsigned int sym_size, offset;
-	struct symbol *sym = self->ms.sym;
-	struct sym_priv *priv;
-	struct sym_hist *h;
-
-	if (!sym || !self->ms.map)
-		return 0;
-
-	priv = symbol__priv(sym);
-	if (priv->hist == NULL && symbol__alloc_hist(sym) < 0)
-		return -ENOMEM;
-
-	sym_size = sym->end - sym->start;
-	offset = ip - sym->start;
-
-	pr_debug3("%s: ip=%#" PRIx64 "\n", __func__, self->ms.map->unmap_ip(self->ms.map, ip));
-
-	if (offset >= sym_size)
-		return 0;
-
-	h = priv->hist;
-	h->sum++;
-	h->ip[offset]++;
-
-	pr_debug3("%#" PRIx64 " %s: period++ [ip: %#" PRIx64 ", %#" PRIx64
-		  "] => %" PRIu64 "\n", self->ms.sym->start, self->ms.sym->name,
-		  ip, ip - self->ms.sym->start, h->ip[offset]);
-	return 0;
-}
-
-static struct objdump_line *objdump_line__new(s64 offset, char *line, size_t privsize)
-{
-	struct objdump_line *self = malloc(sizeof(*self) + privsize);
-
-	if (self != NULL) {
-		self->offset = offset;
-		self->line = line;
-	}
-
-	return self;
-}
-
-void objdump_line__free(struct objdump_line *self)
-{
-	free(self->line);
-	free(self);
-}
-
-static void objdump__add_line(struct list_head *head, struct objdump_line *line)
-{
-	list_add_tail(&line->node, head);
-}
-
-struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
-					       struct objdump_line *pos)
-{
-	list_for_each_entry_continue(pos, head, node)
-		if (pos->offset >= 0)
-			return pos;
-
-	return NULL;
+	return symbol__inc_addr_samples(he->ms.sym, he->ms.map, ip);
 }
 
-static int hist_entry__parse_objdump_line(struct hist_entry *self, FILE *file,
-					  struct list_head *head, size_t privsize)
-{
-	struct symbol *sym = self->ms.sym;
-	struct objdump_line *objdump_line;
-	char *line = NULL, *tmp, *tmp2, *c;
-	size_t line_len;
-	s64 line_ip, offset = -1;
-
-	if (getline(&line, &line_len, file) < 0)
-		return -1;
-
-	if (!line)
-		return -1;
-
-	while (line_len != 0 && isspace(line[line_len - 1]))
-		line[--line_len] = '\0';
-
-	c = strchr(line, '\n');
-	if (c)
-		*c = 0;
-
-	line_ip = -1;
-
-	/*
-	 * Strip leading spaces:
-	 */
-	tmp = line;
-	while (*tmp) {
-		if (*tmp != ' ')
-			break;
-		tmp++;
-	}
-
-	if (*tmp) {
-		/*
-		 * Parse hexa addresses followed by ':'
-		 */
-		line_ip = strtoull(tmp, &tmp2, 16);
-		if (*tmp2 != ':' || tmp == tmp2 || tmp2[1] == '\0')
-			line_ip = -1;
-	}
-
-	if (line_ip != -1) {
-		u64 start = map__rip_2objdump(self->ms.map, sym->start),
-		    end = map__rip_2objdump(self->ms.map, sym->end);
-
-		offset = line_ip - start;
-		if (offset < 0 || (u64)line_ip > end)
-			offset = -1;
-	}
-
-	objdump_line = objdump_line__new(offset, line, privsize);
-	if (objdump_line == NULL) {
-		free(line);
-		return -1;
-	}
-	objdump__add_line(head, objdump_line);
-
-	return 0;
-}
-
-int hist_entry__annotate(struct hist_entry *self, struct list_head *head,
+int hist_entry__annotate(struct hist_entry *he, struct list_head *head,
 			 size_t privsize)
 {
-	struct symbol *sym = self->ms.sym;
-	struct map *map = self->ms.map;
-	struct dso *dso = map->dso;
-	char *filename = dso__build_id_filename(dso, NULL, 0);
-	bool free_filename = true;
-	char command[PATH_MAX * 2];
-	FILE *file;
-	int err = 0;
-	u64 len;
-	char symfs_filename[PATH_MAX];
-
-	if (filename) {
-		snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
-			 symbol_conf.symfs, filename);
-	}
-
-	if (filename == NULL) {
-		if (dso->has_build_id) {
-			pr_err("Can't annotate %s: not enough memory\n",
-			       sym->name);
-			return -ENOMEM;
-		}
-		goto fallback;
-	} else if (readlink(symfs_filename, command, sizeof(command)) < 0 ||
-		   strstr(command, "[kernel.kallsyms]") ||
-		   access(symfs_filename, R_OK)) {
-		free(filename);
-fallback:
-		/*
-		 * If we don't have build-ids or the build-id file isn't in the
-		 * cache, or is just a kallsyms file, well, lets hope that this
-		 * DSO is the same as when 'perf record' ran.
-		 */
-		filename = dso->long_name;
-		snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
-			 symbol_conf.symfs, filename);
-		free_filename = false;
-	}
-
-	if (dso->origin == DSO__ORIG_KERNEL) {
-		if (dso->annotate_warned)
-			goto out_free_filename;
-		err = -ENOENT;
-		dso->annotate_warned = 1;
-		pr_err("Can't annotate %s: No vmlinux file was found in the "
-		       "path\n", sym->name);
-		goto out_free_filename;
-	}
-
-	pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
-		 filename, sym->name, map->unmap_ip(map, sym->start),
-		 map->unmap_ip(map, sym->end));
-
-	len = sym->end - sym->start;
-
-	pr_debug("annotating [%p] %30s : [%p] %30s\n",
-		 dso, dso->long_name, sym, sym->name);
-
-	snprintf(command, sizeof(command),
-		 "objdump --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 " -dS -C %s|grep -v %s|expand",
-		 map__rip_2objdump(map, sym->start),
-		 map__rip_2objdump(map, sym->end),
-		 symfs_filename, filename);
-
-	pr_debug("Executing: %s\n", command);
-
-	file = popen(command, "r");
-	if (!file)
-		goto out_free_filename;
-
-	while (!feof(file))
-		if (hist_entry__parse_objdump_line(self, file, head, privsize) < 0)
-			break;
-
-	pclose(file);
-out_free_filename:
-	if (free_filename)
-		free(filename);
-	return err;
+	return symbol__annotate(he->ms.sym, he->ms.map, head, privsize);
 }
 
 void hists__inc_nr_events(struct hists *self, u32 type)
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 889559b..8a201f7 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -9,33 +9,6 @@ extern struct callchain_param callchain_param;
 struct hist_entry;
 struct addr_location;
 struct symbol;
-struct rb_root;
-
-struct objdump_line {
-	struct list_head node;
-	s64		 offset;
-	char		 *line;
-};
-
-void objdump_line__free(struct objdump_line *self);
-struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
-					       struct objdump_line *pos);
-
-struct sym_hist {
-	u64		sum;
-	u64		ip[0];
-};
-
-struct sym_ext {
-	struct rb_node	node;
-	double		percent;
-	char		*path;
-};
-
-struct sym_priv {
-	struct sym_hist	*hist;
-	struct sym_ext	*ext;
-};
 
 /*
  * The kernel collects the number of events it couldn't send in a stretch and
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c
index 82b78f9..daa7138 100644
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -1,9 +1,11 @@
 #include "../browser.h"
 #include "../helpline.h"
 #include "../libslang.h"
+#include "../../annotate.h"
 #include "../../hist.h"
 #include "../../sort.h"
 #include "../../symbol.h"
+#include "../../annotate.h"
 
 static void ui__error_window(const char *fmt, ...)
 {
@@ -66,24 +68,26 @@ static double objdump_line__calc_percent(struct objdump_line *self,
 	if (self->offset != -1) {
 		int len = sym->end - sym->start;
 		unsigned int hits = 0;
-		struct sym_priv *priv = symbol__priv(sym);
-		struct sym_ext *sym_ext = priv->ext;
-		struct sym_hist *h = priv->hist;
+		struct annotation *notes = symbol__annotation(sym);
+		struct source_line *src_line = notes->src_line;
+		struct sym_hist *h = notes->histogram;
 		s64 offset = self->offset;
 		struct objdump_line *next = objdump__get_next_ip_line(head, self);
 
-
 		while (offset < (s64)len &&
 		       (next == NULL || offset < next->offset)) {
-			if (sym_ext) {
-				percent += sym_ext[offset].percent;
+			if (src_line) {
+				percent += src_line[offset].percent;
 			} else
-				hits += h->ip[offset];
+				hits += h->addr[offset];
 
 			++offset;
 		}
-
-		if (sym_ext == NULL && h->sum)
+		/*
+ 		 * If the percentage wasn't already calculated in
+ 		 * symbol__get_source_line, do it now:
+ 		 */
+		if (src_line == NULL && h->sum)
 			percent = 100.0 * hits / h->sum;
 	}
 
@@ -136,10 +140,10 @@ static void annotate_browser__set_top(struct annotate_browser *self,
 static int annotate_browser__run(struct annotate_browser *self)
 {
 	struct rb_node *nd;
-	struct hist_entry *he = self->b.priv;
+	struct symbol *sym = self->b.priv;
 	int key;
 
-	if (ui_browser__show(&self->b, he->ms.sym->name,
+	if (ui_browser__show(&self->b, sym->name,
 			     "<-, -> or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0)
 		return -1;
 	/*
@@ -179,7 +183,12 @@ out:
 	return key;
 }
 
-int hist_entry__tui_annotate(struct hist_entry *self)
+int hist_entry__tui_annotate(struct hist_entry *he)
+{
+	return symbol__tui_annotate(he->ms.sym, he->ms.map);
+}
+
+int symbol__tui_annotate(struct symbol *sym, struct map *map)
 {
 	struct objdump_line *pos, *n;
 	struct objdump_line_rb_node *rbpos;
@@ -190,18 +199,18 @@ int hist_entry__tui_annotate(struct hist_entry *self)
 			.refresh = ui_browser__list_head_refresh,
 			.seek	 = ui_browser__list_head_seek,
 			.write	 = annotate_browser__write,
-			.priv	 = self,
+			.priv	 = sym,
 		},
 	};
 	int ret;
 
-	if (self->ms.sym == NULL)
+	if (sym == NULL)
 		return -1;
 
-	if (self->ms.map->dso->annotate_warned)
+	if (map->dso->annotate_warned)
 		return -1;
 
-	if (hist_entry__annotate(self, &head, sizeof(*rbpos)) < 0) {
+	if (symbol__annotate(sym, map, &head, sizeof(*rbpos)) < 0) {
 		ui__error_window(ui_helpline__last_msg);
 		return -1;
 	}
@@ -214,7 +223,7 @@ int hist_entry__tui_annotate(struct hist_entry *self)
 			browser.b.width = line_len;
 		rbpos = objdump_line__rb(pos);
 		rbpos->idx = browser.b.nr_entries++;
-		rbpos->percent = objdump_line__calc_percent(pos, &head, self->ms.sym);
+		rbpos->percent = objdump_line__calc_percent(pos, &head, sym);
 		if (rbpos->percent < 0.01)
 			continue;
 		objdump__insert_line(&browser.entries, rbpos);
-- 
cgit v0.10.2


From 2f525d0148ef2734c8a172201e5e1e9167a8a5fd Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 4 Feb 2011 13:43:24 -0200
Subject: perf annotate: Support multiple histograms in annotation

The perf annotate tool continues aggregating everything on just one
histograms, but to support the top model add support for one histogram
perf evsel in the evlist.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 9072ef4..f3e4423 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -57,7 +57,18 @@ static int hists__add_entry(struct hists *self, struct addr_location *al)
 	if (he == NULL)
 		return -ENOMEM;
 
-	return hist_entry__inc_addr_samples(he, al->addr);
+	if (he->ms.sym != NULL) {
+		/*
+		 * All aggregated on the first sym_hist.
+		 */
+		struct annotation *notes = symbol__annotation(he->ms.sym);
+		if (notes->histograms == NULL && symbol__alloc_hist(he->ms.sym, 1) < 0)
+			return -ENOMEM;
+
+		return hist_entry__inc_addr_samples(he, 0, al->addr);
+	}
+
+	return 0;
 }
 
 static int process_sample_event(union perf_event *event,
@@ -81,9 +92,9 @@ static int process_sample_event(union perf_event *event,
 	return 0;
 }
 
-static int hist_entry__tty_annotate(struct hist_entry *he)
+static int hist_entry__tty_annotate(struct hist_entry *he, int evidx)
 {
-	return symbol__tty_annotate(he->ms.sym, he->ms.map,
+	return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx,
 				    print_line, full_paths);
 }
 
@@ -100,7 +111,7 @@ static void hists__find_annotations(struct hists *self)
 			goto find_next;
 
 		notes = symbol__annotation(he->ms.sym);
-		if (notes->histogram == NULL) {
+		if (notes->histograms == NULL) {
 find_next:
 			if (key == KEY_LEFT)
 				nd = rb_prev(nd);
@@ -110,7 +121,8 @@ find_next:
 		}
 
 		if (use_browser > 0) {
-			key = hist_entry__tui_annotate(he);
+			/* For now all is aggregated on the first */
+			key = hist_entry__tui_annotate(he, 0);
 			switch (key) {
 			case KEY_RIGHT:
 				next = rb_next(nd);
@@ -125,15 +137,16 @@ find_next:
 			if (next != NULL)
 				nd = next;
 		} else {
-			hist_entry__tty_annotate(he);
+			/* For now all is aggregated on the first */
+			hist_entry__tty_annotate(he, 0);
 			nd = rb_next(nd);
 			/*
 			 * Since we have a hist_entry per IP for the same
 			 * symbol, free he->ms.sym->histogram to signal we already
 			 * processed this symbol.
 			 */
-			free(notes->histogram);
-			notes->histogram = NULL;
+			free(notes->histograms);
+			notes->histograms = NULL;
 		}
 	}
 }
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 91e4cdb..de06bf5 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -118,8 +118,17 @@ static int perf_session__add_hist_entry(struct perf_session *session,
 	 * so we don't allocated the extra space needed because the stdio
 	 * code will not use it.
 	 */
-	if (use_browser > 0)
-		err = hist_entry__inc_addr_samples(he, al->addr);
+	if (al->sym != NULL && use_browser > 0) {
+		/*
+		 * All aggregated on the first sym_hist.
+		 */
+		struct annotation *notes = symbol__annotation(he->ms.sym);
+		if (notes->histograms == NULL &&
+		    symbol__alloc_hist(he->ms.sym, 1) < 0)
+			err = -ENOMEM;
+		else
+			err = hist_entry__inc_addr_samples(he, 0, al->addr);
+	}
 
 	return err;
 }
@@ -349,7 +358,7 @@ static int __cmd_report(void)
 	}
 
 	if (use_browser > 0)
-		hists__tui_browse_tree(&session->hists_tree, help);
+		hists__tui_browse_tree(&session->hists_tree, help, 0);
 	else
 		hists__tty_browse_tree(&session->hists_tree, help);
 
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 9b25575..7488fe9 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -15,44 +15,40 @@
 #include "debug.h"
 #include "annotate.h"
 
-static int symbol__alloc_hist(struct symbol *sym)
+int symbol__alloc_hist(struct symbol *sym, int nevents)
 {
 	struct annotation *notes = symbol__annotation(sym);
-	const int size = (sizeof(*notes->histogram) +
-			  (sym->end - sym->start) * sizeof(u64));
 
-	notes->histogram = zalloc(size);
-	return notes->histogram == NULL ? -1 : 0;
+	notes->sizeof_sym_hist = (sizeof(*notes->histograms) +
+				  (sym->end - sym->start) * sizeof(u64));
+	notes->histograms = calloc(nevents, notes->sizeof_sym_hist);
+	return notes->histograms == NULL ? -1 : 0;
 }
 
-int symbol__inc_addr_samples(struct symbol *sym, struct map *map, u64 addr)
+int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
+			     int evidx, u64 addr)
 {
-	unsigned int sym_size, offset;
+	unsigned offset;
 	struct annotation *notes;
 	struct sym_hist *h;
 
-	if (!sym || !map)
-		return 0;
-
 	notes = symbol__annotation(sym);
-	if (notes->histogram == NULL && symbol__alloc_hist(sym) < 0)
+	if (notes->histograms == NULL)
 		return -ENOMEM;
 
-	sym_size = sym->end - sym->start;
-	offset = addr - sym->start;
-
 	pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map->unmap_ip(map, addr));
 
-	if (offset >= sym_size)
+	if (addr >= sym->end)
 		return 0;
 
-	h = notes->histogram;
+	offset = addr - sym->start;
+	h = annotation__histogram(notes, evidx);
 	h->sum++;
 	h->addr[offset]++;
 
 	pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
-		  "] => %" PRIu64 "\n", sym->start, sym->name,
-		  addr, addr - sym->start, h->addr[offset]);
+		  ", evidx=%d] => %" PRIu64 "\n", sym->start, sym->name,
+		  addr, addr - sym->start, evidx, h->addr[offset]);
 	return 0;
 }
 
@@ -90,8 +86,8 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
 }
 
 static void objdump_line__print(struct objdump_line *oline,
-				struct list_head *head,
-				struct symbol *sym, u64 len)
+				struct list_head *head, struct symbol *sym,
+				int evidx, u64 len)
 {
 	static const char *prev_line;
 	static const char *prev_color;
@@ -103,7 +99,7 @@ static void objdump_line__print(struct objdump_line *oline,
 		const char *color;
 		struct annotation *notes = symbol__annotation(sym);
 		struct source_line *src_line = notes->src_line;
-		struct sym_hist *h = notes->histogram;
+		struct sym_hist *h = annotation__histogram(notes, evidx);
 		s64 offset = oline->offset;
 		struct objdump_line *next = objdump__get_next_ip_line(head, oline);
 
@@ -328,7 +324,7 @@ static void symbol__free_source_line(struct symbol *sym, int len)
 
 /* Get the filename:line for the colored entries */
 static int symbol__get_source_line(struct symbol *sym, struct map *map,
-				   struct rb_root *root, int len,
+				   int evidx, struct rb_root *root, int len,
 				   const char *filename)
 {
 	u64 start;
@@ -336,7 +332,7 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
 	char cmd[PATH_MAX * 2];
 	struct source_line *src_line;
 	struct annotation *notes = symbol__annotation(sym);
-	struct sym_hist *h = notes->histogram;
+	struct sym_hist *h = annotation__histogram(notes, evidx);
 
 	if (!h->sum)
 		return 0;
@@ -409,10 +405,10 @@ static void print_summary(struct rb_root *root, const char *filename)
 	}
 }
 
-static void symbol__annotate_hits(struct symbol *sym)
+static void symbol__annotate_hits(struct symbol *sym, int evidx)
 {
 	struct annotation *notes = symbol__annotation(sym);
-	struct sym_hist *h = notes->histogram;
+	struct sym_hist *h = annotation__histogram(notes, evidx);
 	u64 len = sym->end - sym->start, offset;
 
 	for (offset = 0; offset < len; ++offset)
@@ -422,8 +418,8 @@ static void symbol__annotate_hits(struct symbol *sym)
 	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum);
 }
 
-int symbol__tty_annotate(struct symbol *sym, struct map *map, bool print_lines,
-			 bool full_paths)
+int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
+			 bool print_lines, bool full_paths)
 {
 	struct dso *dso = map->dso;
 	const char *filename = dso->long_name, *d_filename;
@@ -443,7 +439,8 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, bool print_lines,
 	len = sym->end - sym->start;
 
 	if (print_lines) {
-		symbol__get_source_line(sym, map, &source_line, len, filename);
+		symbol__get_source_line(sym, map, evidx, &source_line,
+					len, filename);
 		print_summary(&source_line, filename);
 	}
 
@@ -452,10 +449,10 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, bool print_lines,
 	printf("------------------------------------------------\n");
 
 	if (verbose)
-		symbol__annotate_hits(sym);
+		symbol__annotate_hits(sym, evidx);
 
 	list_for_each_entry_safe(pos, n, &head, node) {
-		objdump_line__print(pos, &head, sym, len);
+		objdump_line__print(pos, &head, sym, evidx, len);
 		list_del(&pos->node);
 		objdump_line__free(pos);
 	}
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 6e2fbc2..0a5069ca 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -28,9 +28,21 @@ struct source_line {
 	char		*path;
 };
 
+/** struct annotation - symbols with hits have this attached as in sannotation
+ *
+ * @histogram: Array of addr hit histograms per event being monitored
+ * @src_line: If 'print_lines' is specified, per source code line percentages
+ *
+ * src_line is allocated, percentages calculated and all sorted by percentage
+ * when the annotation is about to be presented, so the percentages are for
+ * one of the entries in the histogram array, i.e. for the event/counter being
+ * presented. It is deallocated right after symbol__{tui,tty,etc}_annotate
+ * returns.
+ */
 struct annotation {
-	struct sym_hist	   *histogram;
 	struct source_line *src_line;
+	struct sym_hist	   *histograms;
+	int    		   sizeof_sym_hist;
 };
 
 struct sannotation {
@@ -38,28 +50,35 @@ struct sannotation {
 	struct symbol	  symbol;
 };
 
+static inline struct sym_hist *annotation__histogram(struct annotation *notes, int idx)
+{
+	return ((void *)notes->histograms) + (notes->sizeof_sym_hist * idx);
+}
+
 static inline struct annotation *symbol__annotation(struct symbol *sym)
 {
 	struct sannotation *a = container_of(sym, struct sannotation, symbol);
 	return &a->annotation;
 }
 
-int symbol__inc_addr_samples(struct symbol *sym, struct map *map, u64 addr);
+int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
+			     int evidx, u64 addr);
+int symbol__alloc_hist(struct symbol *sym, int nevents);
 
 int symbol__annotate(struct symbol *sym, struct map *map,
 		     struct list_head *head, size_t privsize);
 
-int symbol__tty_annotate(struct symbol *sym, struct map *map,
+int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 			 bool print_lines, bool full_paths);
 
 #ifdef NO_NEWT_SUPPORT
 static inline int symbol__tui_annotate(symbol *sym __used,
-				       struct map *map __used)
+				       struct map *map __used, int evidx __used)
 {
 	return 0;
 }
 #else
-int symbol__tui_annotate(struct symbol *sym, struct map *map);
+int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx);
 #endif
 
 #endif	/* __PERF_ANNOTATE_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 6d9c92c..bac5ab6 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -950,9 +950,9 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread)
 	}
 }
 
-int hist_entry__inc_addr_samples(struct hist_entry *he, u64 ip)
+int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 ip)
 {
-	return symbol__inc_addr_samples(he->ms.sym, he->ms.map, ip);
+	return symbol__inc_addr_samples(he->ms.sym, he->ms.map, evidx, ip);
 }
 
 int hist_entry__annotate(struct hist_entry *he, struct list_head *head,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 8a201f7..2c6cdae 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -77,7 +77,7 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp);
 size_t hists__fprintf(struct hists *self, struct hists *pair,
 		      bool show_displacement, FILE *fp);
 
-int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip);
+int hist_entry__inc_addr_samples(struct hist_entry *self, int evidx, u64 addr);
 int hist_entry__annotate(struct hist_entry *self, struct list_head *head,
 			 size_t privsize);
 
@@ -91,18 +91,20 @@ bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len);
 #ifdef NO_NEWT_SUPPORT
 static inline int hists__browse(struct hists *self __used,
 				const char *helpline __used,
-				const char *ev_name __used)
+				const char *ev_name __used, int evidx __used)
 {
 	return 0;
 }
 
 static inline int hists__tui_browse_tree(struct rb_root *self __used,
-					 const char *help __used)
+					 const char *help __used,
+					 int evidx __used)
 {
 	return 0;
 }
 
-static inline int hist_entry__tui_annotate(struct hist_entry *self __used)
+static inline int hist_entry__tui_annotate(struct hist_entry *self __used,
+					   int evidx __used)
 {
 	return 0;
 }
@@ -111,13 +113,13 @@ static inline int hist_entry__tui_annotate(struct hist_entry *self __used)
 #else
 #include <newt.h>
 int hists__browse(struct hists *self, const char *helpline,
-		  const char *ev_name);
-int hist_entry__tui_annotate(struct hist_entry *self);
+		  const char *ev_name, int evidx);
+int hist_entry__tui_annotate(struct hist_entry *self, int evidx);
 
 #define KEY_LEFT NEWT_KEY_LEFT
 #define KEY_RIGHT NEWT_KEY_RIGHT
 
-int hists__tui_browse_tree(struct rb_root *self, const char *help);
+int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx);
 #endif
 
 unsigned int hists__sort_list_width(struct hists *self);
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c
index daa7138..8d8a168 100644
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -61,7 +61,7 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro
 
 static double objdump_line__calc_percent(struct objdump_line *self,
 					 struct list_head *head,
-					 struct symbol *sym)
+					 struct symbol *sym, int evidx)
 {
 	double percent = 0.0;
 
@@ -70,7 +70,7 @@ static double objdump_line__calc_percent(struct objdump_line *self,
 		unsigned int hits = 0;
 		struct annotation *notes = symbol__annotation(sym);
 		struct source_line *src_line = notes->src_line;
-		struct sym_hist *h = notes->histogram;
+		struct sym_hist *h = annotation__histogram(notes, evidx);
 		s64 offset = self->offset;
 		struct objdump_line *next = objdump__get_next_ip_line(head, self);
 
@@ -183,12 +183,12 @@ out:
 	return key;
 }
 
-int hist_entry__tui_annotate(struct hist_entry *he)
+int hist_entry__tui_annotate(struct hist_entry *he, int evidx)
 {
-	return symbol__tui_annotate(he->ms.sym, he->ms.map);
+	return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx);
 }
 
-int symbol__tui_annotate(struct symbol *sym, struct map *map)
+int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx)
 {
 	struct objdump_line *pos, *n;
 	struct objdump_line_rb_node *rbpos;
@@ -223,7 +223,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map)
 			browser.b.width = line_len;
 		rbpos = objdump_line__rb(pos);
 		rbpos->idx = browser.b.nr_entries++;
-		rbpos->percent = objdump_line__calc_percent(pos, &head, sym);
+		rbpos->percent = objdump_line__calc_percent(pos, &head, sym, evidx);
 		if (rbpos->percent < 0.01)
 			continue;
 		objdump__insert_line(&browser.entries, rbpos);
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index 8642823..294b495 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -797,7 +797,8 @@ static int hists__browser_title(struct hists *self, char *bf, size_t size,
 	return printed;
 }
 
-int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
+int hists__browse(struct hists *self, const char *helpline,
+		  const char *ev_name, int evidx)
 {
 	struct hist_browser *browser = hist_browser__new(self);
 	struct pstack *fstack;
@@ -935,7 +936,7 @@ do_annotate:
 			if (he == NULL)
 				continue;
 
-			hist_entry__tui_annotate(he);
+			hist_entry__tui_annotate(he, evidx);
 		} else if (choice == browse_map)
 			map__browse(browser->selection->map);
 		else if (choice == zoom_dso) {
@@ -984,7 +985,7 @@ out:
 	return key;
 }
 
-int hists__tui_browse_tree(struct rb_root *self, const char *help)
+int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx)
 {
 	struct rb_node *first = rb_first(self), *nd = first, *next;
 	int key = 0;
@@ -993,7 +994,7 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help)
 		struct hists *hists = rb_entry(nd, struct hists, rb_node);
 		const char *ev_name = __event_name(hists->type, hists->config);
 
-		key = hists__browse(hists, help, ev_name);
+		key = hists__browse(hists, help, ev_name, evidx);
 		switch (key) {
 		case NEWT_KEY_TAB:
 			next = rb_next(nd);
-- 
cgit v0.10.2


From d040bd363824f9f0ad6610b91ee6c65f292c066c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 5 Feb 2011 15:37:31 -0200
Subject: perf annotate: Config options for symbol__tty_annotate

Max line# that should be printed, minimum percentage filter, just like
'perf top', alas, due to it :-)

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index f3e4423..ea6a116 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -95,7 +95,7 @@ static int process_sample_event(union perf_event *event,
 static int hist_entry__tty_annotate(struct hist_entry *he, int evidx)
 {
 	return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx,
-				    print_line, full_paths);
+				    print_line, full_paths, 0, 0);
 }
 
 static void hists__find_annotations(struct hists *self)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 7488fe9..072bc8d 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -87,7 +87,7 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
 
 static void objdump_line__print(struct objdump_line *oline,
 				struct list_head *head, struct symbol *sym,
-				int evidx, u64 len)
+				int evidx, u64 len, int min_pcnt)
 {
 	static const char *prev_line;
 	static const char *prev_color;
@@ -118,6 +118,9 @@ static void objdump_line__print(struct objdump_line *oline,
 		if (src_line == NULL && h->sum)
 			percent = 100.0 * hits / h->sum;
 
+		if (percent < min_pcnt)
+			return;
+
 		color = get_percent_color(percent);
 
 		/*
@@ -419,13 +422,15 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx)
 }
 
 int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
-			 bool print_lines, bool full_paths)
+			 bool print_lines, bool full_paths, int min_pcnt,
+			 int max_lines)
 {
 	struct dso *dso = map->dso;
 	const char *filename = dso->long_name, *d_filename;
 	struct rb_root source_line = RB_ROOT;
 	struct objdump_line *pos, *n;
 	LIST_HEAD(head);
+	int printed = 2;
 	u64 len;
 
 	if (symbol__annotate(sym, map, &head, 0) < 0)
@@ -444,7 +449,6 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 		print_summary(&source_line, filename);
 	}
 
-	printf("\n\n------------------------------------------------\n");
 	printf(" Percent |	Source code & Disassembly of %s\n", d_filename);
 	printf("------------------------------------------------\n");
 
@@ -452,9 +456,11 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 		symbol__annotate_hits(sym, evidx);
 
 	list_for_each_entry_safe(pos, n, &head, node) {
-		objdump_line__print(pos, &head, sym, evidx, len);
+		objdump_line__print(pos, &head, sym, evidx, len, min_pcnt);
 		list_del(&pos->node);
 		objdump_line__free(pos);
+		if (max_lines && ++printed >= max_lines)
+			break;
 	}
 
 	if (print_lines)
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 0a5069ca..6b70732 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -69,7 +69,8 @@ int symbol__annotate(struct symbol *sym, struct map *map,
 		     struct list_head *head, size_t privsize);
 
 int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
-			 bool print_lines, bool full_paths);
+			 bool print_lines, bool full_paths, int min_pcnt,
+			 int max_lines);
 
 #ifdef NO_NEWT_SUPPORT
 static inline int symbol__tui_annotate(symbol *sym __used,
-- 
cgit v0.10.2


From f1e2701de02cff6d988b1dd49960620d5720cb89 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 5 Feb 2011 18:51:38 -0200
Subject: perf annotate: Separate objdump parsing from actual screen rendering

Because in 'perf top' we'll need to parse just once and then, as samples
come, render multiple times with evolving counter values.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 072bc8d..10cdbad 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -421,21 +421,16 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx)
 	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum);
 }
 
-int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
-			 bool print_lines, bool full_paths, int min_pcnt,
-			 int max_lines)
+void symbol__annotate_printf(struct symbol *sym, struct map *map,
+			     struct list_head *head, int evidx, bool full_paths,
+			     int min_pcnt, int max_lines)
 {
 	struct dso *dso = map->dso;
 	const char *filename = dso->long_name, *d_filename;
-	struct rb_root source_line = RB_ROOT;
-	struct objdump_line *pos, *n;
-	LIST_HEAD(head);
+	struct objdump_line *pos;
 	int printed = 2;
 	u64 len;
 
-	if (symbol__annotate(sym, map, &head, 0) < 0)
-		return -1;
-
 	if (full_paths)
 		d_filename = filename;
 	else
@@ -443,28 +438,57 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 
 	len = sym->end - sym->start;
 
-	if (print_lines) {
-		symbol__get_source_line(sym, map, evidx, &source_line,
-					len, filename);
-		print_summary(&source_line, filename);
-	}
-
 	printf(" Percent |	Source code & Disassembly of %s\n", d_filename);
 	printf("------------------------------------------------\n");
 
 	if (verbose)
 		symbol__annotate_hits(sym, evidx);
 
-	list_for_each_entry_safe(pos, n, &head, node) {
-		objdump_line__print(pos, &head, sym, evidx, len, min_pcnt);
-		list_del(&pos->node);
-		objdump_line__free(pos);
+	list_for_each_entry(pos, head, node) {
+		objdump_line__print(pos, head, sym, evidx, len, min_pcnt);
 		if (max_lines && ++printed >= max_lines)
 			break;
+
+	}
+}
+
+void objdump_line_list__purge(struct list_head *head)
+{
+	struct objdump_line *pos, *n;
+
+	list_for_each_entry_safe(pos, n, head, node) {
+		list_del(&pos->node);
+		objdump_line__free(pos);
+	}
+}
+
+int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
+			 bool print_lines, bool full_paths, int min_pcnt,
+			 int max_lines)
+{
+	struct dso *dso = map->dso;
+	const char *filename = dso->long_name;
+	struct rb_root source_line = RB_ROOT;
+	LIST_HEAD(head);
+	u64 len;
+
+	if (symbol__annotate(sym, map, &head, 0) < 0)
+		return -1;
+
+	len = sym->end - sym->start;
+
+	if (print_lines) {
+		symbol__get_source_line(sym, map, evidx, &source_line,
+					len, filename);
+		print_summary(&source_line, filename);
 	}
 
+	symbol__annotate_printf(sym, map, &head, evidx, full_paths,
+				min_pcnt, max_lines);
 	if (print_lines)
 		symbol__free_source_line(sym, len);
 
+	objdump_line_list__purge(&head);
+
 	return 0;
 }
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 6b70732..53dd92d 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -67,6 +67,10 @@ int symbol__alloc_hist(struct symbol *sym, int nevents);
 
 int symbol__annotate(struct symbol *sym, struct map *map,
 		     struct list_head *head, size_t privsize);
+void symbol__annotate_printf(struct symbol *sym, struct map *map,
+			     struct list_head *head, int evidx, bool full_paths,
+			     int min_pcnt, int max_lines);
+void objdump_line_list__purge(struct list_head *head);
 
 int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 			 bool print_lines, bool full_paths, int min_pcnt,
-- 
cgit v0.10.2


From 36532461a0f60bb36c5470a0326f7394f19db23c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 6 Feb 2011 14:54:44 -0200
Subject: perf top: Ditch private annotation code, share perf annotate's

Next step: Live TUI annotation in perf top, just press enter on a symbol
line.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 154e088..716118a 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -20,6 +20,7 @@
 
 #include "perf.h"
 
+#include "util/annotate.h"
 #include "util/cache.h"
 #include "util/color.h"
 #include "util/evlist.h"
@@ -140,10 +141,7 @@ static int parse_source(struct sym_entry *syme)
 	struct symbol *sym;
 	struct sym_entry_source *source;
 	struct map *map;
-	FILE *file;
-	char command[PATH_MAX*2];
-	const char *path;
-	u64 len;
+	int err = -1;
 
 	if (!syme)
 		return -1;
@@ -162,197 +160,80 @@ static int parse_source(struct sym_entry *syme)
 		if (syme->src == NULL)
 			return -1;
 		pthread_mutex_init(&syme->src->lock, NULL);
+		INIT_LIST_HEAD(&syme->src->head);
 	}
 
 	source = syme->src;
 
-	if (source->lines) {
+	if (symbol__annotation(sym)->histograms != NULL) {
 		pthread_mutex_lock(&source->lock);
 		goto out_assign;
 	}
-	path = map->dso->long_name;
-
-	len = sym->end - sym->start;
-
-	sprintf(command,
-		"objdump --start-address=%#0*" PRIx64 " --stop-address=%#0*" PRIx64 " -dS %s",
-		BITS_PER_LONG / 4, map__rip_2objdump(map, sym->start),
-		BITS_PER_LONG / 4, map__rip_2objdump(map, sym->end), path);
-
-	file = popen(command, "r");
-	if (!file)
-		return -1;
 
 	pthread_mutex_lock(&source->lock);
-	source->lines_tail = &source->lines;
-	while (!feof(file)) {
-		struct source_line *src;
-		size_t dummy = 0;
-		char *c, *sep;
-
-		src = malloc(sizeof(struct source_line));
-		assert(src != NULL);
-		memset(src, 0, sizeof(struct source_line));
-
-		if (getline(&src->line, &dummy, file) < 0)
-			break;
-		if (!src->line)
-			break;
-
-		c = strchr(src->line, '\n');
-		if (c)
-			*c = 0;
 
-		src->next = NULL;
-		*source->lines_tail = src;
-		source->lines_tail = &src->next;
-
-		src->eip = strtoull(src->line, &sep, 16);
-		if (*sep == ':')
-			src->eip = map__objdump_2ip(map, src->eip);
-		else /* this line has no ip info (e.g. source line) */
-			src->eip = 0;
+	if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) {
+		pr_err("Not enough memory for annotating '%s' symbol!\n",
+		       sym->name);
+		goto out_unlock;
 	}
-	pclose(file);
+
+	err = symbol__annotate(sym, syme->map, &source->head, 0);
+	if (err == 0) {
 out_assign:
 	sym_filter_entry = syme;
+	}
+out_unlock:
 	pthread_mutex_unlock(&source->lock);
-	return 0;
+	return err;
 }
 
 static void __zero_source_counters(struct sym_entry *syme)
 {
-	int i;
-	struct source_line *line;
-
-	line = syme->src->lines;
-	while (line) {
-		for (i = 0; i < top.evlist->nr_entries; i++)
-			line->count[i] = 0;
-		line = line->next;
-	}
+	struct symbol *sym = sym_entry__symbol(syme);
+	symbol__annotate_zero_histograms(sym);
 }
 
 static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip)
 {
-	struct source_line *line;
-
 	if (syme != sym_filter_entry)
 		return;
 
 	if (pthread_mutex_trylock(&syme->src->lock))
 		return;
 
-	if (syme->src == NULL || syme->src->source == NULL)
-		goto out_unlock;
-
-	for (line = syme->src->lines; line; line = line->next) {
-		/* skip lines without IP info */
-		if (line->eip == 0)
-			continue;
-		if (line->eip == ip) {
-			line->count[counter]++;
-			break;
-		}
-		if (line->eip > ip)
-			break;
-	}
-out_unlock:
-	pthread_mutex_unlock(&syme->src->lock);
-}
-
-#define PATTERN_LEN		(BITS_PER_LONG / 4 + 2)
+	ip = syme->map->map_ip(syme->map, ip);
+	symbol__inc_addr_samples(sym_entry__symbol(syme), syme->map, counter, ip);
 
-static void lookup_sym_source(struct sym_entry *syme)
-{
-	struct symbol *symbol = sym_entry__symbol(syme);
-	struct source_line *line;
-	char pattern[PATTERN_LEN + 1];
-
-	sprintf(pattern, "%0*" PRIx64 " <", BITS_PER_LONG / 4,
-		map__rip_2objdump(syme->map, symbol->start));
-
-	pthread_mutex_lock(&syme->src->lock);
-	for (line = syme->src->lines; line; line = line->next) {
-		if (memcmp(line->line, pattern, PATTERN_LEN) == 0) {
-			syme->src->source = line;
-			break;
-		}
-	}
 	pthread_mutex_unlock(&syme->src->lock);
 }
 
-static void show_lines(struct source_line *queue, int count, int total)
-{
-	int i;
-	struct source_line *line;
-
-	line = queue;
-	for (i = 0; i < count; i++) {
-		float pcnt = 100.0*(float)line->count[top.sym_counter]/(float)total;
-
-		printf("%8li %4.1f%%\t%s\n", line->count[top.sym_counter], pcnt, line->line);
-		line = line->next;
-	}
-}
-
-#define TRACE_COUNT     3
-
 static void show_details(struct sym_entry *syme)
 {
 	struct symbol *symbol;
-	struct source_line *line;
-	struct source_line *line_queue = NULL;
-	int displayed = 0;
-	int line_queue_count = 0, total = 0, more = 0;
+	int more;
 
 	if (!syme)
 		return;
 
-	if (!syme->src->source)
-		lookup_sym_source(syme);
-
-	if (!syme->src->source)
+	symbol = sym_entry__symbol(syme);
+	if (!syme->src || symbol__annotation(symbol)->histograms == NULL)
 		return;
 
-	symbol = sym_entry__symbol(syme);
 	printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name);
 	printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);
 
 	pthread_mutex_lock(&syme->src->lock);
-	line = syme->src->source;
-	while (line) {
-		total += line->count[top.sym_counter];
-		line = line->next;
-	}
-
-	line = syme->src->source;
-	while (line) {
-		float pcnt = 0.0;
-
-		if (!line_queue_count)
-			line_queue = line;
-		line_queue_count++;
-
-		if (line->count[top.sym_counter])
-			pcnt = 100.0 * line->count[top.sym_counter] / (float)total;
-		if (pcnt >= (float)sym_pcnt_filter) {
-			if (displayed <= top.print_entries)
-				show_lines(line_queue, line_queue_count, total);
-			else more++;
-			displayed += line_queue_count;
-			line_queue_count = 0;
-			line_queue = NULL;
-		} else if (line_queue_count > TRACE_COUNT) {
-			line_queue = line_queue->next;
-			line_queue_count--;
-		}
-
-		line->count[top.sym_counter] = top.zero ? 0 : line->count[top.sym_counter] * 7 / 8;
-		line = line->next;
-	}
+	more = symbol__annotate_printf(symbol, syme->map, &syme->src->head,
+				       top.sym_evsel->idx, 0, sym_pcnt_filter,
+				       top.print_entries);
+	if (top.zero)
+		symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx);
+	else
+		symbol__annotate_decay_histogram(symbol, &syme->src->head,
+						 top.sym_evsel->idx);
 	pthread_mutex_unlock(&syme->src->lock);
-	if (more)
+	if (more != 0)
 		printf("%d lines not displayed, maybe increase display entries [e]\n", more);
 }
 
@@ -1172,7 +1053,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 
 	top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
 
-	symbol_conf.priv_size = (sizeof(struct sym_entry) +
+	symbol_conf.priv_size = (sizeof(struct sym_entry) + sizeof(struct annotation) +
 				 (top.evlist->nr_entries + 1) * sizeof(unsigned long));
 
 	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 10cdbad..2973376 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -22,9 +22,19 @@ int symbol__alloc_hist(struct symbol *sym, int nevents)
 	notes->sizeof_sym_hist = (sizeof(*notes->histograms) +
 				  (sym->end - sym->start) * sizeof(u64));
 	notes->histograms = calloc(nevents, notes->sizeof_sym_hist);
+	notes->nr_histograms = nevents;
 	return notes->histograms == NULL ? -1 : 0;
 }
 
+void symbol__annotate_zero_histograms(struct symbol *sym)
+{
+	struct annotation *notes = symbol__annotation(sym);
+
+	if (notes->histograms != NULL)
+		memset(notes->histograms, 0,
+		       notes->nr_histograms * notes->sizeof_sym_hist);
+}
+
 int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 			     int evidx, u64 addr)
 {
@@ -85,9 +95,10 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
 	return NULL;
 }
 
-static void objdump_line__print(struct objdump_line *oline,
-				struct list_head *head, struct symbol *sym,
-				int evidx, u64 len, int min_pcnt)
+static int objdump_line__print(struct objdump_line *oline,
+			       struct list_head *head, struct symbol *sym,
+			       int evidx, u64 len, int min_pcnt,
+			       int printed, int max_lines)
 {
 	static const char *prev_line;
 	static const char *prev_color;
@@ -119,7 +130,10 @@ static void objdump_line__print(struct objdump_line *oline,
 			percent = 100.0 * hits / h->sum;
 
 		if (percent < min_pcnt)
-			return;
+			return -1;
+
+		if (printed >= max_lines)
+			return 1;
 
 		color = get_percent_color(percent);
 
@@ -140,12 +154,16 @@ static void objdump_line__print(struct objdump_line *oline,
 		color_fprintf(stdout, color, " %7.2f", percent);
 		printf(" :	");
 		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", oline->line);
-	} else {
+	} else if (printed >= max_lines)
+		return 1;
+	else {
 		if (!*oline->line)
 			printf("         :\n");
 		else
 			printf("         :	%s\n", oline->line);
 	}
+
+	return 0;
 }
 
 static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, FILE *file,
@@ -421,14 +439,15 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx)
 	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum);
 }
 
-void symbol__annotate_printf(struct symbol *sym, struct map *map,
-			     struct list_head *head, int evidx, bool full_paths,
-			     int min_pcnt, int max_lines)
+int symbol__annotate_printf(struct symbol *sym, struct map *map,
+			    struct list_head *head, int evidx, bool full_paths,
+			    int min_pcnt, int max_lines)
 {
 	struct dso *dso = map->dso;
 	const char *filename = dso->long_name, *d_filename;
 	struct objdump_line *pos;
 	int printed = 2;
+	int more = 0;
 	u64 len;
 
 	if (full_paths)
@@ -445,10 +464,47 @@ void symbol__annotate_printf(struct symbol *sym, struct map *map,
 		symbol__annotate_hits(sym, evidx);
 
 	list_for_each_entry(pos, head, node) {
-		objdump_line__print(pos, head, sym, evidx, len, min_pcnt);
-		if (max_lines && ++printed >= max_lines)
+		switch (objdump_line__print(pos, head, sym, evidx, len, min_pcnt,
+					    printed, max_lines)) {
+		case 0:
+			++printed;
+			break;
+		case 1:
+			/* filtered by max_lines */
+			++more;
 			break;
+		case -1:
+		default:
+			/* filtered by min_pcnt */
+			break;
+		}
+	}
+
+	return more;
+}
 
+void symbol__annotate_zero_histogram(struct symbol *sym, int evidx)
+{
+	struct annotation *notes = symbol__annotation(sym);
+	struct sym_hist *h = annotation__histogram(notes, evidx);
+
+	memset(h, 0, notes->sizeof_sym_hist);
+}
+
+void symbol__annotate_decay_histogram(struct symbol *sym,
+				      struct list_head *head, int evidx)
+{
+	struct annotation *notes = symbol__annotation(sym);
+	struct sym_hist *h = annotation__histogram(notes, evidx);
+	struct objdump_line *pos;
+
+	h->sum = 0;
+
+	list_for_each_entry(pos, head, node) {
+		if (pos->offset != -1) {
+			h->addr[pos->offset] = h->addr[pos->offset] * 7 / 8;
+			h->sum += h->addr[pos->offset];
+		}
 	}
 }
 
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 53dd92d..b1253aa 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -42,6 +42,7 @@ struct source_line {
 struct annotation {
 	struct source_line *src_line;
 	struct sym_hist	   *histograms;
+	int    		   nr_histograms;
 	int    		   sizeof_sym_hist;
 };
 
@@ -64,12 +65,16 @@ static inline struct annotation *symbol__annotation(struct symbol *sym)
 int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 			     int evidx, u64 addr);
 int symbol__alloc_hist(struct symbol *sym, int nevents);
+void symbol__annotate_zero_histograms(struct symbol *sym);
 
 int symbol__annotate(struct symbol *sym, struct map *map,
 		     struct list_head *head, size_t privsize);
-void symbol__annotate_printf(struct symbol *sym, struct map *map,
-			     struct list_head *head, int evidx, bool full_paths,
-			     int min_pcnt, int max_lines);
+int symbol__annotate_printf(struct symbol *sym, struct map *map,
+			    struct list_head *head, int evidx, bool full_paths,
+			    int min_pcnt, int max_lines);
+void symbol__annotate_zero_histogram(struct symbol *sym, int evidx);
+void symbol__annotate_decay_histogram(struct symbol *sym,
+				      struct list_head *head, int evidx);
 void objdump_line_list__purge(struct list_head *head);
 
 int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index 5009508..fe44afb 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -11,17 +11,8 @@
 struct perf_evlist;
 struct perf_evsel;
 
-struct source_line {
-	u64			eip;
-	unsigned long		count[MAX_COUNTERS]; /* FIXME */
-	char			*line;
-	struct source_line	*next;
-};
-
 struct sym_entry_source {
-	struct source_line	*source;
-	struct source_line	*lines;
-	struct source_line	**lines_tail;
+	struct list_head	head;
 	pthread_mutex_t		lock;
 };
 
-- 
cgit v0.10.2


From 9c56dfeb784a586713f467e2028a127a2a58a238 Mon Sep 17 00:00:00 2001
From: Michael Witten <mfwitten@gmail.com>
Date: Thu, 3 Feb 2011 22:10:55 -0600
Subject: perf tools: Makefile: Use $(QUIET_GEN) for perf.so

So that we get this:

    CC /home/acme/git/build/perf/bench/mem-memcpy-x86-64-asm.o
    GEN perf-archive
*   GEN /home/acme/git/build/perf/python/perf.so
    CC /home/acme/git/build/perf/builtin-annotate.o

Instead of silently building the python binding.

LKML-Reference: <1296890359-22659-1-git-send-email-mfwitten@gmail.com>
Signed-off-by: Michael Witten <mfwitten@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index be3eb1d..94f73ab 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -326,7 +326,7 @@ grep-libs = $(filter -l%,$(1))
 strip-libs = $(filter-out -l%,$(1))
 
 $(OUTPUT)python/perf.so: $(PYRF_OBJS)
-	@python util/setup.py --quiet  build_ext --build-lib='$(OUTPUT)python' \
+	$(QUIET_GEN)python util/setup.py --quiet  build_ext --build-lib='$(OUTPUT)python' \
 						--build-temp='$(OUTPUT)python/temp'
 #
 # No Perl scripts right now:
-- 
cgit v0.10.2


From ef4d001d79ac4bab6c2d81e9986a42059f877ec3 Mon Sep 17 00:00:00 2001
From: Denis Kirjanov <dkirjanov@kernel.org>
Date: Sat, 5 Feb 2011 20:39:38 +0000
Subject: perf top: Use pid_t for target_{pid|tid}

Use pid_t data type for target_{pid|tid} vars.

Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20110205203938.GA15328@hera.kernel.org>
Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
[ committer note: those variables are now in struct perf_top, fixed ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index fe44afb..62e3293 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -46,8 +46,8 @@ struct perf_top {
 	u64		   exact_samples;
 	u64		   guest_us_samples, guest_kernel_samples;
 	int		   print_entries, count_filter, delay_secs;
-	int		   display_weighted, freq, rb_entries;
-	int		   sym_counter, target_pid, target_tid;
+	int		   display_weighted, freq, rb_entries, sym_counter;
+	pid_t		   target_pid, target_tid;
 	bool		   hide_kernel_symbols, hide_user_symbols, zero;
 	const char	   *cpu_list;
 	struct perf_evsel  *sym_evsel;
-- 
cgit v0.10.2


From f50c2169bd054984e976e67e8651d28f3caf6ba3 Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <fbuihuu@gmail.com>
Date: Thu, 13 Jan 2011 11:18:30 +0100
Subject: perf probe: Rewrite find_lazy_match_lines() by using getline(3)

Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: lkml <linux-kernel@vger.kernel.org>
LKML-Reference: <m3d3o185u1.fsf@gmail.com>
Signed-off-by: Franck Bui-Huu <fbuihuu@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 69215bf..46addfb 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1234,51 +1234,38 @@ static int find_probe_point_by_line(struct probe_finder *pf)
 static int find_lazy_match_lines(struct list_head *head,
 				 const char *fname, const char *pat)
 {
-	char *fbuf, *p1, *p2;
-	int fd, line, nlines = -1;
-	struct stat st;
-
-	fd = open(fname, O_RDONLY);
-	if (fd < 0) {
-		pr_warning("Failed to open %s: %s\n", fname, strerror(-fd));
+	FILE *fp;
+	char *line = NULL;
+	size_t line_len;
+	ssize_t len;
+	int count = 0, linenum = 1;
+
+	fp = fopen(fname, "r");
+	if (!fp) {
+		pr_warning("Failed to open %s: %s\n", fname, strerror(errno));
 		return -errno;
 	}
 
-	if (fstat(fd, &st) < 0) {
-		pr_warning("Failed to get the size of %s: %s\n",
-			   fname, strerror(errno));
-		nlines = -errno;
-		goto out_close;
-	}
-
-	nlines = -ENOMEM;
-	fbuf = malloc(st.st_size + 2);
-	if (fbuf == NULL)
-		goto out_close;
-	if (read(fd, fbuf, st.st_size) < 0) {
-		pr_warning("Failed to read %s: %s\n", fname, strerror(errno));
-		nlines = -errno;
-		goto out_free_fbuf;
-	}
-	fbuf[st.st_size] = '\n';	/* Dummy line */
-	fbuf[st.st_size + 1] = '\0';
-	p1 = fbuf;
-	line = 1;
-	nlines = 0;
-	while ((p2 = strchr(p1, '\n')) != NULL) {
-		*p2 = '\0';
-		if (strlazymatch(p1, pat)) {
-			line_list__add_line(head, line);
-			nlines++;
+	while ((len = getline(&line, &line_len, fp)) > 0) {
+
+		if (line[len - 1] == '\n')
+			line[len - 1] = '\0';
+
+		if (strlazymatch(line, pat)) {
+			line_list__add_line(head, linenum);
+			count++;
 		}
-		line++;
-		p1 = p2 + 1;
+		linenum++;
 	}
-out_free_fbuf:
-	free(fbuf);
-out_close:
-	close(fd);
-	return nlines;
+
+	if (ferror(fp))
+		count = -errno;
+	free(line);
+	fclose(fp);
+
+	if (count == 0)
+		pr_debug("No matched lines found in %s.\n", fname);
+	return count;
 }
 
 static int probe_point_lazy_walker(const char *fname, int lineno,
@@ -1312,10 +1299,7 @@ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 		/* Matching lazy line pattern */
 		ret = find_lazy_match_lines(&pf->lcache, pf->fname,
 					    pf->pev->point.lazy_line);
-		if (ret == 0) {
-			pr_debug("No matched lines found in %s.\n", pf->fname);
-			return 0;
-		} else if (ret < 0)
+		if (ret <= 0)
 			return ret;
 	}
 
-- 
cgit v0.10.2


From 76022db323dd6d7c6958df3d595f7dedf7a14778 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Fri, 4 Feb 2011 21:51:53 +0900
Subject: tracing/kprobes: Cleanup strict_strtol() using code

Since strict_strtol() accepts minus digits started with '-', it doesn't
need to invert after converting.

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110204125153.9507.49335.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bc..2088893 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -767,16 +767,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
 		}
 		break;
 	case '+':	/* deref memory */
+		arg++;	/* Skip '+', because strict_strtol() rejects it. */
 	case '-':
 		tmp = strchr(arg, '(');
 		if (!tmp)
 			break;
 		*tmp = '\0';
-		ret = strict_strtol(arg + 1, 0, &offset);
+		ret = strict_strtol(arg, 0, &offset);
 		if (ret)
 			break;
-		if (arg[0] == '-')
-			offset = -offset;
 		arg = tmp + 1;
 		tmp = strrchr(arg, ')');
 		if (tmp) {
-- 
cgit v0.10.2


From e3745369986ddcdaa19f70e2d24e658876b97e84 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Fri, 4 Feb 2011 21:51:59 +0900
Subject: tracing/kprobes: Support longer (>128 bytes) command

Expand command line buffer of kprobe-tracer to 4096 bytes.

Reported-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110204125159.9507.20895.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2088893..c6ed886 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1129,7 +1129,7 @@ static int command_trace_probe(const char *buf)
 	return ret;
 }
 
-#define WRITE_BUFSIZE 128
+#define WRITE_BUFSIZE 4096
 
 static ssize_t probes_write(struct file *file, const char __user *buffer,
 			    size_t count, loff_t *ppos)
-- 
cgit v0.10.2


From 1ff511e35ed87cc2ebade9e678e4a2fe39b6f9c5 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Fri, 4 Feb 2011 21:52:05 +0900
Subject: tracing/kprobes: Add bitfield type

Add bitfield type for tracing arguments on kprobe-tracer.  The syntax of
a bitfield type is:

 b<bit-size>@<bit-offset>/<container-size>

e.g.

Accessing 2 bits-width field with 4 bits-offset in 32 bits-width data at
4 bytes offseted from the address pointed by AX register:

 +4(%ax):b2@4/32

Since the width of container data depends on the arch, so I just added
the container-size at the end.

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110204125205.9507.11363.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 5f77d94..6d27ab8 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -42,11 +42,25 @@ Synopsis of kprobe_events
   +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
-		  (u8/u16/u32/u64/s8/s16/s32/s64) and string are supported.
+		  (u8/u16/u32/u64/s8/s16/s32/s64), "string" and bitfield
+		  are supported.
 
   (*) only for return probe.
   (**) this is useful for fetching a field of data structures.
 
+Types
+-----
+Several types are supported for fetch-args. Kprobe tracer will access memory
+by given type. Prefix 's' and 'u' means those types are signed and unsigned
+respectively. Traced arguments are shown in decimal (signed) or hex (unsigned).
+String type is a special type, which fetches a "null-terminated" string from
+kernel space. This means it will fail and store NULL if the string container
+has been paged out.
+Bitfield is another special type, which takes 3 parameters, bit-width, bit-
+offset, and container-size (usually 32). The syntax is;
+
+ b<bit-width>@<bit-offset>/<container-size>
+
 
 Per-Probe Event Filtering
 -------------------------
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c6ed886..ccdc542 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
 	kfree(data);
 }
 
+/* Bitfield fetch function */
+struct bitfield_fetch_param {
+	struct fetch_param orig;
+	unsigned char hi_shift;
+	unsigned char low_shift;
+};
+
+#define DEFINE_FETCH_bitfield(type)					\
+static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
+					    void *data, void *dest)	\
+{									\
+	struct bitfield_fetch_param *bprm = data;			\
+	type buf = 0;							\
+	call_fetch(&bprm->orig, regs, &buf);				\
+	if (buf) {							\
+		buf <<= bprm->hi_shift;					\
+		buf >>= bprm->low_shift;				\
+	}								\
+	*(type *)dest = buf;						\
+}
+DEFINE_BASIC_FETCH_FUNCS(bitfield)
+#define fetch_bitfield_string NULL
+#define fetch_bitfield_string_size NULL
+
+static __kprobes void
+free_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+	/*
+	 * Don't check the bitfield itself, because this must be the
+	 * last fetch function.
+	 */
+	if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+		free_deref_fetch_param(data->orig.data);
+	else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+		free_symbol_cache(data->orig.data);
+	kfree(data);
+}
 /* Default (unsigned long) fetch type */
 #define __DEFAULT_FETCH_TYPE(t) u##t
 #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -367,6 +404,7 @@ enum {
 	FETCH_MTD_memory,
 	FETCH_MTD_symbol,
 	FETCH_MTD_deref,
+	FETCH_MTD_bitfield,
 	FETCH_MTD_END,
 };
 
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype),			\
 ASSIGN_FETCH_FUNC(memory, ftype),			\
 ASSIGN_FETCH_FUNC(symbol, ftype),			\
 ASSIGN_FETCH_FUNC(deref, ftype),			\
+ASSIGN_FETCH_FUNC(bitfield, ftype),			\
 	  }						\
 	}
 
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
 	if (!type)
 		type = DEFAULT_FETCH_TYPE_STR;
 
+	/* Special case: bitfield */
+	if (*type == 'b') {
+		unsigned long bs;
+		type = strchr(type, '/');
+		if (!type)
+			goto fail;
+		type++;
+		if (strict_strtoul(type, 0, &bs))
+			goto fail;
+		switch (bs) {
+		case 8:
+			return find_fetch_type("u8");
+		case 16:
+			return find_fetch_type("u16");
+		case 32:
+			return find_fetch_type("u32");
+		case 64:
+			return find_fetch_type("u64");
+		default:
+			goto fail;
+		}
+	}
+
 	for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
 		if (strcmp(type, fetch_type_table[i].name) == 0)
 			return &fetch_type_table[i];
+fail:
 	return NULL;
 }
 
@@ -586,7 +649,9 @@ error:
 
 static void free_probe_arg(struct probe_arg *arg)
 {
-	if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+	if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+		free_bitfield_fetch_param(arg->fetch.data);
+	else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
 		free_deref_fetch_param(arg->fetch.data);
 	else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
 		free_symbol_cache(arg->fetch.data);
@@ -806,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
 	return ret;
 }
 
+#define BYTES_TO_BITS(nb)	((BITS_PER_LONG * (nb)) / sizeof(long))
+
+/* Bitfield type needs to be parsed into a fetch function */
+static int __parse_bitfield_probe_arg(const char *bf,
+				      const struct fetch_type *t,
+				      struct fetch_param *f)
+{
+	struct bitfield_fetch_param *bprm;
+	unsigned long bw, bo;
+	char *tail;
+
+	if (*bf != 'b')
+		return 0;
+
+	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+	if (!bprm)
+		return -ENOMEM;
+	bprm->orig = *f;
+	f->fn = t->fetch[FETCH_MTD_bitfield];
+	f->data = (void *)bprm;
+
+	bw = simple_strtoul(bf + 1, &tail, 0);	/* Use simple one */
+	if (bw == 0 || *tail != '@')
+		return -EINVAL;
+
+	bf = tail + 1;
+	bo = simple_strtoul(bf, &tail, 0);
+	if (tail == bf || *tail != '/')
+		return -EINVAL;
+
+	bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
+	bprm->low_shift = bprm->hi_shift + bo;
+	return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
+}
+
 /* String length checking wrapper */
 static int parse_probe_arg(char *arg, struct trace_probe *tp,
 			   struct probe_arg *parg, int is_return)
@@ -835,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
 	parg->offset = tp->size;
 	tp->size += parg->type->size;
 	ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
+	if (ret >= 0)
+		ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
 	if (ret >= 0) {
 		parg->fetch_size.fn = get_fetch_size_function(parg->type,
 							      parg->fetch.fn);
-- 
cgit v0.10.2


From fb7d0b3cefb80a105f7fd26bbc62e0cbf9192822 Mon Sep 17 00:00:00 2001
From: Kyle McMartin <kyle@mcmartin.ca>
Date: Mon, 24 Jan 2011 11:13:04 -0500
Subject: perf tool: Fix gcc 4.6.0 issues

GCC 4.6.0 in Fedora rawhide turned up some compile errors in tools/perf
due to the -Werror=unused-but-set-variable flag.

I've gone through and annotated some of the assignments that had side
effects (ie: return value from a function) with the __used annotation,
and in some cases, just removed unused code.

In a few cases, we were assigning something useful, but not using it in
later parts of the function.

kyle@dreadnought:~/src% gcc --version
gcc (GCC) 4.6.0 20110122 (Red Hat 4.6.0-0.3)

Cc: Ingo Molnar <mingo@redhat.com>
LKML-Reference: <20110124161304.GK27353@bombadil.infradead.org>
Signed-off-by: Kyle McMartin <kyle@redhat.com>
[ committer note: Fixed up the annotation fixes, as that code moved recently ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
index d9ab3ce..0c7454f 100644
--- a/tools/perf/bench/sched-pipe.c
+++ b/tools/perf/bench/sched-pipe.c
@@ -55,7 +55,7 @@ int bench_sched_pipe(int argc, const char **argv,
 	 * discarding returned value of read(), write()
 	 * causes error in building environment for perf
 	 */
-	int ret, wait_stat;
+	int __used ret, wait_stat;
 	pid_t pid, retpid;
 
 	argc = parse_options(argc, argv, options,
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ae26211..a32f411 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -369,11 +369,6 @@ static void
 process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
 {
 	int ret = 0;
-	u64 now;
-	long long delta;
-
-	now = get_nsecs();
-	delta = start_time + atom->timestamp - now;
 
 	switch (atom->type) {
 		case SCHED_EVENT_RUN:
@@ -562,7 +557,7 @@ static void wait_for_tasks(void)
 
 static void run_one_test(void)
 {
-	u64 T0, T1, delta, avg_delta, fluct, std_dev;
+	u64 T0, T1, delta, avg_delta, fluct;
 
 	T0 = get_nsecs();
 	wait_for_tasks();
@@ -578,7 +573,6 @@ static void run_one_test(void)
 	else
 		fluct = delta - avg_delta;
 	sum_fluct += fluct;
-	std_dev = sum_fluct / nr_runs / sqrt(nr_runs);
 	if (!run_avg)
 		run_avg = delta;
 	run_avg = (run_avg*9 + delta)/10;
@@ -799,7 +793,7 @@ replay_switch_event(struct trace_switch_event *switch_event,
 		    u64 timestamp,
 		    struct thread *thread __used)
 {
-	struct task_desc *prev, *next;
+	struct task_desc *prev, __used *next;
 	u64 timestamp0;
 	s64 delta;
 
@@ -1404,7 +1398,7 @@ map_switch_event(struct trace_switch_event *switch_event,
 		 u64 timestamp,
 		 struct thread *thread __used)
 {
-	struct thread *sched_out, *sched_in;
+	struct thread *sched_out __used, *sched_in;
 	int new_shortname;
 	u64 timestamp0;
 	s64 delta;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 716118a..b790673 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -865,7 +865,7 @@ static int __cmd_top(void)
 {
 	pthread_t thread;
 	struct perf_evsel *first;
-	int ret;
+	int ret __used;
 	/*
 	 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
 	 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 2973376..1012841 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -236,7 +236,6 @@ int symbol__annotate(struct symbol *sym, struct map *map,
 	char command[PATH_MAX * 2];
 	FILE *file;
 	int err = 0;
-	u64 len;
 	char symfs_filename[PATH_MAX];
 
 	if (filename) {
@@ -281,8 +280,6 @@ fallback:
 		 filename, sym->name, map->unmap_ip(map, sym->start),
 		 map->unmap_ip(map, sym->end));
 
-	len = sym->end - sym->start;
-
 	pr_debug("annotating [%p] %30s : [%p] %30s\n",
 		 dso, dso->long_name, sym, sym->name);
 
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index c0de5ec..72c124d 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1145,7 +1145,7 @@ int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
 {
 	union perf_event ev;
 	ssize_t size = 0, aligned_size = 0, padding;
-	int err = 0;
+	int err __used = 0;
 
 	memset(&ev, 0, sizeof(ev));
 
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index c6d9933..2040b85 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -248,8 +248,7 @@ static void python_process_event(int cpu, void *data,
 	context = PyCObject_FromVoidPtr(scripting_context, NULL);
 
 	PyTuple_SetItem(t, n++, PyString_FromString(handler_name));
-	PyTuple_SetItem(t, n++,
-			PyCObject_FromVoidPtr(scripting_context, NULL));
+	PyTuple_SetItem(t, n++, context);
 
 	if (handler) {
 		PyTuple_SetItem(t, n++, PyInt_FromLong(cpu));
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 7821d0e..3e193f8 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1525,8 +1525,8 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
 			     symbol_conf.symfs, self->long_name);
 			break;
 		case DSO__ORIG_GUEST_KMODULE:
-			if (map->groups && map->groups->machine)
-				root_dir = map->groups->machine->root_dir;
+			if (map->groups && machine)
+				root_dir = machine->root_dir;
 			else
 				root_dir = "";
 			snprintf(name, size, "%s%s%s", symbol_conf.symfs,
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 73a0222..d8e622d 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -153,7 +153,7 @@ void parse_proc_kallsyms(char *file, unsigned int size __unused)
 	char *next = NULL;
 	char *addr_str;
 	char ch;
-	int ret;
+	int ret __used;
 	int i;
 
 	line = strtok_r(file, "\n", &next);
diff --git a/tools/perf/util/ui/browsers/map.c b/tools/perf/util/ui/browsers/map.c
index e515836..8462bff 100644
--- a/tools/perf/util/ui/browsers/map.c
+++ b/tools/perf/util/ui/browsers/map.c
@@ -41,7 +41,7 @@ static int ui_entry__read(const char *title, char *bf, size_t size, int width)
 out_free_form:
 	newtPopWindow();
 	newtFormDestroy(form);
-	return 0;
+	return err;
 }
 
 struct map_browser {
-- 
cgit v0.10.2


From a2221796256ea7b236cec6bf027c1c1de5b8ccd7 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@amd64.org>
Date: Mon, 7 Feb 2011 15:32:18 +0100
Subject: perf annotate: Fix build error

A small fix for when NO_NEWT_SUPPORT is defined.

Add a missing "struct" to the function prototype.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20110207143218.GA31197@kryptos.osrc.amd.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index b1253aa..bc08b36 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -82,7 +82,7 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 			 int max_lines);
 
 #ifdef NO_NEWT_SUPPORT
-static inline int symbol__tui_annotate(symbol *sym __used,
+static inline int symbol__tui_annotate(struct symbol *sym __used,
 				       struct map *map __used, int evidx __used)
 {
 	return 0;
-- 
cgit v0.10.2


From 124bb83cd7de4d851af7595650233fb9e9279d5d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Fri, 4 Feb 2011 21:52:11 +0900
Subject: perf probe: Add bitfield member support

Add bitfield member accessing support to probe arguments.

Suggested-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110204125211.9507.60265.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ committer note: Fixed up '%lu' use for return of BYTES_TO_BITS ('%zd') ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 46addfb..fe461f6 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -33,6 +33,7 @@
 #include <ctype.h>
 #include <dwarf-regs.h>
 
+#include <linux/bitops.h>
 #include "event.h"
 #include "debug.h"
 #include "util.h"
@@ -333,13 +334,23 @@ static Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem)
 	return vr_die;
 }
 
-static bool die_is_signed_type(Dwarf_Die *tp_die)
+static int die_get_attr_udata(Dwarf_Die *tp_die, unsigned int attr_name,
+			      Dwarf_Word *result)
 {
 	Dwarf_Attribute attr;
+
+	if (dwarf_attr(tp_die, attr_name, &attr) == NULL ||
+	    dwarf_formudata(&attr, result) != 0)
+		return -ENOENT;
+
+	return 0;
+}
+
+static bool die_is_signed_type(Dwarf_Die *tp_die)
+{
 	Dwarf_Word ret;
 
-	if (dwarf_attr(tp_die, DW_AT_encoding, &attr) == NULL ||
-	    dwarf_formudata(&attr, &ret) != 0)
+	if (die_get_attr_udata(tp_die, DW_AT_encoding, &ret))
 		return false;
 
 	return (ret == DW_ATE_signed_char || ret == DW_ATE_signed ||
@@ -348,11 +359,29 @@ static bool die_is_signed_type(Dwarf_Die *tp_die)
 
 static int die_get_byte_size(Dwarf_Die *tp_die)
 {
-	Dwarf_Attribute attr;
 	Dwarf_Word ret;
 
-	if (dwarf_attr(tp_die, DW_AT_byte_size, &attr) == NULL ||
-	    dwarf_formudata(&attr, &ret) != 0)
+	if (die_get_attr_udata(tp_die, DW_AT_byte_size, &ret))
+		return 0;
+
+	return (int)ret;
+}
+
+static int die_get_bit_size(Dwarf_Die *tp_die)
+{
+	Dwarf_Word ret;
+
+	if (die_get_attr_udata(tp_die, DW_AT_bit_size, &ret))
+		return 0;
+
+	return (int)ret;
+}
+
+static int die_get_bit_offset(Dwarf_Die *tp_die)
+{
+	Dwarf_Word ret;
+
+	if (die_get_attr_udata(tp_die, DW_AT_bit_offset, &ret))
 		return 0;
 
 	return (int)ret;
@@ -827,6 +856,8 @@ static_var:
 	return 0;
 }
 
+#define BYTES_TO_BITS(nb)	((nb) * BITS_PER_LONG / sizeof(long))
+
 static int convert_variable_type(Dwarf_Die *vr_die,
 				 struct probe_trace_arg *tvar,
 				 const char *cast)
@@ -843,6 +874,14 @@ static int convert_variable_type(Dwarf_Die *vr_die,
 		return (tvar->type == NULL) ? -ENOMEM : 0;
 	}
 
+	if (die_get_bit_size(vr_die) != 0) {
+		/* This is a bitfield */
+		ret = snprintf(buf, 16, "b%d@%d/%zd", die_get_bit_size(vr_die),
+				die_get_bit_offset(vr_die),
+				BYTES_TO_BITS(die_get_byte_size(vr_die)));
+		goto formatted;
+	}
+
 	if (die_get_real_type(vr_die, &type) == NULL) {
 		pr_warning("Failed to get a type information of %s.\n",
 			   dwarf_diename(vr_die));
@@ -887,29 +926,31 @@ static int convert_variable_type(Dwarf_Die *vr_die,
 		return (tvar->type == NULL) ? -ENOMEM : 0;
 	}
 
-	ret = die_get_byte_size(&type) * 8;
-	if (ret) {
-		/* Check the bitwidth */
-		if (ret > MAX_BASIC_TYPE_BITS) {
-			pr_info("%s exceeds max-bitwidth."
-				" Cut down to %d bits.\n",
-				dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
-			ret = MAX_BASIC_TYPE_BITS;
-		}
+	ret = BYTES_TO_BITS(die_get_byte_size(&type));
+	if (!ret)
+		/* No size ... try to use default type */
+		return 0;
 
-		ret = snprintf(buf, 16, "%c%d",
-			       die_is_signed_type(&type) ? 's' : 'u', ret);
-		if (ret < 0 || ret >= 16) {
-			if (ret >= 16)
-				ret = -E2BIG;
-			pr_warning("Failed to convert variable type: %s\n",
-				   strerror(-ret));
-			return ret;
-		}
-		tvar->type = strdup(buf);
-		if (tvar->type == NULL)
-			return -ENOMEM;
+	/* Check the bitwidth */
+	if (ret > MAX_BASIC_TYPE_BITS) {
+		pr_info("%s exceeds max-bitwidth. Cut down to %d bits.\n",
+			dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
+		ret = MAX_BASIC_TYPE_BITS;
+	}
+	ret = snprintf(buf, 16, "%c%d",
+		       die_is_signed_type(&type) ? 's' : 'u', ret);
+
+formatted:
+	if (ret < 0 || ret >= 16) {
+		if (ret >= 16)
+			ret = -E2BIG;
+		pr_warning("Failed to convert variable type: %s\n",
+			   strerror(-ret));
+		return ret;
 	}
+	tvar->type = strdup(buf);
+	if (tvar->type == NULL)
+		return -ENOMEM;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 6d54057d76e25c91165cda0e6e007f1811faa2be Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 22:33:26 -0500
Subject: tracing/filter: Have no filter return a match

The n_preds field of a file can change at anytime, and even can become
zero, just as the filter is about to be processed by an event.
In the case that is zero on entering the filter, return 1, telling
the caller the event matchs and should be trace.

Also use a variable and assign it with ACCESS_ONCE() such that the
count stays consistent within the function.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d4010..7275f03 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -383,9 +383,14 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 	int match, top = 0, val1 = 0, val2 = 0;
 	int stack[MAX_FILTER_PRED];
 	struct filter_pred *pred;
+	int n_preds = ACCESS_ONCE(filter->n_preds);
 	int i;
 
-	for (i = 0; i < filter->n_preds; i++) {
+	/* no filter is considered a match */
+	if (!n_preds)
+		return 1;
+
+	for (i = 0; i < n_preds; i++) {
 		pred = filter->preds[i];
 		if (!pred->pop_n) {
 			match = pred->fn(pred, rec, val1, val2);
-- 
cgit v0.10.2


From 58d9a597c4275d830a819625e7d437cd6fb23fa5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 22:37:09 -0500
Subject: tracing/filter: Move OR and AND logic out of fn() method

The ops OR and AND act different from the other ops, as they
are the only ones to take other ops as their arguements.
These ops als change the logic of the filter_match_preds.

By removing the OR and AND fn's we can also remove the val1 and val2
that is passed to all other fn's and are unused.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c..1597bc0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -677,8 +677,7 @@ struct event_subsystem {
 struct filter_pred;
 struct regex;
 
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
-				 int val1, int val2);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
 
 typedef int (*regex_match_func)(char *str, struct regex *r, int len);
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7275f03..5d719b3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -124,8 +124,7 @@ struct filter_parse_state {
 };
 
 #define DEFINE_COMPARISON_PRED(type)					\
-static int filter_pred_##type(struct filter_pred *pred, void *event,	\
-			      int val1, int val2)			\
+static int filter_pred_##type(struct filter_pred *pred, void *event)	\
 {									\
 	type *addr = (type *)(event + pred->offset);			\
 	type val = (type)pred->val;					\
@@ -152,8 +151,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event,	\
 }
 
 #define DEFINE_EQUALITY_PRED(size)					\
-static int filter_pred_##size(struct filter_pred *pred, void *event,	\
-			      int val1, int val2)			\
+static int filter_pred_##size(struct filter_pred *pred, void *event)	\
 {									\
 	u##size *addr = (u##size *)(event + pred->offset);		\
 	u##size val = (u##size)pred->val;				\
@@ -178,23 +176,8 @@ DEFINE_EQUALITY_PRED(32);
 DEFINE_EQUALITY_PRED(16);
 DEFINE_EQUALITY_PRED(8);
 
-static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
-			   void *event __attribute((unused)),
-			   int val1, int val2)
-{
-	return val1 && val2;
-}
-
-static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
-			  void *event __attribute((unused)),
-			  int val1, int val2)
-{
-	return val1 || val2;
-}
-
 /* Filter predicate for fixed sized arrays of characters */
-static int filter_pred_string(struct filter_pred *pred, void *event,
-			      int val1, int val2)
+static int filter_pred_string(struct filter_pred *pred, void *event)
 {
 	char *addr = (char *)(event + pred->offset);
 	int cmp, match;
@@ -207,8 +190,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
 }
 
 /* Filter predicate for char * pointers */
-static int filter_pred_pchar(struct filter_pred *pred, void *event,
-			     int val1, int val2)
+static int filter_pred_pchar(struct filter_pred *pred, void *event)
 {
 	char **addr = (char **)(event + pred->offset);
 	int cmp, match;
@@ -231,8 +213,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
  * and add it to the address of the entry, and at last we have
  * the address of the string.
  */
-static int filter_pred_strloc(struct filter_pred *pred, void *event,
-			      int val1, int val2)
+static int filter_pred_strloc(struct filter_pred *pred, void *event)
 {
 	u32 str_item = *(u32 *)(event + pred->offset);
 	int str_loc = str_item & 0xffff;
@@ -247,8 +228,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
 	return match;
 }
 
-static int filter_pred_none(struct filter_pred *pred, void *event,
-			    int val1, int val2)
+static int filter_pred_none(struct filter_pred *pred, void *event)
 {
 	return 0;
 }
@@ -380,7 +360,7 @@ static void filter_build_regex(struct filter_pred *pred)
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
-	int match, top = 0, val1 = 0, val2 = 0;
+	int match = -1, top = 0, val1 = 0, val2 = 0;
 	int stack[MAX_FILTER_PRED];
 	struct filter_pred *pred;
 	int n_preds = ACCESS_ONCE(filter->n_preds);
@@ -393,7 +373,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 	for (i = 0; i < n_preds; i++) {
 		pred = filter->preds[i];
 		if (!pred->pop_n) {
-			match = pred->fn(pred, rec, val1, val2);
+			match = pred->fn(pred, rec);
 			stack[top++] = match;
 			continue;
 		}
@@ -403,7 +383,16 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 		}
 		val1 = stack[--top];
 		val2 = stack[--top];
-		match = pred->fn(pred, rec, val1, val2);
+		switch (pred->op) {
+		case OP_AND:
+			match = val1 && val2;
+			break;
+		case OP_OR:
+			match = val1 || val2;
+			break;
+		default:
+			WARN_ONCE(1, "filter op is not AND or OR");
+		}
 		stack[top++] = match;
 	}
 
@@ -775,15 +764,13 @@ static int filter_add_pred(struct filter_parse_state *ps,
 	unsigned long long val;
 	int ret;
 
-	pred->fn = filter_pred_none;
+	fn = pred->fn = filter_pred_none;
 
 	if (pred->op == OP_AND) {
 		pred->pop_n = 2;
-		fn = filter_pred_and;
 		goto add_pred_fn;
 	} else if (pred->op == OP_OR) {
 		pred->pop_n = 2;
-		fn = filter_pred_or;
 		goto add_pred_fn;
 	}
 
-- 
cgit v0.10.2


From c9c53ca03d6f97fdd9832d5ed3f15b30ee5cdb86 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 22:42:43 -0500
Subject: tracing/filter: Dynamically allocate preds

For every filter that is made, we create predicates to hold every
operation within the filter. We have a max of 32 predicates that we
can hold. Currently, we allocate all 32 even if we only need to
use one.

Part of the reason we do this is that the filter can be used at
any moment by any event. Fortunately, the filter is only used
with preemption disabled. By reseting the count of preds used "n_preds"
to zero, then performing a synchronize_sched(), we can safely
free and reallocate a new array of preds.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1597bc0..441fc1b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -661,7 +661,8 @@ struct ftrace_event_field {
 };
 
 struct event_filter {
-	int			n_preds;
+	int			n_preds;	/* Number assigned */
+	int			a_preds;	/* allocated */
 	struct filter_pred	**preds;
 	char			*filter_string;
 };
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 5d719b3..aac6a61 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -362,6 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 {
 	int match = -1, top = 0, val1 = 0, val2 = 0;
 	int stack[MAX_FILTER_PRED];
+	struct filter_pred **preds;
 	struct filter_pred *pred;
 	int n_preds = ACCESS_ONCE(filter->n_preds);
 	int i;
@@ -370,8 +371,13 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 	if (!n_preds)
 		return 1;
 
+	/*
+	 * n_preds and filter->preds is protect with preemption disabled.
+	 */
+	preds = rcu_dereference_sched(filter->preds);
+
 	for (i = 0; i < n_preds; i++) {
-		pred = filter->preds[i];
+		pred = preds[i];
 		if (!pred->pop_n) {
 			match = pred->fn(pred, rec);
 			stack[top++] = match;
@@ -548,46 +554,55 @@ static int filter_set_pred(struct filter_pred *dest,
 	return 0;
 }
 
+static void __free_preds(struct event_filter *filter)
+{
+	int i;
+
+	if (filter->preds) {
+		for (i = 0; i < filter->a_preds; i++) {
+			if (filter->preds[i])
+				filter_free_pred(filter->preds[i]);
+		}
+		kfree(filter->preds);
+		filter->preds = NULL;
+	}
+	filter->a_preds = 0;
+	filter->n_preds = 0;
+}
+
 static void filter_disable_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter = call->filter;
 	int i;
 
 	call->flags &= ~TRACE_EVENT_FL_FILTERED;
+	if (filter->preds) {
+		for (i = 0; i < filter->n_preds; i++)
+			filter->preds[i]->fn = filter_pred_none;
+	}
 	filter->n_preds = 0;
-
-	for (i = 0; i < MAX_FILTER_PRED; i++)
-		filter->preds[i]->fn = filter_pred_none;
 }
 
-static void __free_preds(struct event_filter *filter)
+static void __free_filter(struct event_filter *filter)
 {
-	int i;
-
 	if (!filter)
 		return;
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (filter->preds[i])
-			filter_free_pred(filter->preds[i]);
-	}
-	kfree(filter->preds);
+	__free_preds(filter);
 	kfree(filter->filter_string);
 	kfree(filter);
 }
 
 void destroy_preds(struct ftrace_event_call *call)
 {
-	__free_preds(call->filter);
+	__free_filter(call->filter);
 	call->filter = NULL;
 	call->flags &= ~TRACE_EVENT_FL_FILTERED;
 }
 
-static struct event_filter *__alloc_preds(void)
+static struct event_filter *__alloc_filter(void)
 {
 	struct event_filter *filter;
-	struct filter_pred *pred;
-	int i;
 
 	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
 	if (!filter)
@@ -595,32 +610,63 @@ static struct event_filter *__alloc_preds(void)
 
 	filter->n_preds = 0;
 
-	filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+	return filter;
+}
+
+static int __alloc_preds(struct event_filter *filter, int n_preds)
+{
+	struct filter_pred *pred;
+	int i;
+
+	if (filter->preds) {
+		if (filter->a_preds < n_preds) {
+			/* We need to reallocate */
+			filter->n_preds = 0;
+			/*
+			 * It is possible that the filter is currently
+			 * being used. We need to zero out the number
+			 * of preds, wait on preemption and then free
+			 * the preds.
+			 */
+			synchronize_sched();
+			__free_preds(filter);
+		}
+	}
+
+	if (!filter->preds) {
+		filter->preds =
+			kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
+		filter->a_preds = n_preds;
+	}
 	if (!filter->preds)
-		goto oom;
+		return -ENOMEM;
+
+	if (WARN_ON(filter->a_preds < n_preds))
+		return -EINVAL;
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	for (i = 0; i < n_preds; i++) {
+		pred = filter->preds[i];
+		if (!pred)
+			pred = kzalloc(sizeof(*pred), GFP_KERNEL);
 		if (!pred)
 			goto oom;
 		pred->fn = filter_pred_none;
 		filter->preds[i] = pred;
 	}
 
-	return filter;
-
-oom:
+	return 0;
+ oom:
 	__free_preds(filter);
-	return ERR_PTR(-ENOMEM);
+	return -ENOMEM;
 }
 
-static int init_preds(struct ftrace_event_call *call)
+static int init_filter(struct ftrace_event_call *call)
 {
 	if (call->filter)
 		return 0;
 
 	call->flags &= ~TRACE_EVENT_FL_FILTERED;
-	call->filter = __alloc_preds();
+	call->filter = __alloc_filter();
 	if (IS_ERR(call->filter))
 		return PTR_ERR(call->filter);
 
@@ -636,7 +682,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
 		if (strcmp(call->class->system, system->name) != 0)
 			continue;
 
-		err = init_preds(call);
+		err = init_filter(call);
 		if (err)
 			return err;
 	}
@@ -665,7 +711,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 {
 	int idx, err;
 
-	if (filter->n_preds == MAX_FILTER_PRED) {
+	if (WARN_ON(filter->n_preds == filter->a_preds)) {
 		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 		return -ENOSPC;
 	}
@@ -1179,6 +1225,20 @@ static int check_preds(struct filter_parse_state *ps)
 	return 0;
 }
 
+static int count_preds(struct filter_parse_state *ps)
+{
+	struct postfix_elt *elt;
+	int n_preds = 0;
+
+	list_for_each_entry(elt, &ps->postfix, list) {
+		if (elt->op == OP_NONE)
+			continue;
+		n_preds++;
+	}
+
+	return n_preds;
+}
+
 static int replace_preds(struct ftrace_event_call *call,
 			 struct event_filter *filter,
 			 struct filter_parse_state *ps,
@@ -1191,10 +1251,23 @@ static int replace_preds(struct ftrace_event_call *call,
 	int err;
 	int n_preds = 0;
 
+	n_preds = count_preds(ps);
+	if (n_preds >= MAX_FILTER_PRED) {
+		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+		return -ENOSPC;
+	}
+
 	err = check_preds(ps);
 	if (err)
 		return err;
 
+	if (!dry_run) {
+		err = __alloc_preds(filter, n_preds);
+		if (err)
+			return err;
+	}
+
+	n_preds = 0;
 	list_for_each_entry(elt, &ps->postfix, list) {
 		if (elt->op == OP_NONE) {
 			if (!operand1)
@@ -1208,7 +1281,7 @@ static int replace_preds(struct ftrace_event_call *call,
 			continue;
 		}
 
-		if (n_preds++ == MAX_FILTER_PRED) {
+		if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
 			parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 			return -ENOSPC;
 		}
@@ -1283,7 +1356,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 
 	mutex_lock(&event_mutex);
 
-	err = init_preds(call);
+	err = init_filter(call);
 	if (err)
 		goto out_unlock;
 
@@ -1376,7 +1449,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
 	struct event_filter *filter = event->filter;
 
 	event->filter = NULL;
-	__free_preds(filter);
+	__free_filter(filter);
 }
 
 int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1402,7 +1475,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 	if (event->filter)
 		goto out_unlock;
 
-	filter = __alloc_preds();
+	filter = __alloc_filter();
 	if (IS_ERR(filter)) {
 		err = PTR_ERR(filter);
 		goto out_unlock;
@@ -1411,7 +1484,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 	err = -ENOMEM;
 	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 	if (!ps)
-		goto free_preds;
+		goto free_filter;
 
 	parse_init(ps, filter_ops, filter_str);
 	err = filter_parse(ps);
@@ -1427,9 +1500,9 @@ free_ps:
 	postfix_clear(ps);
 	kfree(ps);
 
-free_preds:
+free_filter:
 	if (err)
-		__free_preds(filter);
+		__free_filter(filter);
 
 out_unlock:
 	mutex_unlock(&event_mutex);
-- 
cgit v0.10.2


From 0fc3ca9a10a61a77f18710fb708b41fd99c79a56 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 22:46:46 -0500
Subject: tracing/filter: Call synchronize_sched() just once for system filters

By separating out the reseting of the filter->n_preds to zero from
the reallocation of preds for the filter, we can reset groups of
filters first, call synchronize_sched() just once, and then reallocate
each of the filters in the system group.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index aac6a61..8f00a11 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -570,17 +570,28 @@ static void __free_preds(struct event_filter *filter)
 	filter->n_preds = 0;
 }
 
+static void reset_preds(struct event_filter *filter)
+{
+	struct filter_pred *pred;
+	int n_preds = filter->n_preds;
+	int i;
+
+	filter->n_preds = 0;
+	if (!filter->preds)
+		return;
+
+	for (i = 0; i < n_preds; i++) {
+		pred = filter->preds[i];
+		pred->fn = filter_pred_none;
+	}
+}
+
 static void filter_disable_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter = call->filter;
-	int i;
 
 	call->flags &= ~TRACE_EVENT_FL_FILTERED;
-	if (filter->preds) {
-		for (i = 0; i < filter->n_preds; i++)
-			filter->preds[i]->fn = filter_pred_none;
-	}
-	filter->n_preds = 0;
+	reset_preds(filter);
 }
 
 static void __free_filter(struct event_filter *filter)
@@ -620,15 +631,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 
 	if (filter->preds) {
 		if (filter->a_preds < n_preds) {
-			/* We need to reallocate */
-			filter->n_preds = 0;
 			/*
-			 * It is possible that the filter is currently
-			 * being used. We need to zero out the number
-			 * of preds, wait on preemption and then free
-			 * the preds.
+			 * We need to reallocate.
+			 * We should have already have zeroed out
+			 * the pred count and called synchronized_sched()
+			 * to make sure no one is using the preds.
 			 */
-			synchronize_sched();
+			if (WARN_ON_ONCE(filter->n_preds)) {
+				/* We need to reset it now */
+				filter->n_preds = 0;
+				synchronize_sched();
+			}
 			__free_preds(filter);
 		}
 	}
@@ -1328,6 +1341,30 @@ static int replace_system_preds(struct event_subsystem *system,
 		/* try to see if the filter can be applied */
 		err = replace_preds(call, filter, ps, filter_string, true);
 		if (err)
+			goto fail;
+	}
+
+	/* set all filter pred counts to zero */
+	list_for_each_entry(call, &ftrace_events, list) {
+		struct event_filter *filter = call->filter;
+
+		if (strcmp(call->class->system, system->name) != 0)
+			continue;
+
+		reset_preds(filter);
+	}
+
+	/*
+	 * Since some of the preds may be used under preemption
+	 * we need to wait for them to finish before we may
+	 * reallocate them.
+	 */
+	synchronize_sched();
+
+	list_for_each_entry(call, &ftrace_events, list) {
+		struct event_filter *filter = call->filter;
+
+		if (strcmp(call->class->system, system->name) != 0)
 			continue;
 
 		/* really apply the filter */
@@ -1342,11 +1379,13 @@ static int replace_system_preds(struct event_subsystem *system,
 		fail = false;
 	}
 
-	if (fail) {
-		parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-		return -EINVAL;
-	}
+	if (fail)
+		goto fail;
+
 	return 0;
+ fail:
+	parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+	return -EINVAL;
 }
 
 int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
@@ -1381,6 +1420,13 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 		goto out;
 	}
 
+	/*
+	 * Make sure all the pred counts are zero so that
+	 * no task is using it when we reallocate the preds array.
+	 */
+	reset_preds(call->filter);
+	synchronize_sched();
+
 	err = replace_preds(call, call->filter, ps, filter_string, false);
 	if (err)
 		append_filter_err(ps, call->filter);
-- 
cgit v0.10.2


From 74e9e58c350a24139e268dd6857bbaa55c5aafcf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 22:49:48 -0500
Subject: tracing/filter: Allocate the preds in an array

Currently we allocate an array of pointers to filter_preds, and then
allocate a separate filter_pred for each item in the array.
This adds slight overhead in the filters as it needs to derefernce
twice to get to the op condition.

Allocating the preds themselves in a single array removes a dereference
as well as helps on the cache footprint.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 441fc1b..254d04a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -663,7 +663,7 @@ struct ftrace_event_field {
 struct event_filter {
 	int			n_preds;	/* Number assigned */
 	int			a_preds;	/* allocated */
-	struct filter_pred	**preds;
+	struct filter_pred	*preds;
 	char			*filter_string;
 };
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8f00a11..b6c9106 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -362,7 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 {
 	int match = -1, top = 0, val1 = 0, val2 = 0;
 	int stack[MAX_FILTER_PRED];
-	struct filter_pred **preds;
+	struct filter_pred *preds;
 	struct filter_pred *pred;
 	int n_preds = ACCESS_ONCE(filter->n_preds);
 	int i;
@@ -377,7 +377,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 	preds = rcu_dereference_sched(filter->preds);
 
 	for (i = 0; i < n_preds; i++) {
-		pred = preds[i];
+		pred = &preds[i];
 		if (!pred->pop_n) {
 			match = pred->fn(pred, rec);
 			stack[top++] = match;
@@ -559,10 +559,8 @@ static void __free_preds(struct event_filter *filter)
 	int i;
 
 	if (filter->preds) {
-		for (i = 0; i < filter->a_preds; i++) {
-			if (filter->preds[i])
-				filter_free_pred(filter->preds[i]);
-		}
+		for (i = 0; i < filter->a_preds; i++)
+			kfree(filter->preds[i].field_name);
 		kfree(filter->preds);
 		filter->preds = NULL;
 	}
@@ -572,7 +570,6 @@ static void __free_preds(struct event_filter *filter)
 
 static void reset_preds(struct event_filter *filter)
 {
-	struct filter_pred *pred;
 	int n_preds = filter->n_preds;
 	int i;
 
@@ -580,10 +577,8 @@ static void reset_preds(struct event_filter *filter)
 	if (!filter->preds)
 		return;
 
-	for (i = 0; i < n_preds; i++) {
-		pred = filter->preds[i];
-		pred->fn = filter_pred_none;
-	}
+	for (i = 0; i < n_preds; i++)
+		filter->preds[i].fn = filter_pred_none;
 }
 
 static void filter_disable_preds(struct ftrace_event_call *call)
@@ -658,19 +653,11 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 		return -EINVAL;
 
 	for (i = 0; i < n_preds; i++) {
-		pred = filter->preds[i];
-		if (!pred)
-			pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-		if (!pred)
-			goto oom;
+		pred = &filter->preds[i];
 		pred->fn = filter_pred_none;
-		filter->preds[i] = pred;
 	}
 
 	return 0;
- oom:
-	__free_preds(filter);
-	return -ENOMEM;
 }
 
 static int init_filter(struct ftrace_event_call *call)
@@ -730,8 +717,8 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 	}
 
 	idx = filter->n_preds;
-	filter_clear_pred(filter->preds[idx]);
-	err = filter_set_pred(filter->preds[idx], pred, fn);
+	filter_clear_pred(&filter->preds[idx]);
+	err = filter_set_pred(&filter->preds[idx], pred, fn);
 	if (err)
 		return err;
 
-- 
cgit v0.10.2


From f76690afd05e3e163149310bdcd30234f93b3a7a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 22:53:06 -0500
Subject: tracing/filter: Free pred array on disabling of filter

When a filter is disabled, free the preds.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index b6c9106..2f5458e 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1388,6 +1388,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 
 	if (!strcmp(strstrip(filter_string), "0")) {
 		filter_disable_preds(call);
+		reset_preds(call->filter);
+		/* Make sure the filter is not being used */
+		synchronize_sched();
+		__free_preds(call->filter);
 		remove_filter_string(call->filter);
 		goto out_unlock;
 	}
-- 
cgit v0.10.2


From 61e9dea20e1ada886cc49a9ec6fe3c6ac0de7324 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 22:54:33 -0500
Subject: tracing/filter: Use a tree instead of stack for filter_match_preds()

Currently the filter_match_preds() requires a stack to push
and pop the preds to determine if the filter matches the record or not.
This has two drawbacks:

1) It requires a stack to store state information. As this is done
   in fast paths we can't allocate the storage for this stack, and
   we can't use a global as it must be re-entrant. The stack is stored
   on the kernel stack and this greatly limits how many preds we
   may allow.

2) All conditions are calculated even when a short circuit exists.
   a || b  will always calculate a and b even though a was determined
   to be true.

Using a tree we can walk a constant structure that will save
the state as we go. The algorithm is simply:

  pred = root;
  do {
	switch (move) {
	case MOVE_DOWN:
		if (OR or AND) {
			pred = left;
			continue;
		}
		if (pred == root)
			break;
		match = pred->fn();
		pred = pred->parent;
		move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT;
		continue;

	case MOVE_UP_FROM_LEFT:
		/* Only OR or AND can be a parent */
		if (match && OR || !match && AND) {
			/* short circuit */
			if (pred == root)
				break;
			pred = pred->parent;
			move = left child ?
				MOVE_UP_FROM_LEFT :
				MOVE_UP_FROM_RIGHT;
			continue;
		}
		pred = pred->right;
		move = MOVE_DOWN;
		continue;

	case MOVE_UP_FROM_RIGHT:
		if (pred == root)
			break;
		pred = pred->parent;
		move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT;
		continue;
	}
	done = 1;
  } while (!done);

This way there's no strict limit to how many preds we allow
and it also will short circuit the logical operations when possible.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 254d04a..bba34a7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -664,6 +664,7 @@ struct event_filter {
 	int			n_preds;	/* Number assigned */
 	int			a_preds;	/* allocated */
 	struct filter_pred	*preds;
+	struct filter_pred	*root;
 	char			*filter_string;
 };
 
@@ -675,6 +676,9 @@ struct event_subsystem {
 	int			nr_events;
 };
 
+#define FILTER_PRED_INVALID	((unsigned short)-1)
+#define FILTER_PRED_IS_RIGHT	(1 << 15)
+
 struct filter_pred;
 struct regex;
 
@@ -704,7 +708,10 @@ struct filter_pred {
 	int 			offset;
 	int 			not;
 	int 			op;
-	int 			pop_n;
+	unsigned short		index;
+	unsigned short		parent;
+	unsigned short		left;
+	unsigned short		right;
 };
 
 extern struct list_head ftrace_common_fields;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 2f5458e..1039049 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,6 +123,11 @@ struct filter_parse_state {
 	} operand;
 };
 
+struct pred_stack {
+	struct filter_pred	**preds;
+	int			index;
+};
+
 #define DEFINE_COMPARISON_PRED(type)					\
 static int filter_pred_##type(struct filter_pred *pred, void *event)	\
 {									\
@@ -357,52 +362,95 @@ static void filter_build_regex(struct filter_pred *pred)
 	pred->not ^= not;
 }
 
+enum move_type {
+	MOVE_DOWN,
+	MOVE_UP_FROM_LEFT,
+	MOVE_UP_FROM_RIGHT
+};
+
+static struct filter_pred *
+get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
+		int index, enum move_type *move)
+{
+	if (pred->parent & FILTER_PRED_IS_RIGHT)
+		*move = MOVE_UP_FROM_RIGHT;
+	else
+		*move = MOVE_UP_FROM_LEFT;
+	pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
+
+	return pred;
+}
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
-	int match = -1, top = 0, val1 = 0, val2 = 0;
-	int stack[MAX_FILTER_PRED];
+	int match = -1;
+	enum move_type move = MOVE_DOWN;
 	struct filter_pred *preds;
 	struct filter_pred *pred;
+	struct filter_pred *root;
 	int n_preds = ACCESS_ONCE(filter->n_preds);
-	int i;
+	int done = 0;
 
 	/* no filter is considered a match */
 	if (!n_preds)
 		return 1;
 
 	/*
-	 * n_preds and filter->preds is protect with preemption disabled.
+	 * n_preds, root and filter->preds are protect with preemption disabled.
 	 */
 	preds = rcu_dereference_sched(filter->preds);
+	root = rcu_dereference_sched(filter->root);
+	if (!root)
+		return 1;
 
-	for (i = 0; i < n_preds; i++) {
-		pred = &preds[i];
-		if (!pred->pop_n) {
+	pred = root;
+
+	/* match is currently meaningless */
+	match = -1;
+
+	do {
+		switch (move) {
+		case MOVE_DOWN:
+			/* only AND and OR have children */
+			if (pred->left != FILTER_PRED_INVALID) {
+				/* keep going to leaf node */
+				pred = &preds[pred->left];
+				continue;
+			}
 			match = pred->fn(pred, rec);
-			stack[top++] = match;
+			/* If this pred is the only pred */
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
+			continue;
+		case MOVE_UP_FROM_LEFT:
+			/* Check for short circuits */
+			if ((match && pred->op == OP_OR) ||
+			    (!match && pred->op == OP_AND)) {
+				if (pred == root)
+					break;
+				pred = get_pred_parent(pred, preds,
+						       pred->parent, &move);
+				continue;
+			}
+			/* now go down the right side of the tree. */
+			pred = &preds[pred->right];
+			move = MOVE_DOWN;
+			continue;
+		case MOVE_UP_FROM_RIGHT:
+			/* We finished this equation. */
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
 			continue;
 		}
-		if (pred->pop_n > top) {
-			WARN_ON_ONCE(1);
-			return 0;
-		}
-		val1 = stack[--top];
-		val2 = stack[--top];
-		switch (pred->op) {
-		case OP_AND:
-			match = val1 && val2;
-			break;
-		case OP_OR:
-			match = val1 || val2;
-			break;
-		default:
-			WARN_ONCE(1, "filter op is not AND or OR");
-		}
-		stack[top++] = match;
-	}
+		done = 1;
+	} while (!done);
 
-	return stack[--top];
+	return match;
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
@@ -539,10 +587,58 @@ static void filter_clear_pred(struct filter_pred *pred)
 	pred->regex.len = 0;
 }
 
-static int filter_set_pred(struct filter_pred *dest,
+static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
+{
+	stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
+	if (!stack->preds)
+		return -ENOMEM;
+	stack->index = n_preds;
+	return 0;
+}
+
+static void __free_pred_stack(struct pred_stack *stack)
+{
+	kfree(stack->preds);
+	stack->index = 0;
+}
+
+static int __push_pred_stack(struct pred_stack *stack,
+			     struct filter_pred *pred)
+{
+	int index = stack->index;
+
+	if (WARN_ON(index == 0))
+		return -ENOSPC;
+
+	stack->preds[--index] = pred;
+	stack->index = index;
+	return 0;
+}
+
+static struct filter_pred *
+__pop_pred_stack(struct pred_stack *stack)
+{
+	struct filter_pred *pred;
+	int index = stack->index;
+
+	pred = stack->preds[index++];
+	if (!pred)
+		return NULL;
+
+	stack->index = index;
+	return pred;
+}
+
+static int filter_set_pred(struct event_filter *filter,
+			   int idx,
+			   struct pred_stack *stack,
 			   struct filter_pred *src,
 			   filter_pred_fn_t fn)
 {
+	struct filter_pred *dest = &filter->preds[idx];
+	struct filter_pred *left;
+	struct filter_pred *right;
+
 	*dest = *src;
 	if (src->field_name) {
 		dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,8 +646,25 @@ static int filter_set_pred(struct filter_pred *dest,
 			return -ENOMEM;
 	}
 	dest->fn = fn;
+	dest->index = idx;
 
-	return 0;
+	if (dest->op == OP_OR || dest->op == OP_AND) {
+		right = __pop_pred_stack(stack);
+		left = __pop_pred_stack(stack);
+		if (!left || !right)
+			return -EINVAL;
+		dest->left = left->index;
+		dest->right = right->index;
+		left->parent = dest->index;
+		right->parent = dest->index | FILTER_PRED_IS_RIGHT;
+	} else
+		/*
+		 * Make dest->left invalid to be used as a quick
+		 * way to know this is a leaf node.
+		 */
+		dest->left = FILTER_PRED_INVALID;
+
+	return __push_pred_stack(stack, dest);
 }
 
 static void __free_preds(struct event_filter *filter)
@@ -574,6 +687,7 @@ static void reset_preds(struct event_filter *filter)
 	int i;
 
 	filter->n_preds = 0;
+	filter->root = NULL;
 	if (!filter->preds)
 		return;
 
@@ -707,6 +821,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 			      struct ftrace_event_call *call,
 			      struct event_filter *filter,
 			      struct filter_pred *pred,
+			      struct pred_stack *stack,
 			      filter_pred_fn_t fn)
 {
 	int idx, err;
@@ -718,7 +833,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 
 	idx = filter->n_preds;
 	filter_clear_pred(&filter->preds[idx]);
-	err = filter_set_pred(&filter->preds[idx], pred, fn);
+	err = filter_set_pred(filter, idx, stack, pred, fn);
 	if (err)
 		return err;
 
@@ -803,6 +918,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 			   struct ftrace_event_call *call,
 			   struct event_filter *filter,
 			   struct filter_pred *pred,
+			   struct pred_stack *stack,
 			   bool dry_run)
 {
 	struct ftrace_event_field *field;
@@ -812,13 +928,10 @@ static int filter_add_pred(struct filter_parse_state *ps,
 
 	fn = pred->fn = filter_pred_none;
 
-	if (pred->op == OP_AND) {
-		pred->pop_n = 2;
+	if (pred->op == OP_AND)
 		goto add_pred_fn;
-	} else if (pred->op == OP_OR) {
-		pred->pop_n = 2;
+	else if (pred->op == OP_OR)
 		goto add_pred_fn;
-	}
 
 	field = find_event_field(call, pred->field_name);
 	if (!field) {
@@ -867,7 +980,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 
 add_pred_fn:
 	if (!dry_run)
-		return filter_add_pred_fn(ps, call, filter, pred, fn);
+		return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
 	return 0;
 }
 
@@ -1248,6 +1361,7 @@ static int replace_preds(struct ftrace_event_call *call,
 	char *operand1 = NULL, *operand2 = NULL;
 	struct filter_pred *pred;
 	struct postfix_elt *elt;
+	struct pred_stack stack = { }; /* init to NULL */
 	int err;
 	int n_preds = 0;
 
@@ -1262,9 +1376,12 @@ static int replace_preds(struct ftrace_event_call *call,
 		return err;
 
 	if (!dry_run) {
-		err = __alloc_preds(filter, n_preds);
+		err = __alloc_pred_stack(&stack, n_preds);
 		if (err)
 			return err;
+		err = __alloc_preds(filter, n_preds);
+		if (err)
+			goto fail;
 	}
 
 	n_preds = 0;
@@ -1276,14 +1393,16 @@ static int replace_preds(struct ftrace_event_call *call,
 				operand2 = elt->operand;
 			else {
 				parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
-				return -EINVAL;
+				err = -EINVAL;
+				goto fail;
 			}
 			continue;
 		}
 
 		if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
 			parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
-			return -ENOSPC;
+			err = -ENOSPC;
+			goto fail;
 		}
 
 		if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1293,22 +1412,44 @@ static int replace_preds(struct ftrace_event_call *call,
 
 		if (!operand1 || !operand2) {
 			parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
-			return -EINVAL;
+			err = -EINVAL;
+			goto fail;
 		}
 
 		pred = create_pred(elt->op, operand1, operand2);
 add_pred:
-		if (!pred)
-			return -ENOMEM;
-		err = filter_add_pred(ps, call, filter, pred, dry_run);
+		if (!pred) {
+			err = -ENOMEM;
+			goto fail;
+		}
+		err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
 		filter_free_pred(pred);
 		if (err)
-			return err;
+			goto fail;
 
 		operand1 = operand2 = NULL;
 	}
 
-	return 0;
+	if (!dry_run) {
+		/* We should have one item left on the stack */
+		pred = __pop_pred_stack(&stack);
+		if (!pred)
+			return -EINVAL;
+		/* This item is where we start from in matching */
+		filter->root = pred;
+		/* Make sure the stack is empty */
+		pred = __pop_pred_stack(&stack);
+		if (WARN_ON(pred)) {
+			err = -EINVAL;
+			filter->root = NULL;
+			goto fail;
+		}
+	}
+
+	err = 0;
+fail:
+	__free_pred_stack(&stack);
+	return err;
 }
 
 static int replace_system_preds(struct event_subsystem *system,
-- 
cgit v0.10.2


From 55719274188f13cff9e3bd11fdd4c0e7617cd03d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 23:12:05 -0500
Subject: tracing/filter: Optimize short ciruit check

The test if we should break out early for OR and AND operations
can be optimized by comparing the current result with
  (pred->op == OP_OR)

That is if the result is true and the op is an OP_OR, or
if the result is false and the op is not an OP_OR (thus an OP_AND)
we can break out early in either case. Otherwise we continue
processing.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1039049..0a3e050 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -426,9 +426,15 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 					       pred->parent, &move);
 			continue;
 		case MOVE_UP_FROM_LEFT:
-			/* Check for short circuits */
-			if ((match && pred->op == OP_OR) ||
-			    (!match && pred->op == OP_AND)) {
+			/*
+			 * Check for short circuits.
+			 *
+			 * Optimization: !!match == (pred->op == OP_OR)
+			 *   is the same as:
+			 * if ((match && pred->op == OP_OR) ||
+			 *     (!match && pred->op == OP_AND))
+			 */
+			if (!!match == (pred->op == OP_OR)) {
 				if (pred == root)
 					break;
 				pred = get_pred_parent(pred, preds,
-- 
cgit v0.10.2


From ec126cac23945de12eb2d103374e1f7ee97c5595 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 23:14:25 -0500
Subject: tracing/filter: Check the created pred tree

Since the filter walks a tree to determine if a match is made or not,
if the tree was incorrectly created, it could cause an infinite loop.

Add a check to walk the entire tree before assigning it as a filter
to make sure the tree is correct.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0a3e050..91c9cdc 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1358,6 +1358,68 @@ static int count_preds(struct filter_parse_state *ps)
 	return n_preds;
 }
 
+/*
+ * The tree is walked at filtering of an event. If the tree is not correctly
+ * built, it may cause an infinite loop. Check here that the tree does
+ * indeed terminate.
+ */
+static int check_pred_tree(struct event_filter *filter,
+			   struct filter_pred *root)
+{
+	struct filter_pred *preds;
+	struct filter_pred *pred;
+	enum move_type move = MOVE_DOWN;
+	int count = 0;
+	int done = 0;
+	int max;
+
+	/*
+	 * The max that we can hit a node is three times.
+	 * Once going down, once coming up from left, and
+	 * once coming up from right. This is more than enough
+	 * since leafs are only hit a single time.
+	 */
+	max = 3 * filter->n_preds;
+
+	preds = filter->preds;
+	if  (!preds)
+		return -EINVAL;
+	pred = root;
+
+	do {
+		if (WARN_ON(count++ > max))
+			return -EINVAL;
+
+		switch (move) {
+		case MOVE_DOWN:
+			if (pred->left != FILTER_PRED_INVALID) {
+				pred = &preds[pred->left];
+				continue;
+			}
+			/* A leaf at the root is just a leaf in the tree */
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
+			continue;
+		case MOVE_UP_FROM_LEFT:
+			pred = &preds[pred->right];
+			move = MOVE_DOWN;
+			continue;
+		case MOVE_UP_FROM_RIGHT:
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
+			continue;
+		}
+		done = 1;
+	} while (!done);
+
+	/* We are fine. */
+	return 0;
+}
+
 static int replace_preds(struct ftrace_event_call *call,
 			 struct event_filter *filter,
 			 struct filter_parse_state *ps,
@@ -1366,6 +1428,7 @@ static int replace_preds(struct ftrace_event_call *call,
 {
 	char *operand1 = NULL, *operand2 = NULL;
 	struct filter_pred *pred;
+	struct filter_pred *root;
 	struct postfix_elt *elt;
 	struct pred_stack stack = { }; /* init to NULL */
 	int err;
@@ -1442,7 +1505,7 @@ add_pred:
 		if (!pred)
 			return -EINVAL;
 		/* This item is where we start from in matching */
-		filter->root = pred;
+		root = pred;
 		/* Make sure the stack is empty */
 		pred = __pop_pred_stack(&stack);
 		if (WARN_ON(pred)) {
@@ -1450,6 +1513,13 @@ add_pred:
 			filter->root = NULL;
 			goto fail;
 		}
+		err = check_pred_tree(filter, root);
+		if (err)
+			goto fail;
+
+		/* We don't set root until we know it works */
+		barrier();
+		filter->root = root;
 	}
 
 	err = 0;
-- 
cgit v0.10.2


From 43cd414552d8137157e926e46361678ea867e476 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 23:16:51 -0500
Subject: tracing/filter: Optimize filter by folding the tree

There are many cases that a filter will contain multiple ORs or
ANDs together near the leafs. Walking up and down the tree to get
to the next compare can be a waste.

If there are several ORs or ANDs together, fold them into a single
pred and allocate an array of the conditions that they check.
This will speed up the filter by linearly walking an array
and can still break out if a short circuit condition is met.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bba34a7..d754330 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -678,6 +678,7 @@ struct event_subsystem {
 
 #define FILTER_PRED_INVALID	((unsigned short)-1)
 #define FILTER_PRED_IS_RIGHT	(1 << 15)
+#define FILTER_PRED_FOLD	(1 << 15)
 
 struct filter_pred;
 struct regex;
@@ -704,7 +705,16 @@ struct filter_pred {
 	filter_pred_fn_t 	fn;
 	u64 			val;
 	struct regex		regex;
-	char 			*field_name;
+	/*
+	 * Leaf nodes use field_name, ops is used by AND and OR
+	 * nodes. The field_name is always freed when freeing a pred.
+	 * We can overload field_name for ops and have it freed
+	 * as well.
+	 */
+	union {
+		char		*field_name;
+		unsigned short	*ops;
+	};
 	int 			offset;
 	int 			not;
 	int 			op;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 91c9cdc..2403ce5 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -381,6 +381,42 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
 	return pred;
 }
 
+/*
+ * A series of AND or ORs where found together. Instead of
+ * climbing up and down the tree branches, an array of the
+ * ops were made in order of checks. We can just move across
+ * the array and short circuit if needed.
+ */
+static int process_ops(struct filter_pred *preds,
+		       struct filter_pred *op, void *rec)
+{
+	struct filter_pred *pred;
+	int type;
+	int match;
+	int i;
+
+	/*
+	 * Micro-optimization: We set type to true if op
+	 * is an OR and false otherwise (AND). Then we
+	 * just need to test if the match is equal to
+	 * the type, and if it is, we can short circuit the
+	 * rest of the checks:
+	 *
+	 * if ((match && op->op == OP_OR) ||
+	 *     (!match && op->op == OP_AND))
+	 *	  return match;
+	 */
+	type = op->op == OP_OR;
+
+	for (i = 0; i < op->val; i++) {
+		pred = &preds[op->ops[i]];
+		match = pred->fn(pred, rec);
+		if (!!match == type)
+			return match;
+	}
+	return match;
+}
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
@@ -414,11 +450,16 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 		case MOVE_DOWN:
 			/* only AND and OR have children */
 			if (pred->left != FILTER_PRED_INVALID) {
-				/* keep going to leaf node */
-				pred = &preds[pred->left];
-				continue;
-			}
-			match = pred->fn(pred, rec);
+				/* If ops is set, then it was folded. */
+				if (!pred->ops) {
+					/* keep going to down the left side */
+					pred = &preds[pred->left];
+					continue;
+				}
+				/* We can treat folded ops as a leaf node */
+				match = process_ops(preds, pred, rec);
+			} else
+				match = pred->fn(pred, rec);
 			/* If this pred is the only pred */
 			if (pred == root)
 				break;
@@ -659,17 +700,34 @@ static int filter_set_pred(struct event_filter *filter,
 		left = __pop_pred_stack(stack);
 		if (!left || !right)
 			return -EINVAL;
-		dest->left = left->index;
-		dest->right = right->index;
-		left->parent = dest->index;
+		/*
+		 * If both children can be folded
+		 * and they are the same op as this op or a leaf,
+		 * then this op can be folded.
+		 */
+		if (left->index & FILTER_PRED_FOLD &&
+		    (left->op == dest->op ||
+		     left->left == FILTER_PRED_INVALID) &&
+		    right->index & FILTER_PRED_FOLD &&
+		    (right->op == dest->op ||
+		     right->left == FILTER_PRED_INVALID))
+			dest->index |= FILTER_PRED_FOLD;
+
+		dest->left = left->index & ~FILTER_PRED_FOLD;
+		dest->right = right->index & ~FILTER_PRED_FOLD;
+		left->parent = dest->index & ~FILTER_PRED_FOLD;
 		right->parent = dest->index | FILTER_PRED_IS_RIGHT;
-	} else
+	} else {
 		/*
 		 * Make dest->left invalid to be used as a quick
 		 * way to know this is a leaf node.
 		 */
 		dest->left = FILTER_PRED_INVALID;
 
+		/* All leafs allow folding the parent ops. */
+		dest->index |= FILTER_PRED_FOLD;
+	}
+
 	return __push_pred_stack(stack, dest);
 }
 
@@ -1420,6 +1478,158 @@ static int check_pred_tree(struct event_filter *filter,
 	return 0;
 }
 
+static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
+{
+	struct filter_pred *pred;
+	enum move_type move = MOVE_DOWN;
+	int count = 0;
+	int done = 0;
+
+	pred = root;
+
+	do {
+		switch (move) {
+		case MOVE_DOWN:
+			if (pred->left != FILTER_PRED_INVALID) {
+				pred = &preds[pred->left];
+				continue;
+			}
+			/* A leaf at the root is just a leaf in the tree */
+			if (pred == root)
+				return 1;
+			count++;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
+			continue;
+		case MOVE_UP_FROM_LEFT:
+			pred = &preds[pred->right];
+			move = MOVE_DOWN;
+			continue;
+		case MOVE_UP_FROM_RIGHT:
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
+			continue;
+		}
+		done = 1;
+	} while (!done);
+
+	return count;
+}
+
+static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
+{
+	struct filter_pred *pred;
+	enum move_type move = MOVE_DOWN;
+	int count = 0;
+	int children;
+	int done = 0;
+
+	/* No need to keep the fold flag */
+	root->index &= ~FILTER_PRED_FOLD;
+
+	/* If the root is a leaf then do nothing */
+	if (root->left == FILTER_PRED_INVALID)
+		return 0;
+
+	/* count the children */
+	children = count_leafs(preds, &preds[root->left]);
+	children += count_leafs(preds, &preds[root->right]);
+
+	root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
+	if (!root->ops)
+		return -ENOMEM;
+
+	root->val = children;
+
+	pred = root;
+	do {
+		switch (move) {
+		case MOVE_DOWN:
+			if (pred->left != FILTER_PRED_INVALID) {
+				pred = &preds[pred->left];
+				continue;
+			}
+			if (WARN_ON(count == children))
+				return -EINVAL;
+			pred->index &= ~FILTER_PRED_FOLD;
+			root->ops[count++] = pred->index;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
+			continue;
+		case MOVE_UP_FROM_LEFT:
+			pred = &preds[pred->right];
+			move = MOVE_DOWN;
+			continue;
+		case MOVE_UP_FROM_RIGHT:
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
+			continue;
+		}
+		done = 1;
+	} while (!done);
+
+	return 0;
+}
+
+/*
+ * To optimize the processing of the ops, if we have several "ors" or
+ * "ands" together, we can put them in an array and process them all
+ * together speeding up the filter logic.
+ */
+static int fold_pred_tree(struct event_filter *filter,
+			   struct filter_pred *root)
+{
+	struct filter_pred *preds;
+	struct filter_pred *pred;
+	enum move_type move = MOVE_DOWN;
+	int done = 0;
+	int err;
+
+	preds = filter->preds;
+	if  (!preds)
+		return -EINVAL;
+	pred = root;
+
+	do {
+		switch (move) {
+		case MOVE_DOWN:
+			if (pred->index & FILTER_PRED_FOLD) {
+				err = fold_pred(preds, pred);
+				if (err)
+					return err;
+				/* Folded nodes are like leafs */
+			} else if (pred->left != FILTER_PRED_INVALID) {
+				pred = &preds[pred->left];
+				continue;
+			}
+
+			/* A leaf at the root is just a leaf in the tree */
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
+			continue;
+		case MOVE_UP_FROM_LEFT:
+			pred = &preds[pred->right];
+			move = MOVE_DOWN;
+			continue;
+		case MOVE_UP_FROM_RIGHT:
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent, &move);
+			continue;
+		}
+		done = 1;
+	} while (!done);
+
+	return 0;
+}
+
 static int replace_preds(struct ftrace_event_call *call,
 			 struct event_filter *filter,
 			 struct filter_parse_state *ps,
@@ -1517,6 +1727,11 @@ add_pred:
 		if (err)
 			goto fail;
 
+		/* Optimize the tree */
+		err = fold_pred_tree(filter, root);
+		if (err)
+			goto fail;
+
 		/* We don't set root until we know it works */
 		barrier();
 		filter->root = root;
-- 
cgit v0.10.2


From 4a3d27e98a7f2682e96d6f863752e0424b00d691 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 23:19:49 -0500
Subject: tracing/filter: Move MAX_FILTER_PRED to local tracing directory

The MAX_FILTER_PRED is only needed by the kernel/trace/*.c files.
Move it to kernel/trace/trace.h.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 47e3997..1a99e79 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -208,7 +208,6 @@ struct ftrace_event_call {
 
 #define PERF_MAX_TRACE_SIZE	2048
 
-#define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
 
 extern void destroy_preds(struct ftrace_event_call *call);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d754330..fbff872 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -680,6 +680,8 @@ struct event_subsystem {
 #define FILTER_PRED_IS_RIGHT	(1 << 15)
 #define FILTER_PRED_FOLD	(1 << 15)
 
+#define MAX_FILTER_PRED		32
+
 struct filter_pred;
 struct regex;
 
-- 
cgit v0.10.2


From bf93f9ed3a2cb89eb7e58851139d3be375b98027 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 23:21:34 -0500
Subject: tracing/filter: Increase the max preds to 2^14

Now that the filter logic does not require to save the pred results
on the stack, we can increase the max number of preds we allow.
As the preds are index by a short value, and we use the MSBs as flags
we can increase the max preds to 2^14 (16384) which should be way
more than enough.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fbff872..856e73c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -680,7 +680,14 @@ struct event_subsystem {
 #define FILTER_PRED_IS_RIGHT	(1 << 15)
 #define FILTER_PRED_FOLD	(1 << 15)
 
-#define MAX_FILTER_PRED		32
+/*
+ * The max preds is the size of unsigned short with
+ * two flags at the MSBs. One bit is used for both the IS_RIGHT
+ * and FOLD flags. The other is reserved.
+ *
+ * 2^14 preds is way more than enough.
+ */
+#define MAX_FILTER_PRED		16384
 
 struct filter_pred;
 struct regex;
-- 
cgit v0.10.2


From 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 3 Feb 2011 23:25:46 -0500
Subject: tracing/filter: Swap entire filter of events

When creating a new filter, instead of allocating the filter to the
event call first and then processing the filter, it is easier to
process a temporary filter and then just swap it with the call filter.
By doing this, it simplifies the code.

A filter is allocated and processed, when it is done, it is
swapped with the call filter, synchronize_sched() is called to make
sure all callers are done with the old filter (filters are called
with premption disabled), and then the old filter is freed.

Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 2403ce5..f5d335d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -425,10 +425,15 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 	struct filter_pred *preds;
 	struct filter_pred *pred;
 	struct filter_pred *root;
-	int n_preds = ACCESS_ONCE(filter->n_preds);
+	int n_preds;
 	int done = 0;
 
 	/* no filter is considered a match */
+	if (!filter)
+		return 1;
+
+	n_preds = filter->n_preds;
+
 	if (!n_preds)
 		return 1;
 
@@ -509,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
 
 static void remove_filter_string(struct event_filter *filter)
 {
+	if (!filter)
+		return;
+
 	kfree(filter->filter_string);
 	filter->filter_string = NULL;
 }
@@ -568,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
 
 void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
 {
-	struct event_filter *filter = call->filter;
+	struct event_filter *filter;
 
 	mutex_lock(&event_mutex);
+	filter = call->filter;
 	if (filter && filter->filter_string)
 		trace_seq_printf(s, "%s\n", filter->filter_string);
 	else
@@ -581,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
 void print_subsystem_event_filter(struct event_subsystem *system,
 				  struct trace_seq *s)
 {
-	struct event_filter *filter = system->filter;
+	struct event_filter *filter;
 
 	mutex_lock(&event_mutex);
+	filter = system->filter;
 	if (filter && filter->filter_string)
 		trace_seq_printf(s, "%s\n", filter->filter_string);
 	else
@@ -745,26 +755,9 @@ static void __free_preds(struct event_filter *filter)
 	filter->n_preds = 0;
 }
 
-static void reset_preds(struct event_filter *filter)
-{
-	int n_preds = filter->n_preds;
-	int i;
-
-	filter->n_preds = 0;
-	filter->root = NULL;
-	if (!filter->preds)
-		return;
-
-	for (i = 0; i < n_preds; i++)
-		filter->preds[i].fn = filter_pred_none;
-}
-
-static void filter_disable_preds(struct ftrace_event_call *call)
+static void filter_disable(struct ftrace_event_call *call)
 {
-	struct event_filter *filter = call->filter;
-
 	call->flags &= ~TRACE_EVENT_FL_FILTERED;
-	reset_preds(filter);
 }
 
 static void __free_filter(struct event_filter *filter)
@@ -777,11 +770,16 @@ static void __free_filter(struct event_filter *filter)
 	kfree(filter);
 }
 
+/*
+ * Called when destroying the ftrace_event_call.
+ * The call is being freed, so we do not need to worry about
+ * the call being currently used. This is for module code removing
+ * the tracepoints from within it.
+ */
 void destroy_preds(struct ftrace_event_call *call)
 {
 	__free_filter(call->filter);
 	call->filter = NULL;
-	call->flags &= ~TRACE_EVENT_FL_FILTERED;
 }
 
 static struct event_filter *__alloc_filter(void)
@@ -789,11 +787,6 @@ static struct event_filter *__alloc_filter(void)
 	struct event_filter *filter;
 
 	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-	if (!filter)
-		return ERR_PTR(-ENOMEM);
-
-	filter->n_preds = 0;
-
 	return filter;
 }
 
@@ -838,46 +831,28 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 	return 0;
 }
 
-static int init_filter(struct ftrace_event_call *call)
-{
-	if (call->filter)
-		return 0;
-
-	call->flags &= ~TRACE_EVENT_FL_FILTERED;
-	call->filter = __alloc_filter();
-	if (IS_ERR(call->filter))
-		return PTR_ERR(call->filter);
-
-	return 0;
-}
-
-static int init_subsystem_preds(struct event_subsystem *system)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
 {
 	struct ftrace_event_call *call;
-	int err;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (strcmp(call->class->system, system->name) != 0)
 			continue;
 
-		err = init_filter(call);
-		if (err)
-			return err;
+		filter_disable(call);
+		remove_filter_string(call->filter);
 	}
-
-	return 0;
 }
 
-static void filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_free_subsystem_filters(struct event_subsystem *system)
 {
 	struct ftrace_event_call *call;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (strcmp(call->class->system, system->name) != 0)
 			continue;
-
-		filter_disable_preds(call);
-		remove_filter_string(call->filter);
+		__free_filter(call->filter);
+		call->filter = NULL;
 	}
 }
 
@@ -1743,88 +1718,129 @@ fail:
 	return err;
 }
 
+struct filter_list {
+	struct list_head	list;
+	struct event_filter	*filter;
+};
+
 static int replace_system_preds(struct event_subsystem *system,
 				struct filter_parse_state *ps,
 				char *filter_string)
 {
 	struct ftrace_event_call *call;
+	struct filter_list *filter_item;
+	struct filter_list *tmp;
+	LIST_HEAD(filter_list);
 	bool fail = true;
 	int err;
 
 	list_for_each_entry(call, &ftrace_events, list) {
-		struct event_filter *filter = call->filter;
 
 		if (strcmp(call->class->system, system->name) != 0)
 			continue;
 
-		/* try to see if the filter can be applied */
-		err = replace_preds(call, filter, ps, filter_string, true);
+		/*
+		 * Try to see if the filter can be applied
+		 *  (filter arg is ignored on dry_run)
+		 */
+		err = replace_preds(call, NULL, ps, filter_string, true);
 		if (err)
 			goto fail;
 	}
 
-	/* set all filter pred counts to zero */
 	list_for_each_entry(call, &ftrace_events, list) {
-		struct event_filter *filter = call->filter;
+		struct event_filter *filter;
 
 		if (strcmp(call->class->system, system->name) != 0)
 			continue;
 
-		reset_preds(filter);
-	}
+		filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
+		if (!filter_item)
+			goto fail_mem;
 
-	/*
-	 * Since some of the preds may be used under preemption
-	 * we need to wait for them to finish before we may
-	 * reallocate them.
-	 */
-	synchronize_sched();
+		list_add_tail(&filter_item->list, &filter_list);
 
-	list_for_each_entry(call, &ftrace_events, list) {
-		struct event_filter *filter = call->filter;
+		filter_item->filter = __alloc_filter();
+		if (!filter_item->filter)
+			goto fail_mem;
+		filter = filter_item->filter;
 
-		if (strcmp(call->class->system, system->name) != 0)
-			continue;
+		/* Can only fail on no memory */
+		err = replace_filter_string(filter, filter_string);
+		if (err)
+			goto fail_mem;
 
-		/* really apply the filter */
-		filter_disable_preds(call);
 		err = replace_preds(call, filter, ps, filter_string, false);
-		if (err)
-			filter_disable_preds(call);
-		else {
+		if (err) {
+			filter_disable(call);
+			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+			append_filter_err(ps, filter);
+		} else
 			call->flags |= TRACE_EVENT_FL_FILTERED;
-			replace_filter_string(filter, filter_string);
-		}
+		/*
+		 * Regardless of if this returned an error, we still
+		 * replace the filter for the call.
+		 */
+		filter = call->filter;
+		call->filter = filter_item->filter;
+		filter_item->filter = filter;
+
 		fail = false;
 	}
 
 	if (fail)
 		goto fail;
 
+	/*
+	 * The calls can still be using the old filters.
+	 * Do a synchronize_sched() to ensure all calls are
+	 * done with them before we free them.
+	 */
+	synchronize_sched();
+	list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+		__free_filter(filter_item->filter);
+		list_del(&filter_item->list);
+		kfree(filter_item);
+	}
 	return 0;
  fail:
+	/* No call succeeded */
+	list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+		list_del(&filter_item->list);
+		kfree(filter_item);
+	}
 	parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
 	return -EINVAL;
+ fail_mem:
+	/* If any call succeeded, we still need to sync */
+	if (!fail)
+		synchronize_sched();
+	list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+		__free_filter(filter_item->filter);
+		list_del(&filter_item->list);
+		kfree(filter_item);
+	}
+	return -ENOMEM;
 }
 
 int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 {
-	int err;
 	struct filter_parse_state *ps;
+	struct event_filter *filter;
+	struct event_filter *tmp;
+	int err = 0;
 
 	mutex_lock(&event_mutex);
 
-	err = init_filter(call);
-	if (err)
-		goto out_unlock;
-
 	if (!strcmp(strstrip(filter_string), "0")) {
-		filter_disable_preds(call);
-		reset_preds(call->filter);
+		filter_disable(call);
+		filter = call->filter;
+		if (!filter)
+			goto out_unlock;
+		call->filter = NULL;
 		/* Make sure the filter is not being used */
 		synchronize_sched();
-		__free_preds(call->filter);
-		remove_filter_string(call->filter);
+		__free_filter(filter);
 		goto out_unlock;
 	}
 
@@ -1833,29 +1849,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 	if (!ps)
 		goto out_unlock;
 
-	filter_disable_preds(call);
-	replace_filter_string(call->filter, filter_string);
+	filter = __alloc_filter();
+	if (!filter) {
+		kfree(ps);
+		goto out_unlock;
+	}
+
+	replace_filter_string(filter, filter_string);
 
 	parse_init(ps, filter_ops, filter_string);
 	err = filter_parse(ps);
 	if (err) {
-		append_filter_err(ps, call->filter);
+		append_filter_err(ps, filter);
 		goto out;
 	}
 
-	/*
-	 * Make sure all the pred counts are zero so that
-	 * no task is using it when we reallocate the preds array.
-	 */
-	reset_preds(call->filter);
-	synchronize_sched();
-
-	err = replace_preds(call, call->filter, ps, filter_string, false);
-	if (err)
-		append_filter_err(ps, call->filter);
-	else
+	err = replace_preds(call, filter, ps, filter_string, false);
+	if (err) {
+		filter_disable(call);
+		append_filter_err(ps, filter);
+	} else
 		call->flags |= TRACE_EVENT_FL_FILTERED;
 out:
+	/*
+	 * Always swap the call filter with the new filter
+	 * even if there was an error. If there was an error
+	 * in the filter, we disable the filter and show the error
+	 * string
+	 */
+	tmp = call->filter;
+	call->filter = filter;
+	if (tmp) {
+		/* Make sure the call is done with the filter */
+		synchronize_sched();
+		__free_filter(tmp);
+	}
 	filter_opstack_clear(ps);
 	postfix_clear(ps);
 	kfree(ps);
@@ -1868,18 +1896,21 @@ out_unlock:
 int apply_subsystem_event_filter(struct event_subsystem *system,
 				 char *filter_string)
 {
-	int err;
 	struct filter_parse_state *ps;
+	struct event_filter *filter;
+	int err = 0;
 
 	mutex_lock(&event_mutex);
 
-	err = init_subsystem_preds(system);
-	if (err)
-		goto out_unlock;
-
 	if (!strcmp(strstrip(filter_string), "0")) {
 		filter_free_subsystem_preds(system);
 		remove_filter_string(system->filter);
+		filter = system->filter;
+		system->filter = NULL;
+		/* Ensure all filters are no longer used */
+		synchronize_sched();
+		filter_free_subsystem_filters(system);
+		__free_filter(filter);
 		goto out_unlock;
 	}
 
@@ -1888,7 +1919,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 	if (!ps)
 		goto out_unlock;
 
-	replace_filter_string(system->filter, filter_string);
+	filter = __alloc_filter();
+	if (!filter)
+		goto out;
+
+	replace_filter_string(filter, filter_string);
+	/*
+	 * No event actually uses the system filter
+	 * we can free it without synchronize_sched().
+	 */
+	__free_filter(system->filter);
+	system->filter = filter;
 
 	parse_init(ps, filter_ops, filter_string);
 	err = filter_parse(ps);
@@ -1945,7 +1986,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 		goto out_unlock;
 
 	filter = __alloc_filter();
-	if (IS_ERR(filter)) {
+	if (!filter) {
 		err = PTR_ERR(filter);
 		goto out_unlock;
 	}
-- 
cgit v0.10.2


From 4defe682d81a4960b6840ee4ed1a36f9db77c7bd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 3 Feb 2011 23:29:06 -0500
Subject: tracing/filter: Remove synchronize_sched() from __alloc_preds()

Because the filters are processed first and then activated
(added to the call), we no longer need to worry about the preds
of the filter in __alloc_preds() being used. As the filter that
is allocating preds is not activated yet.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f5d335d..3249b4f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -795,33 +795,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 	struct filter_pred *pred;
 	int i;
 
-	if (filter->preds) {
-		if (filter->a_preds < n_preds) {
-			/*
-			 * We need to reallocate.
-			 * We should have already have zeroed out
-			 * the pred count and called synchronized_sched()
-			 * to make sure no one is using the preds.
-			 */
-			if (WARN_ON_ONCE(filter->n_preds)) {
-				/* We need to reset it now */
-				filter->n_preds = 0;
-				synchronize_sched();
-			}
-			__free_preds(filter);
-		}
-	}
+	if (filter->preds)
+		__free_preds(filter);
+
+	filter->preds =
+		kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
 
-	if (!filter->preds) {
-		filter->preds =
-			kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
-		filter->a_preds = n_preds;
-	}
 	if (!filter->preds)
 		return -ENOMEM;
 
-	if (WARN_ON(filter->a_preds < n_preds))
-		return -EINVAL;
+	filter->a_preds = n_preds;
+	filter->n_preds = 0;
 
 	for (i = 0; i < n_preds; i++) {
 		pred = &filter->preds[i];
-- 
cgit v0.10.2


From ba976970c79fd2fbfe1a4b3b6766a318f4eb9d4c Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au1.ibm.com>
Date: Thu, 3 Feb 2011 14:27:20 +1100
Subject: tracing/syscalls: Don't add events for unmapped syscalls

FTRACE_SYSCALLS would create events for each and every system call, even
if it had failed to map the system call's name with it's number. This
resulted in a number of events being created that would not behave as
expected.

This could happen, for example, on architectures who's symbol names are
unusual and will not match the system call name. It could also happen
with system calls which were mapped to sys_ni_syscall.

This patch changes the default system call number in the metadata to -1.
If the system call name from the metadata is not successfully mapped to
a system call number during boot, than the event initialisation routine
will now return an error, preventing the event from being created.

Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
LKML-Reference: <1296703645-18718-2-git-send-email-imunsie@au1.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 98664db..8e8968e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -158,6 +158,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	static struct syscall_metadata __used			\
 	  __syscall_meta_##sname = {				\
 		.name 		= "sys"#sname,			\
+		.syscall_nr	= -1,	/* Filled in at boot */	\
 		.nb_args 	= nb,				\
 		.types		= types_##sname,		\
 		.args		= args_##sname,			\
@@ -175,6 +176,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	static struct syscall_metadata __used			\
 	  __syscall_meta__##sname = {				\
 		.name 		= "sys_"#sname,			\
+		.syscall_nr	= -1,	/* Filled in at boot */	\
 		.nb_args 	= 0,				\
 		.enter_event	= &event_enter__##sname,	\
 		.exit_event	= &event_exit__##sname,		\
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08..a9ceabd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -424,6 +424,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
 int init_syscall_trace(struct ftrace_event_call *call)
 {
 	int id;
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	if (num < 0 || num >= NR_syscalls) {
+		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
+				((struct syscall_metadata *)call->data)->name);
+		return -ENOSYS;
+	}
 
 	if (set_syscall_print_fmt(call) < 0)
 		return -ENOMEM;
-- 
cgit v0.10.2


From 3773b389b6927595512558594d040c1edba46f36 Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au1.ibm.com>
Date: Thu, 3 Feb 2011 14:27:21 +1100
Subject: tracing/syscalls: Convert redundant syscall_nr checks into WARN_ON

With the ftrace events now checking if the syscall_nr is valid upon
initialisation it should no longer be possible to register or unregister
a syscall event without a valid syscall_nr since they should not be
created. This adds a WARN_ON_ONCE in the register and unregister
functions to locate potential regressions in the future.

Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
LKML-Reference: <1296703645-18718-3-git-send-email-imunsie@au1.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a9ceabd..4230942 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -359,7 +359,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
-	if (num < 0 || num >= NR_syscalls)
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_refcount_enter)
@@ -377,7 +377,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
-	if (num < 0 || num >= NR_syscalls)
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return;
 	mutex_lock(&syscall_trace_lock);
 	sys_refcount_enter--;
@@ -393,7 +393,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
-	if (num < 0 || num >= NR_syscalls)
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_refcount_exit)
@@ -411,7 +411,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
-	if (num < 0 || num >= NR_syscalls)
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return;
 	mutex_lock(&syscall_trace_lock);
 	sys_refcount_exit--;
-- 
cgit v0.10.2


From c763ba06bd9b5db2c46c36276c89103d92d2c604 Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au1.ibm.com>
Date: Thu, 3 Feb 2011 14:27:22 +1100
Subject: tracing/syscalls: Make arch_syscall_addr weak

Some architectures use non-trivial system call tables and will not work
with the generic arch_syscall_addr code. For example, PowerPC64 uses a
table of twin long longs.

This patch makes the generic arch_syscall_addr weak to allow
architectures with non-trivial system call tables to override it.

Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
LKML-Reference: <1296703645-18718-4-git-send-email-imunsie@au1.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index dc52bd4..6fca17b 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -247,6 +247,9 @@ You need very few things to get the syscalls tracing in an arch.
 - Support the TIF_SYSCALL_TRACEPOINT thread flags.
 - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace
   in the ptrace syscalls tracing path.
+- If the system call table on this arch is more complicated than a simple array
+  of addresses of the system calls, implement an arch_syscall_addr to return
+  the address of a given system call.
 - Tag this arch as HAVE_SYSCALL_TRACEPOINTS.
 
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4230942..af83154 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -446,7 +446,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
 	return id;
 }
 
-unsigned long __init arch_syscall_addr(int nr)
+unsigned long __init __weak arch_syscall_addr(int nr)
 {
 	return (unsigned long)sys_call_table[nr];
 }
-- 
cgit v0.10.2


From b2d55496818d64310b9f5486d4eea76ea614d7f8 Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au1.ibm.com>
Date: Thu, 3 Feb 2011 14:27:23 +1100
Subject: tracing/syscalls: Allow arch specific syscall symbol matching

Some architectures have unusual symbol names and the generic code to
match the symbol name with the function name for the syscall metadata
will fail. For example, symbols on PPC64 start with a period and the
generic code will fail to match them.

This patch moves the match logic out into a separate function which an
arch can override by defining ARCH_HAS_SYSCALL_MATCH_SYM_NAME in
asm/ftrace.h and implementing arch_syscall_match_sym_name.

Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
LKML-Reference: <1296703645-18718-5-git-send-email-imunsie@au1.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index 6fca17b..79fcafc 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -250,6 +250,10 @@ You need very few things to get the syscalls tracing in an arch.
 - If the system call table on this arch is more complicated than a simple array
   of addresses of the system calls, implement an arch_syscall_addr to return
   the address of a given system call.
+- If the symbol names of the system calls do not match the function names on
+  this arch, define ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and
+  implement arch_syscall_match_sym_name with the appropriate logic to return
+  true if the function name corresponds with the symbol name.
 - Tag this arch as HAVE_SYSCALL_TRACEPOINTS.
 
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index af83154..86a23e7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
 
 static struct syscall_metadata **syscalls_metadata;
 
+#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+	/*
+	 * Only compare after the "sys" prefix. Archs that use
+	 * syscall wrappers may have syscalls symbols aliases prefixed
+	 * with "SyS" instead of "sys", leading to an unwanted
+	 * mismatch.
+	 */
+	return !strcmp(sym + 3, name + 3);
+}
+#endif
+
 static __init struct syscall_metadata *
 find_syscall_meta(unsigned long syscall)
 {
@@ -73,13 +86,7 @@ find_syscall_meta(unsigned long syscall)
 	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
 
 	for ( ; start < stop; start++) {
-		/*
-		 * Only compare after the "sys" prefix. Archs that use
-		 * syscall wrappers may have syscalls symbols aliases prefixed
-		 * with "SyS" instead of "sys", leading to an unwanted
-		 * mismatch.
-		 */
-		if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
+		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
 			return *start;
 	}
 	return NULL;
-- 
cgit v0.10.2


From ae07f551c42d6e4162436ca452a199deac9dab4d Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au1.ibm.com>
Date: Thu, 3 Feb 2011 14:27:25 +1100
Subject: tracing/syscalls: Early terminate search for sys_ni_syscall

Many system calls are unimplemented and mapped to sys_ni_syscall, but at
boot ftrace would still search through every syscall metadata entry for
a match which wouldn't be there.

This patch adds causes the search to terminate early if the system call
is not mapped.

Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
LKML-Reference: <1296703645-18718-7-git-send-email-imunsie@au1.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 86a23e7..ee7b5a0 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -85,6 +85,9 @@ find_syscall_meta(unsigned long syscall)
 	stop = __stop_syscalls_metadata;
 	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
 
+	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
+		return NULL;
+
 	for ( ; start < stop; start++) {
 		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
 			return *start;
-- 
cgit v0.10.2


From dc5f219e88294b93009eef946251251ffffb6d60 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 4 Feb 2011 13:19:20 +0100
Subject: genirq: Add IRQF_FORCE_RESUME

Xen needs to reenable interrupts which are marked IRQF_NO_SUSPEND in the
resume path. Add a flag to force the reenabling in the resume code.

Tested-and-acked-by: Ian Campbell <Ian.Campbell@eu.citrix.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d42..d746da1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -55,7 +55,7 @@
  *                Used by threaded interrupts which need to keep the
  *                irq line disabled until the threaded handler has been run.
  * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
- *
+ * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
  */
 #define IRQF_DISABLED		0x00000020
 #define IRQF_SAMPLE_RANDOM	0x00000040
@@ -67,6 +67,7 @@
 #define IRQF_IRQPOLL		0x00001000
 #define IRQF_ONESHOT		0x00002000
 #define IRQF_NO_SUSPEND		0x00004000
+#define IRQF_FORCE_RESUME	0x00008000
 
 #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f..b4198ee 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -282,8 +282,17 @@ EXPORT_SYMBOL(disable_irq);
 
 void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 {
-	if (resume)
+	if (resume) {
+		if (!(desc->status & IRQ_SUSPENDED)) {
+			if (!desc->action)
+				return;
+			if (!(desc->action->flags & IRQF_FORCE_RESUME))
+				return;
+			/* Pretend that it got disabled ! */
+			desc->depth++;
+		}
 		desc->status &= ~IRQ_SUSPENDED;
+	}
 
 	switch (desc->depth) {
 	case 0:
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d8..d6bfb89 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
 	for_each_irq_desc(irq, desc) {
 		unsigned long flags;
 
-		if (!(desc->status & IRQ_SUSPENDED))
-			continue;
-
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		__enable_irq(desc, irq, true);
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
-- 
cgit v0.10.2


From 5e38ca8f3ea423442eaafe1b7e206084aa38120a Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 2 Feb 2011 13:28:18 +0100
Subject: tracing: Add unstable sched clock note to the warning

The warning "Delta way too big" warning might appear on a system with
unstable shed clock right after the system is resumed and tracing
was enabled during the suspend.

Since it's not realy bug, and the unstable sched clock is working
fast and reliable otherwise, Steven suggested to keep using the
sched clock in any case and just to make note in the warning itself.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1296649698-6003-1-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a..7739893 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2163,10 +2163,14 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		delta = diff;
 		if (unlikely(test_time_stamp(delta))) {
 			WARN_ONCE(delta > (1ULL << 59),
-				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
 				  (unsigned long long)delta,
 				  (unsigned long long)ts,
-				  (unsigned long long)cpu_buffer->write_stamp);
+				  (unsigned long long)cpu_buffer->write_stamp,
+				  sched_clock_stable ? "" :
+				  "If you just came from a suspend/resume,\n"
+				  "please switch to the trace global clock:\n"
+				  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
 			add_timestamp = 1;
 		}
 	}
-- 
cgit v0.10.2


From e3087b80aa0bceda9863f33307460f3ba79f2b15 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 8 Feb 2011 15:01:39 -0200
Subject: perf annotate: Fix --stdio rendering

The checks for not using a max_lines parameter were b0rked, problem
introduced in 3653246.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 1012841..6db4351 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -132,7 +132,7 @@ static int objdump_line__print(struct objdump_line *oline,
 		if (percent < min_pcnt)
 			return -1;
 
-		if (printed >= max_lines)
+		if (max_lines && printed >= max_lines)
 			return 1;
 
 		color = get_percent_color(percent);
@@ -154,7 +154,7 @@ static int objdump_line__print(struct objdump_line *oline,
 		color_fprintf(stdout, color, " %7.2f", percent);
 		printf(" :	");
 		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", oline->line);
-	} else if (printed >= max_lines)
+	} else if (max_lines && printed >= max_lines)
 		return 1;
 	else {
 		if (!*oline->line)
-- 
cgit v0.10.2


From ce6f4fab4059cd72638a0cfa596a8ee2c79c1c8e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 8 Feb 2011 13:27:39 -0200
Subject: perf annotate: Move locking to struct annotation

Since we'll need it when implementing the live annotate TUI browser.

This also simplifies things a bit by having the list head for the source
code to be in the dynamicly allocated part of struct annotation, that
way we don't have to pass it around, it can be found from the struct
symbol that is passed everywhere.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index ea6a116..4271829 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -62,7 +62,8 @@ static int hists__add_entry(struct hists *self, struct addr_location *al)
 		 * All aggregated on the first sym_hist.
 		 */
 		struct annotation *notes = symbol__annotation(he->ms.sym);
-		if (notes->histograms == NULL && symbol__alloc_hist(he->ms.sym, 1) < 0)
+		if (notes->src == NULL &&
+		    symbol__alloc_hist(he->ms.sym, 1) < 0)
 			return -ENOMEM;
 
 		return hist_entry__inc_addr_samples(he, 0, al->addr);
@@ -77,7 +78,8 @@ static int process_sample_event(union perf_event *event,
 {
 	struct addr_location al;
 
-	if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
+	if (perf_event__preprocess_sample(event, session, &al, sample,
+					  symbol__annotate_init) < 0) {
 		pr_warning("problem processing %d event, skipping it.\n",
 			   event->header.type);
 		return -1;
@@ -111,7 +113,7 @@ static void hists__find_annotations(struct hists *self)
 			goto find_next;
 
 		notes = symbol__annotation(he->ms.sym);
-		if (notes->histograms == NULL) {
+		if (notes->src == NULL) {
 find_next:
 			if (key == KEY_LEFT)
 				nd = rb_prev(nd);
@@ -142,11 +144,11 @@ find_next:
 			nd = rb_next(nd);
 			/*
 			 * Since we have a hist_entry per IP for the same
-			 * symbol, free he->ms.sym->histogram to signal we already
+			 * symbol, free he->ms.sym->src to signal we already
 			 * processed this symbol.
 			 */
-			free(notes->histograms);
-			notes->histograms = NULL;
+			free(notes->src);
+			notes->src = NULL;
 		}
 	}
 }
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index de06bf5..f403ace 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -123,7 +123,7 @@ static int perf_session__add_hist_entry(struct perf_session *session,
 		 * All aggregated on the first sym_hist.
 		 */
 		struct annotation *notes = symbol__annotation(he->ms.sym);
-		if (notes->histograms == NULL &&
+		if (notes->src == NULL &&
 		    symbol__alloc_hist(he->ms.sym, 1) < 0)
 			err = -ENOMEM;
 		else
@@ -166,7 +166,8 @@ static int process_sample_event(union perf_event *event,
 	struct addr_location al;
 	struct perf_event_attr *attr;
 
-	if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
+	if (perf_event__preprocess_sample(event, session, &al, sample,
+					  symbol__annotate_init) < 0) {
 		fprintf(stderr, "problem processing %d event, skipping it.\n",
 			event->header.type);
 		return -1;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index b790673..7dbf22d 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -139,7 +139,7 @@ static void sig_winch_handler(int sig __used)
 static int parse_source(struct sym_entry *syme)
 {
 	struct symbol *sym;
-	struct sym_entry_source *source;
+	struct annotation *notes;
 	struct map *map;
 	int err = -1;
 
@@ -152,39 +152,35 @@ static int parse_source(struct sym_entry *syme)
 	/*
 	 * We can't annotate with just /proc/kallsyms
 	 */
-	if (map->dso->origin == DSO__ORIG_KERNEL)
+	if (map->dso->origin == DSO__ORIG_KERNEL) {
+		pr_err("Can't annotate %s: No vmlinux file was found in the "
+		       "path\n", sym->name);
+		sleep(1);
 		return -1;
-
-	if (syme->src == NULL) {
-		syme->src = zalloc(sizeof(*source));
-		if (syme->src == NULL)
-			return -1;
-		pthread_mutex_init(&syme->src->lock, NULL);
-		INIT_LIST_HEAD(&syme->src->head);
 	}
 
-	source = syme->src;
-
-	if (symbol__annotation(sym)->histograms != NULL) {
-		pthread_mutex_lock(&source->lock);
+	notes = symbol__annotation(sym);
+	if (notes->src != NULL) {
+		pthread_mutex_lock(&notes->lock);
 		goto out_assign;
 	}
 
-	pthread_mutex_lock(&source->lock);
+	pthread_mutex_lock(&notes->lock);
 
 	if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) {
 		pr_err("Not enough memory for annotating '%s' symbol!\n",
 		       sym->name);
+		sleep(1);
 		goto out_unlock;
 	}
 
-	err = symbol__annotate(sym, syme->map, &source->head, 0);
+	err = symbol__annotate(sym, syme->map, 0);
 	if (err == 0) {
 out_assign:
 	sym_filter_entry = syme;
 	}
 out_unlock:
-	pthread_mutex_unlock(&source->lock);
+	pthread_mutex_unlock(&notes->lock);
 	return err;
 }
 
@@ -196,20 +192,27 @@ static void __zero_source_counters(struct sym_entry *syme)
 
 static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip)
 {
+	struct annotation *notes;
+	struct symbol *sym;
+
 	if (syme != sym_filter_entry)
 		return;
 
-	if (pthread_mutex_trylock(&syme->src->lock))
+	sym = sym_entry__symbol(syme);
+	notes = symbol__annotation(sym);
+
+	if (pthread_mutex_trylock(&notes->lock))
 		return;
 
 	ip = syme->map->map_ip(syme->map, ip);
-	symbol__inc_addr_samples(sym_entry__symbol(syme), syme->map, counter, ip);
+	symbol__inc_addr_samples(sym, syme->map, counter, ip);
 
-	pthread_mutex_unlock(&syme->src->lock);
+	pthread_mutex_unlock(&notes->lock);
 }
 
 static void show_details(struct sym_entry *syme)
 {
+	struct annotation *notes;
 	struct symbol *symbol;
 	int more;
 
@@ -217,24 +220,26 @@ static void show_details(struct sym_entry *syme)
 		return;
 
 	symbol = sym_entry__symbol(syme);
-	if (!syme->src || symbol__annotation(symbol)->histograms == NULL)
-		return;
+	notes = symbol__annotation(symbol);
+
+	pthread_mutex_lock(&notes->lock);
+
+	if (notes->src == NULL)
+		goto out_unlock;
 
 	printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name);
 	printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);
 
-	pthread_mutex_lock(&syme->src->lock);
-	more = symbol__annotate_printf(symbol, syme->map, &syme->src->head,
-				       top.sym_evsel->idx, 0, sym_pcnt_filter,
-				       top.print_entries);
+	more = symbol__annotate_printf(symbol, syme->map, top.sym_evsel->idx,
+				       0, sym_pcnt_filter, top.print_entries);
 	if (top.zero)
 		symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx);
 	else
-		symbol__annotate_decay_histogram(symbol, &syme->src->head,
-						 top.sym_evsel->idx);
-	pthread_mutex_unlock(&syme->src->lock);
+		symbol__annotate_decay_histogram(symbol, top.sym_evsel->idx);
 	if (more != 0)
 		printf("%d lines not displayed, maybe increase display entries [e]\n", more);
+out_unlock:
+	pthread_mutex_unlock(&notes->lock);
 }
 
 static const char		CONSOLE_CLEAR[] = "[H[2J";
@@ -372,10 +377,8 @@ static void prompt_symbol(struct sym_entry **target, const char *msg)
 
 	/* zero counters of active symbol */
 	if (syme) {
-		pthread_mutex_lock(&syme->src->lock);
 		__zero_source_counters(syme);
 		*target = NULL;
-		pthread_mutex_unlock(&syme->src->lock);
 	}
 
 	fprintf(stdout, "\n%s: ", msg);
@@ -554,10 +557,8 @@ static void handle_keypress(struct perf_session *session, int c)
 			else {
 				struct sym_entry *syme = sym_filter_entry;
 
-				pthread_mutex_lock(&syme->src->lock);
 				sym_filter_entry = NULL;
 				__zero_source_counters(syme);
-				pthread_mutex_unlock(&syme->src->lock);
 			}
 			break;
 		case 'U':
@@ -653,7 +654,7 @@ static int symbol_filter(struct map *map, struct symbol *sym)
 
 	syme = symbol__priv(sym);
 	syme->map = map;
-	syme->src = NULL;
+	symbol__annotate_init(map, sym);
 
 	if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) {
 		/* schedule initial sym_filter_entry setup */
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 6db4351..c777bda 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -14,25 +14,39 @@
 #include "symbol.h"
 #include "debug.h"
 #include "annotate.h"
+#include <pthread.h>
 
-int symbol__alloc_hist(struct symbol *sym, int nevents)
+int symbol__annotate_init(struct map *map __used, struct symbol *sym)
 {
 	struct annotation *notes = symbol__annotation(sym);
+	pthread_mutex_init(&notes->lock, NULL);
+	return 0;
+}
 
-	notes->sizeof_sym_hist = (sizeof(*notes->histograms) +
+int symbol__alloc_hist(struct symbol *sym, int nevents)
+{
+	struct annotation *notes = symbol__annotation(sym);
+	size_t sizeof_sym_hist = (sizeof(struct sym_hist) +
 				  (sym->end - sym->start) * sizeof(u64));
-	notes->histograms = calloc(nevents, notes->sizeof_sym_hist);
-	notes->nr_histograms = nevents;
-	return notes->histograms == NULL ? -1 : 0;
+
+	notes->src = zalloc(sizeof(*notes->src) + nevents * sizeof_sym_hist);
+	if (notes->src == NULL)
+		return -1;
+	notes->src->sizeof_sym_hist = sizeof_sym_hist;
+	notes->src->nr_histograms   = nevents;
+	INIT_LIST_HEAD(&notes->src->source);
+	return 0;
 }
 
 void symbol__annotate_zero_histograms(struct symbol *sym)
 {
 	struct annotation *notes = symbol__annotation(sym);
 
-	if (notes->histograms != NULL)
-		memset(notes->histograms, 0,
-		       notes->nr_histograms * notes->sizeof_sym_hist);
+	pthread_mutex_lock(&notes->lock);
+	if (notes->src != NULL)
+		memset(notes->src->histograms, 0,
+		       notes->src->nr_histograms * notes->src->sizeof_sym_hist);
+	pthread_mutex_unlock(&notes->lock);
 }
 
 int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
@@ -43,7 +57,7 @@ int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 	struct sym_hist *h;
 
 	notes = symbol__annotation(sym);
-	if (notes->histograms == NULL)
+	if (notes->src == NULL)
 		return -ENOMEM;
 
 	pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map->unmap_ip(map, addr));
@@ -95,8 +109,7 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
 	return NULL;
 }
 
-static int objdump_line__print(struct objdump_line *oline,
-			       struct list_head *head, struct symbol *sym,
+static int objdump_line__print(struct objdump_line *oline, struct symbol *sym,
 			       int evidx, u64 len, int min_pcnt,
 			       int printed, int max_lines)
 {
@@ -109,10 +122,12 @@ static int objdump_line__print(struct objdump_line *oline,
 		double percent = 0.0;
 		const char *color;
 		struct annotation *notes = symbol__annotation(sym);
-		struct source_line *src_line = notes->src_line;
+		struct source_line *src_line = notes->src->lines;
 		struct sym_hist *h = annotation__histogram(notes, evidx);
 		s64 offset = oline->offset;
-		struct objdump_line *next = objdump__get_next_ip_line(head, oline);
+		struct objdump_line *next;
+
+		next = objdump__get_next_ip_line(&notes->src->source, oline);
 
 		while (offset < (s64)len &&
 		       (next == NULL || offset < next->offset)) {
@@ -166,9 +181,10 @@ static int objdump_line__print(struct objdump_line *oline,
 	return 0;
 }
 
-static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, FILE *file,
-				      struct list_head *head, size_t privsize)
+static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
+				      FILE *file, size_t privsize)
 {
+	struct annotation *notes = symbol__annotation(sym);
 	struct objdump_line *objdump_line;
 	char *line = NULL, *tmp, *tmp2, *c;
 	size_t line_len;
@@ -222,13 +238,12 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, FILE
 		free(line);
 		return -1;
 	}
-	objdump__add_line(head, objdump_line);
+	objdump__add_line(&notes->src->source, objdump_line);
 
 	return 0;
 }
 
-int symbol__annotate(struct symbol *sym, struct map *map,
-		     struct list_head *head, size_t privsize)
+int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize)
 {
 	struct dso *dso = map->dso;
 	char *filename = dso__build_id_filename(dso, NULL, 0);
@@ -297,7 +312,7 @@ fallback:
 		goto out_free_filename;
 
 	while (!feof(file))
-		if (symbol__parse_objdump_line(sym, map, file, head, privsize) < 0)
+		if (symbol__parse_objdump_line(sym, map, file, privsize) < 0)
 			break;
 
 	pclose(file);
@@ -330,14 +345,14 @@ static void insert_source_line(struct rb_root *root, struct source_line *src_lin
 static void symbol__free_source_line(struct symbol *sym, int len)
 {
 	struct annotation *notes = symbol__annotation(sym);
-	struct source_line *src_line = notes->src_line;
+	struct source_line *src_line = notes->src->lines;
 	int i;
 
 	for (i = 0; i < len; i++)
 		free(src_line[i].path);
 
 	free(src_line);
-	notes->src_line = NULL;
+	notes->src->lines = NULL;
 }
 
 /* Get the filename:line for the colored entries */
@@ -355,8 +370,8 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
 	if (!h->sum)
 		return 0;
 
-	src_line = notes->src_line = calloc(len, sizeof(struct source_line));
-	if (!notes->src_line)
+	src_line = notes->src->lines = calloc(len, sizeof(struct source_line));
+	if (!notes->src->lines)
 		return -1;
 
 	start = map->unmap_ip(map, sym->start);
@@ -436,12 +451,12 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx)
 	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum);
 }
 
-int symbol__annotate_printf(struct symbol *sym, struct map *map,
-			    struct list_head *head, int evidx, bool full_paths,
-			    int min_pcnt, int max_lines)
+int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx,
+			    bool full_paths, int min_pcnt, int max_lines)
 {
 	struct dso *dso = map->dso;
 	const char *filename = dso->long_name, *d_filename;
+	struct annotation *notes = symbol__annotation(sym);
 	struct objdump_line *pos;
 	int printed = 2;
 	int more = 0;
@@ -460,8 +475,8 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
 	if (verbose)
 		symbol__annotate_hits(sym, evidx);
 
-	list_for_each_entry(pos, head, node) {
-		switch (objdump_line__print(pos, head, sym, evidx, len, min_pcnt,
+	list_for_each_entry(pos, &notes->src->source, node) {
+		switch (objdump_line__print(pos, sym, evidx, len, min_pcnt,
 					    printed, max_lines)) {
 		case 0:
 			++printed;
@@ -485,11 +500,10 @@ void symbol__annotate_zero_histogram(struct symbol *sym, int evidx)
 	struct annotation *notes = symbol__annotation(sym);
 	struct sym_hist *h = annotation__histogram(notes, evidx);
 
-	memset(h, 0, notes->sizeof_sym_hist);
+	memset(h, 0, notes->src->sizeof_sym_hist);
 }
 
-void symbol__annotate_decay_histogram(struct symbol *sym,
-				      struct list_head *head, int evidx)
+void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
 {
 	struct annotation *notes = symbol__annotation(sym);
 	struct sym_hist *h = annotation__histogram(notes, evidx);
@@ -497,7 +511,7 @@ void symbol__annotate_decay_histogram(struct symbol *sym,
 
 	h->sum = 0;
 
-	list_for_each_entry(pos, head, node) {
+	list_for_each_entry(pos, &notes->src->source, node) {
 		if (pos->offset != -1) {
 			h->addr[pos->offset] = h->addr[pos->offset] * 7 / 8;
 			h->sum += h->addr[pos->offset];
@@ -522,10 +536,9 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 	struct dso *dso = map->dso;
 	const char *filename = dso->long_name;
 	struct rb_root source_line = RB_ROOT;
-	LIST_HEAD(head);
 	u64 len;
 
-	if (symbol__annotate(sym, map, &head, 0) < 0)
+	if (symbol__annotate(sym, map, 0) < 0)
 		return -1;
 
 	len = sym->end - sym->start;
@@ -536,12 +549,12 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 		print_summary(&source_line, filename);
 	}
 
-	symbol__annotate_printf(sym, map, &head, evidx, full_paths,
+	symbol__annotate_printf(sym, map, evidx, full_paths,
 				min_pcnt, max_lines);
 	if (print_lines)
 		symbol__free_source_line(sym, len);
 
-	objdump_line_list__purge(&head);
+	objdump_line_list__purge(&symbol__annotation(sym)->src->source);
 
 	return 0;
 }
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index bc08b36..b237c86 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -28,22 +28,29 @@ struct source_line {
 	char		*path;
 };
 
-/** struct annotation - symbols with hits have this attached as in sannotation
+/** struct annotated_source - symbols with hits have this attached as in sannotation
  *
  * @histogram: Array of addr hit histograms per event being monitored
- * @src_line: If 'print_lines' is specified, per source code line percentages
+ * @lines: If 'print_lines' is specified, per source code line percentages
+ * @source: source parsed from objdump -dS
  *
- * src_line is allocated, percentages calculated and all sorted by percentage
+ * lines is allocated, percentages calculated and all sorted by percentage
  * when the annotation is about to be presented, so the percentages are for
  * one of the entries in the histogram array, i.e. for the event/counter being
  * presented. It is deallocated right after symbol__{tui,tty,etc}_annotate
  * returns.
  */
-struct annotation {
-	struct source_line *src_line;
-	struct sym_hist	   *histograms;
+struct annotated_source {
+	struct list_head   source;
+	struct source_line *lines;
 	int    		   nr_histograms;
 	int    		   sizeof_sym_hist;
+	struct sym_hist	   histograms[0];
+};
+
+struct annotation {
+	pthread_mutex_t		lock;
+	struct annotated_source *src;
 };
 
 struct sannotation {
@@ -53,7 +60,8 @@ struct sannotation {
 
 static inline struct sym_hist *annotation__histogram(struct annotation *notes, int idx)
 {
-	return ((void *)notes->histograms) + (notes->sizeof_sym_hist * idx);
+	return (((void *)&notes->src->histograms) +
+	 	(notes->src->sizeof_sym_hist * idx));
 }
 
 static inline struct annotation *symbol__annotation(struct symbol *sym)
@@ -67,14 +75,12 @@ int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 int symbol__alloc_hist(struct symbol *sym, int nevents);
 void symbol__annotate_zero_histograms(struct symbol *sym);
 
-int symbol__annotate(struct symbol *sym, struct map *map,
-		     struct list_head *head, size_t privsize);
-int symbol__annotate_printf(struct symbol *sym, struct map *map,
-			    struct list_head *head, int evidx, bool full_paths,
-			    int min_pcnt, int max_lines);
+int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize);
+int symbol__annotate_init(struct map *map __used, struct symbol *sym);
+int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx,
+			    bool full_paths, int min_pcnt, int max_lines);
 void symbol__annotate_zero_histogram(struct symbol *sym, int evidx);
-void symbol__annotate_decay_histogram(struct symbol *sym,
-				      struct list_head *head, int evidx);
+void symbol__annotate_decay_histogram(struct symbol *sym, int evidx);
 void objdump_line_list__purge(struct list_head *head);
 
 int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index bac5ab6..3f43723 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -955,10 +955,9 @@ int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 ip)
 	return symbol__inc_addr_samples(he->ms.sym, he->ms.map, evidx, ip);
 }
 
-int hist_entry__annotate(struct hist_entry *he, struct list_head *head,
-			 size_t privsize)
+int hist_entry__annotate(struct hist_entry *he, size_t privsize)
 {
-	return symbol__annotate(he->ms.sym, he->ms.map, head, privsize);
+	return symbol__annotate(he->ms.sym, he->ms.map, privsize);
 }
 
 void hists__inc_nr_events(struct hists *self, u32 type)
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 2c6cdae..37c7908 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -78,8 +78,7 @@ size_t hists__fprintf(struct hists *self, struct hists *pair,
 		      bool show_displacement, FILE *fp);
 
 int hist_entry__inc_addr_samples(struct hist_entry *self, int evidx, u64 addr);
-int hist_entry__annotate(struct hist_entry *self, struct list_head *head,
-			 size_t privsize);
+int hist_entry__annotate(struct hist_entry *self, size_t privsize);
 
 void hists__filter_by_dso(struct hists *self, const struct dso *dso);
 void hists__filter_by_thread(struct hists *self, const struct thread *thread);
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index 62e3293..4f769f4 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -11,11 +11,6 @@
 struct perf_evlist;
 struct perf_evsel;
 
-struct sym_entry_source {
-	struct list_head	head;
-	pthread_mutex_t		lock;
-};
-
 struct sym_entry {
 	struct rb_node		rb_node;
 	struct list_head	node;
@@ -24,7 +19,6 @@ struct sym_entry {
 	int			skip;
 	u8			origin;
 	struct map		*map;
-	struct sym_entry_source	*src;
 	unsigned long		count[0];
 };
 
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c
index 8d8a168..1aa3965 100644
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -60,7 +60,6 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro
 }
 
 static double objdump_line__calc_percent(struct objdump_line *self,
-					 struct list_head *head,
 					 struct symbol *sym, int evidx)
 {
 	double percent = 0.0;
@@ -69,11 +68,12 @@ static double objdump_line__calc_percent(struct objdump_line *self,
 		int len = sym->end - sym->start;
 		unsigned int hits = 0;
 		struct annotation *notes = symbol__annotation(sym);
-		struct source_line *src_line = notes->src_line;
+		struct source_line *src_line = notes->src->lines;
 		struct sym_hist *h = annotation__histogram(notes, evidx);
 		s64 offset = self->offset;
-		struct objdump_line *next = objdump__get_next_ip_line(head, self);
+		struct objdump_line *next;
 
+		next = objdump__get_next_ip_line(&notes->src->source, self);
 		while (offset < (s64)len &&
 		       (next == NULL || offset < next->offset)) {
 			if (src_line) {
@@ -192,10 +192,10 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx)
 {
 	struct objdump_line *pos, *n;
 	struct objdump_line_rb_node *rbpos;
-	LIST_HEAD(head);
+	struct annotation *notes = symbol__annotation(sym);
 	struct annotate_browser browser = {
 		.b = {
-			.entries = &head,
+			.entries = &notes->src->source,
 			.refresh = ui_browser__list_head_refresh,
 			.seek	 = ui_browser__list_head_seek,
 			.write	 = annotate_browser__write,
@@ -210,20 +210,20 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx)
 	if (map->dso->annotate_warned)
 		return -1;
 
-	if (symbol__annotate(sym, map, &head, sizeof(*rbpos)) < 0) {
+	if (symbol__annotate(sym, map, sizeof(*rbpos)) < 0) {
 		ui__error_window(ui_helpline__last_msg);
 		return -1;
 	}
 
 	ui_helpline__push("Press <- or ESC to exit");
 
-	list_for_each_entry(pos, &head, node) {
+	list_for_each_entry(pos, &notes->src->source, node) {
 		size_t line_len = strlen(pos->line);
 		if (browser.b.width < line_len)
 			browser.b.width = line_len;
 		rbpos = objdump_line__rb(pos);
 		rbpos->idx = browser.b.nr_entries++;
-		rbpos->percent = objdump_line__calc_percent(pos, &head, sym, evidx);
+		rbpos->percent = objdump_line__calc_percent(pos, sym, evidx);
 		if (rbpos->percent < 0.01)
 			continue;
 		objdump__insert_line(&browser.entries, rbpos);
@@ -238,7 +238,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx)
 
 	browser.b.width += 18; /* Percentage */
 	ret = annotate_browser__run(&browser);
-	list_for_each_entry_safe(pos, n, &head, node) {
+	list_for_each_entry_safe(pos, n, &notes->src->source, node) {
 		list_del(&pos->node);
 		objdump_line__free(pos);
 	}
-- 
cgit v0.10.2


From d5e3d747007fdb541e57ed72e020ff0b94db3470 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 8 Feb 2011 15:29:25 -0200
Subject: perf annotate: Fix annotate context lines regression

The live annotation done in 'perf top' needs to limit the context before
lines that aren't filtered out by the min percent filter, if we don't do
that, the screen in a tty often is not enough for showing what is
interesting: lines with hits and a few source code lines before it.

Reported-by: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 7dbf22d..210c736 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -231,7 +231,7 @@ static void show_details(struct sym_entry *syme)
 	printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);
 
 	more = symbol__annotate_printf(symbol, syme->map, top.sym_evsel->idx,
-				       0, sym_pcnt_filter, top.print_entries);
+				       0, sym_pcnt_filter, top.print_entries, 4);
 	if (top.zero)
 		symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx);
 	else
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index c777bda..02976b8 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -111,7 +111,8 @@ struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
 
 static int objdump_line__print(struct objdump_line *oline, struct symbol *sym,
 			       int evidx, u64 len, int min_pcnt,
-			       int printed, int max_lines)
+			       int printed, int max_lines,
+			       struct objdump_line *queue)
 {
 	static const char *prev_line;
 	static const char *prev_color;
@@ -150,6 +151,15 @@ static int objdump_line__print(struct objdump_line *oline, struct symbol *sym,
 		if (max_lines && printed >= max_lines)
 			return 1;
 
+		if (queue != NULL) {
+			list_for_each_entry_from(queue, &notes->src->source, node) {
+				if (queue == oline)
+					break;
+				objdump_line__print(queue, sym, evidx, len,
+						    0, 0, 1, NULL);
+			}
+		}
+
 		color = get_percent_color(percent);
 
 		/*
@@ -172,6 +182,9 @@ static int objdump_line__print(struct objdump_line *oline, struct symbol *sym,
 	} else if (max_lines && printed >= max_lines)
 		return 1;
 	else {
+		if (queue)
+			return -1;
+
 		if (!*oline->line)
 			printf("         :\n");
 		else
@@ -452,13 +465,14 @@ static void symbol__annotate_hits(struct symbol *sym, int evidx)
 }
 
 int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx,
-			    bool full_paths, int min_pcnt, int max_lines)
+			    bool full_paths, int min_pcnt, int max_lines,
+			    int context)
 {
 	struct dso *dso = map->dso;
 	const char *filename = dso->long_name, *d_filename;
 	struct annotation *notes = symbol__annotation(sym);
-	struct objdump_line *pos;
-	int printed = 2;
+	struct objdump_line *pos, *queue = NULL;
+	int printed = 2, queue_len = 0;
 	int more = 0;
 	u64 len;
 
@@ -476,10 +490,20 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx,
 		symbol__annotate_hits(sym, evidx);
 
 	list_for_each_entry(pos, &notes->src->source, node) {
+		if (context && queue == NULL) {
+			queue = pos;
+			queue_len = 0;
+		}
+
 		switch (objdump_line__print(pos, sym, evidx, len, min_pcnt,
-					    printed, max_lines)) {
+					    printed, max_lines, queue)) {
 		case 0:
 			++printed;
+			if (context) {
+				printed += queue_len;
+				queue = NULL;
+				queue_len = 0;
+			}
 			break;
 		case 1:
 			/* filtered by max_lines */
@@ -487,7 +511,16 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx,
 			break;
 		case -1:
 		default:
-			/* filtered by min_pcnt */
+			/*
+			 * Filtered by min_pcnt or non IP lines when
+			 * context != 0
+			 */
+			if (!context)
+				break;
+			if (queue_len == context)
+				queue = list_entry(queue->node.next, typeof(*queue), node);
+			else
+				++queue_len;
 			break;
 		}
 	}
@@ -550,7 +583,7 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 	}
 
 	symbol__annotate_printf(sym, map, evidx, full_paths,
-				min_pcnt, max_lines);
+				min_pcnt, max_lines, 0);
 	if (print_lines)
 		symbol__free_source_line(sym, len);
 
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index b237c86..e848803 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -78,7 +78,8 @@ void symbol__annotate_zero_histograms(struct symbol *sym);
 int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize);
 int symbol__annotate_init(struct map *map __used, struct symbol *sym);
 int symbol__annotate_printf(struct symbol *sym, struct map *map, int evidx,
-			    bool full_paths, int min_pcnt, int max_lines);
+			    bool full_paths, int min_pcnt, int max_lines,
+			    int context);
 void symbol__annotate_zero_histogram(struct symbol *sym, int evidx);
 void symbol__annotate_decay_histogram(struct symbol *sym, int evidx);
 void objdump_line_list__purge(struct list_head *head);
-- 
cgit v0.10.2


From c305d524e5dd3c3c7a6035083e30950bea1b52dc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 2 Feb 2011 17:10:48 +0100
Subject: softirq: Avoid stack switch from ksoftirqd

ksoftirqd() calls do_softirq() which switches stacks on several
architectures. That makes no sense at all. ksoftirqd's stack is
sufficient.

Call __do_softirq() directly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: David Miller <davem@davemloft.net>
Cc: Paul Mundt <lethal@linux-sh.org>
Reviewed-by: Frank Rowand <frank.rowand@am.sony.com>
LKML-Reference: <alpine.LFD.2.00.1102021704530.31804@localhost6.localdomain6>

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5ef..c049046 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -738,7 +738,10 @@ static int run_ksoftirqd(void * __bind_cpu)
 			   don't process */
 			if (cpu_is_offline((long)__bind_cpu))
 				goto wait_to_die;
-			do_softirq();
+			local_irq_disable();
+			if (local_softirq_pending())
+				__do_softirq();
+			local_irq_enable();
 			preempt_enable_no_resched();
 			cond_resched();
 			preempt_disable();
-- 
cgit v0.10.2


From 44951a60ff888add9e84f509ffce20052e45af94 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 4 Feb 2011 17:33:49 +0100
Subject: genirq: Remove dead code

CONFIG_KSTAT_IRQS_ONDEMAND does not exist. It's not worth to implement
it. Use sparse irqs if you care about memory consumption of the
interrupt layer.

Found by undertaker: http://vamos.informatik.uni-erlangen.de/trac/undertaker

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 282f202..a7ac6e1 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -251,7 +251,6 @@ int __init early_irq_init(void)
 	for (i = 0; i < count; i++) {
 		desc[i].irq_data.irq = i;
 		desc[i].irq_data.chip = &no_irq_chip;
-		/* TODO : do this allocation on-demand ... */
 		desc[i].kstat_irqs = alloc_percpu(unsigned int);
 		alloc_masks(desc + i, GFP_KERNEL, node);
 		desc_smp_init(desc + i, node);
@@ -277,22 +276,6 @@ static void free_desc(unsigned int irq)
 
 static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
 {
-#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
-	struct irq_desc *desc;
-	unsigned int i;
-
-	for (i = 0; i < cnt; i++) {
-		desc = irq_to_desc(start + i);
-		if (desc && !desc->kstat_irqs) {
-			unsigned int __percpu *stats = alloc_percpu(unsigned int);
-
-			if (!stats)
-				return -1;
-			if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
-				free_percpu(stats);
-		}
-	}
-#endif
 	return start;
 }
 #endif /* !CONFIG_SPARSE_IRQ */
-- 
cgit v0.10.2


From cce1dac871f387d0f3da81440d85bd387d8fd5a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Mon, 24 Jan 2011 21:12:01 +0100
Subject: trivial: Fix Steven's Copyright typos
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OK, the copyright allows you to write a copy, still I think the lawyers
prefer the correct spelling.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
LKML-Reference: <1295899921-11333-1-git-send-email-u.kleine-koenig@pengutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/scripts/kconfig/streamline_config.pl b/scripts/kconfig/streamline_config.pl
index fd81fc3..a4fe923 100644
--- a/scripts/kconfig/streamline_config.pl
+++ b/scripts/kconfig/streamline_config.pl
@@ -1,6 +1,6 @@
 #!/usr/bin/perl -w
 #
-# Copywrite 2005-2009 - Steven Rostedt
+# Copyright 2005-2009 - Steven Rostedt
 # Licensed under the terms of the GNU GPL License version 2
 #
 #  It's simple enough to figure out how this works.
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index e1c62ee..ba7c63a 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1,6 +1,6 @@
 #!/usr/bin/perl -w
 #
-# Copywrite 2010 - Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
+# Copyright 2010 - Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
 # Licensed under the terms of the GNU GPL License version 2
 #
 
-- 
cgit v0.10.2


From f4d5c029bd6731baac0937324cef0f746e7d5ea7 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 16:49:00 +0800
Subject: tracing: Compile time initialization for event flags value

Compile time initialization is better than runtime initialization.

Remove many early_initcall()s and many trace_init_flags_##name()s.

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4D3FDFFC.6030304@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 8e8968e..a17fcea 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -132,11 +132,11 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.class			= &event_class_syscall_enter,	\
 		.event.funcs            = &enter_syscall_print_funcs,	\
 		.data			= (void *)&__syscall_meta_##sname,\
+		.flags			= TRACE_EVENT_FL_CAP_ANY,	\
 	};								\
 	static struct ftrace_event_call __used				\
 	  __attribute__((section("_ftrace_events")))			\
-	 *__event_enter_##sname = &event_enter_##sname;			\
-	__TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
+	 *__event_enter_##sname = &event_enter_##sname;
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
 	static struct syscall_metadata __syscall_meta_##sname;		\
@@ -146,11 +146,11 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.class			= &event_class_syscall_exit,	\
 		.event.funcs		= &exit_syscall_print_funcs,	\
 		.data			= (void *)&__syscall_meta_##sname,\
+		.flags			= TRACE_EVENT_FL_CAP_ANY,	\
 	};								\
 	static struct ftrace_event_call __used				\
 	  __attribute__((section("_ftrace_events")))			\
-	*__event_exit_##sname = &event_exit_##sname;			\
-	__TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY)
+	*__event_exit_##sname = &event_exit_##sname;
 
 #define SYSCALL_METADATA(sname, nb)				\
 	SYSCALL_TRACE_ENTER_EVENT(sname);			\
-- 
cgit v0.10.2


From 87d80de2800d087ea833cb79bc13f85ff34ed49f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 8 Feb 2011 13:19:49 -0500
Subject: tracing: Remove obsolete sched_switch tracer

The trace events sched_switch and sched_wakeup do the same thing
as the stand alone sched_switch tracer does. It is no longer needed.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 557c1ed..65eddb7 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -202,10 +202,6 @@ Here is the list of current tracers that may be configured.
 	to draw a graph of function calls similar to C code
 	source.
 
-  "sched_switch"
-
-	Traces the context switches and wakeups between tasks.
-
   "irqsoff"
 
 	Traces the areas that disable interrupts and saves
@@ -273,39 +269,6 @@ format, the function name that was traced "path_put" and the
 parent function that called this function "path_walk". The
 timestamp is the time at which the function was entered.
 
-The sched_switch tracer also includes tracing of task wakeups
-and context switches.
-
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:R   +  2916:115:S
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:R   +    10:115:S
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:R ==>    10:115:R
-        events/1-10    [01]  1453.070013:     10:115:S ==>  2916:115:R
-     kondemand/1-2916  [01]  1453.070013:   2916:115:S ==>     7:115:R
-     ksoftirqd/1-7     [01]  1453.070013:      7:115:S ==>     0:140:R
-
-Wake ups are represented by a "+" and the context switches are
-shown as "==>".  The format is:
-
- Context switches:
-
-       Previous task              Next Task
-
-  <pid>:<prio>:<state>  ==>  <pid>:<prio>:<state>
-
- Wake ups:
-
-       Current task               Task waking up
-
-  <pid>:<prio>:<state>    +  <pid>:<prio>:<state>
-
-The prio is the internal kernel priority, which is the inverse
-of the priority that is usually displayed by user-space tools.
-Zero represents the highest priority (99). Prio 100 starts the
-"nice" priorities with 100 being equal to nice -20 and 139 being
-nice 19. The prio "140" is reserved for the idle task which is
-the lowest priority thread (pid 0).
-
-
 Latency trace format
 --------------------
 
@@ -491,79 +454,6 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
                    latencies, as described in "Latency
                    trace format".
 
-sched_switch
-------------
-
-This tracer simply records schedule switches. Here is an example
-of how to use it.
-
- # echo sched_switch > current_tracer
- # echo 1 > tracing_enabled
- # sleep 1
- # echo 0 > tracing_enabled
- # cat trace
-
-# tracer: sched_switch
-#
-#           TASK-PID   CPU#    TIMESTAMP  FUNCTION
-#              | |      |          |         |
-            bash-3997  [01]   240.132281:   3997:120:R   +  4055:120:R
-            bash-3997  [01]   240.132284:   3997:120:R ==>  4055:120:R
-           sleep-4055  [01]   240.132371:   4055:120:S ==>  3997:120:R
-            bash-3997  [01]   240.132454:   3997:120:R   +  4055:120:S
-            bash-3997  [01]   240.132457:   3997:120:R ==>  4055:120:R
-           sleep-4055  [01]   240.132460:   4055:120:D ==>  3997:120:R
-            bash-3997  [01]   240.132463:   3997:120:R   +  4055:120:D
-            bash-3997  [01]   240.132465:   3997:120:R ==>  4055:120:R
-          <idle>-0     [00]   240.132589:      0:140:R   +     4:115:S
-          <idle>-0     [00]   240.132591:      0:140:R ==>     4:115:R
-     ksoftirqd/0-4     [00]   240.132595:      4:115:S ==>     0:140:R
-          <idle>-0     [00]   240.132598:      0:140:R   +     4:115:S
-          <idle>-0     [00]   240.132599:      0:140:R ==>     4:115:R
-     ksoftirqd/0-4     [00]   240.132603:      4:115:S ==>     0:140:R
-           sleep-4055  [01]   240.133058:   4055:120:S ==>  3997:120:R
- [...]
-
-
-As we have discussed previously about this format, the header
-shows the name of the trace and points to the options. The
-"FUNCTION" is a misnomer since here it represents the wake ups
-and context switches.
-
-The sched_switch file only lists the wake ups (represented with
-'+') and context switches ('==>') with the previous task or
-current task first followed by the next task or task waking up.
-The format for both of these is PID:KERNEL-PRIO:TASK-STATE.
-Remember that the KERNEL-PRIO is the inverse of the actual
-priority with zero (0) being the highest priority and the nice
-values starting at 100 (nice -20). Below is a quick chart to map
-the kernel priority to user land priorities.
-
-   Kernel Space                     User Space
- ===============================================================
-   0(high) to  98(low)     user RT priority 99(high) to 1(low)
-                           with SCHED_RR or SCHED_FIFO
- ---------------------------------------------------------------
-  99                       sched_priority is not used in scheduling
-                           decisions(it must be specified as 0)
- ---------------------------------------------------------------
- 100(high) to 139(low)     user nice -20(high) to 19(low)
- ---------------------------------------------------------------
- 140                       idle task priority
- ---------------------------------------------------------------
-
-The task states are:
-
- R - running : wants to run, may not actually be running
- S - sleep   : process is waiting to be woken up (handles signals)
- D - disk sleep (uninterruptible sleep) : process must be woken up
-					(ignores signals)
- T - stopped : process suspended
- t - traced  : process is being traced (with something like gdb)
- Z - zombie  : process waiting to be cleaned up
- X - unknown
-
-
 ftrace_enabled
 --------------
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d0..7e62c0a 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
 	ctx_trace = tr;
 }
 
-static void stop_sched_trace(struct trace_array *tr)
-{
-	tracing_stop_sched_switch_record();
-}
-
-static int sched_switch_trace_init(struct trace_array *tr)
-{
-	ctx_trace = tr;
-	tracing_reset_online_cpus(tr);
-	tracing_start_sched_switch_record();
-	return 0;
-}
-
-static void sched_switch_trace_reset(struct trace_array *tr)
-{
-	if (sched_ref)
-		stop_sched_trace(tr);
-}
-
-static void sched_switch_trace_start(struct trace_array *tr)
-{
-	sched_stopped = 0;
-}
-
-static void sched_switch_trace_stop(struct trace_array *tr)
-{
-	sched_stopped = 1;
-}
-
-static struct tracer sched_switch_trace __read_mostly =
-{
-	.name		= "sched_switch",
-	.init		= sched_switch_trace_init,
-	.reset		= sched_switch_trace_reset,
-	.start		= sched_switch_trace_start,
-	.stop		= sched_switch_trace_stop,
-	.wait_pipe	= poll_wait_pipe,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest    = trace_selftest_startup_sched_switch,
-#endif
-};
-
-__init static int init_sched_switch_trace(void)
-{
-	return register_tracer(&sched_switch_trace);
-}
-device_initcall(init_sched_switch_trace);
-
-- 
cgit v0.10.2


From 6752ab4a9c30d5411b2dfdb251a3f1cb18aae487 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 8 Feb 2011 13:54:06 -0500
Subject: tracing: Deprecate tracing_enabled for tracing_on

tracing_enabled should not be used, it is heavy weight and does not
do much in helping lower the overhead.

tracing_on should be used instead. Warn users to use tracing_on
when tracing_enabled is used as it will soon be removed from the
tracing directory.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 65eddb7..67f1cc4 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -80,11 +80,11 @@ of ftrace. Here is a list of some of the key files:
 	tracers listed here can be configured by
 	echoing their name into current_tracer.
 
-  tracing_enabled:
+  tracing_on:
 
-	This sets or displays whether the current_tracer
-	is activated and tracing or not. Echo 0 into this
-	file to disable the tracer or 1 to enable it.
+	This sets or displays whether writing to the trace
+	ring buffer is enabled. Echo 0 into this file to disable
+	the tracer or 1 to enable it.
 
   trace:
 
@@ -497,10 +497,10 @@ an example:
  # echo irqsoff > current_tracer
  # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # ls -ltr
  [...]
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
  # cat trace
 # tracer: irqsoff
 #
@@ -605,10 +605,10 @@ is much like the irqsoff tracer.
  # echo preemptoff > current_tracer
  # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # ls -ltr
  [...]
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
  # cat trace
 # tracer: preemptoff
 #
@@ -753,10 +753,10 @@ tracers.
  # echo preemptirqsoff > current_tracer
  # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # ls -ltr
  [...]
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
  # cat trace
 # tracer: preemptirqsoff
 #
@@ -916,9 +916,9 @@ Instead of performing an 'ls', we will run 'sleep 1' under
  # echo wakeup > current_tracer
  # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # chrt -f 5 sleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
  # cat trace
 # tracer: wakeup
 #
@@ -1030,9 +1030,9 @@ ftrace_enabled is set; otherwise this tracer is a nop.
 
  # sysctl kernel.ftrace_enabled=1
  # echo function > current_tracer
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # usleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
  # cat trace
 # tracer: function
 #
@@ -1070,7 +1070,7 @@ int trace_fd;
 [...]
 int main(int argc, char *argv[]) {
 	[...]
-	trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY);
+	trace_fd = open(tracing_file("tracing_on"), O_WRONLY);
 	[...]
 	if (condition_hit()) {
 		write(trace_fd, "0", 1);
@@ -1521,9 +1521,9 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt:
  # echo sys_nanosleep hrtimer_interrupt \
 		> set_ftrace_filter
  # echo function > current_tracer
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # usleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
  # cat trace
 # tracer: ftrace
 #
@@ -1769,9 +1769,9 @@ different. The trace is live.
  # echo function > current_tracer
  # cat trace_pipe > /tmp/trace.out &
 [1] 4153
- # echo 1 > tracing_enabled
+ # echo 1 > tracing_on
  # usleep 1
- # echo 0 > tracing_enabled
+ # echo 0 > tracing_on
  # cat trace
 # tracer: function
 #
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb..8dc8da6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2710,6 +2710,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
 
 	mutex_lock(&trace_types_lock);
 	if (tracer_enabled ^ val) {
+
+		/* Only need to warn if this is used to change the state */
+		WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
+
 		if (val) {
 			tracer_enabled = 1;
 			if (current_trace->start)
-- 
cgit v0.10.2


From 6c53cbfced048c421e4f72cb2183465f68fbc5e7 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 6 Jan 2011 16:56:51 +0100
Subject: x86, microcode: Correct sysdev_add error path

When we encounter an error while initting the microcode driver on a CPU,
we must undo the previously added sysfs group.

Cc: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Andreas Herrmann <Andreas.Herrmann3@amd.com>

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 1cca374..87af68e 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -417,8 +417,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
 	if (err)
 		return err;
 
-	if (microcode_init_cpu(cpu) == UCODE_ERROR)
-		err = -EINVAL;
+	if (microcode_init_cpu(cpu) == UCODE_ERROR) {
+		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+		return -EINVAL;
+	}
 
 	return err;
 }
-- 
cgit v0.10.2


From ffc7e8ac820bf9dd6106b01d3e64fecb5177cf43 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 30 Dec 2010 21:06:01 +0100
Subject: x86, microcode, AMD: Release firmware on error

When the ucode magic is wrong, for whatever reason, we don't release the
loaded firmware binary and its related resources. Make sure we do. Also,
fix function naming to fit this driver's convention and shorten variable
names.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Andreas Herrmann <Andreas.Herrmann3@amd.com>

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 0fe6d1a..ef91df0 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -278,27 +278,29 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 	return state;
 }
 
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_amd(int cpu, struct device *device)
 {
 	const char *fw_name = "amd-ucode/microcode_amd.bin";
-	const struct firmware *firmware;
-	enum ucode_state ret;
+	const struct firmware *fw;
+	enum ucode_state ret = UCODE_NFOUND;
 
-	if (request_firmware(&firmware, fw_name, device)) {
+	if (request_firmware(&fw, fw_name, device)) {
 		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
-		return UCODE_NFOUND;
+		goto out;
 	}
 
-	if (*(u32 *)firmware->data != UCODE_MAGIC) {
-		pr_err("invalid UCODE_MAGIC (0x%08x)\n",
-		       *(u32 *)firmware->data);
-		return UCODE_ERROR;
+	ret = UCODE_ERROR;
+	if (*(u32 *)fw->data != UCODE_MAGIC) {
+		pr_err("Invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)fw->data);
+		goto fw_release;
 	}
 
-	ret = generic_load_microcode(cpu, firmware->data, firmware->size);
+	ret = generic_load_microcode(cpu, fw->data, fw->size);
 
-	release_firmware(firmware);
+fw_release:
+	release_firmware(fw);
 
+out:
 	return ret;
 }
 
@@ -319,7 +321,7 @@ static void microcode_fini_cpu_amd(int cpu)
 
 static struct microcode_ops microcode_amd_ops = {
 	.request_microcode_user           = request_microcode_user,
-	.request_microcode_fw             = request_microcode_fw,
+	.request_microcode_fw             = request_microcode_amd,
 	.collect_cpu_info                 = collect_cpu_info_amd,
 	.apply_microcode                  = apply_microcode_amd,
 	.microcode_fini_cpu               = microcode_fini_cpu_amd,
-- 
cgit v0.10.2


From 10de52d6655ef0d4a1b8d2804db30208c26601ed Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 30 Dec 2010 22:10:12 +0100
Subject: x86, microcode, AMD: Simplify install_equiv_cpu_table

There's no need to memcpy the ucode header in order to look at it only
in this function - use the original buffer instead. Also, fix return
type semantics by returning a negative value on error and a positive
otherwise.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Andreas Herrmann <Andreas.Herrmann3@amd.com>

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index ef91df0..9a451d7 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -188,27 +188,22 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
 
 static int install_equiv_cpu_table(const u8 *buf)
 {
-	u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
-	unsigned int *buf_pos = (unsigned int *)container_hdr;
-	unsigned long size;
+	unsigned int *ibuf = (unsigned int *)buf;
+	unsigned int type = ibuf[1];
+	unsigned int size = ibuf[2];
 
-	get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
-
-	size = buf_pos[2];
-
-	if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
+	if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
 		pr_err("error: invalid type field in container file section header\n");
-		return 0;
+		return -EINVAL;
 	}
 
 	equiv_cpu_table = vmalloc(size);
 	if (!equiv_cpu_table) {
 		pr_err("failed to allocate equivalent CPU table\n");
-		return 0;
+		return -ENOMEM;
 	}
 
-	buf += UCODE_CONTAINER_HEADER_SIZE;
-	get_ucode_data(equiv_cpu_table, buf, size);
+	get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
 
 	return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
 }
@@ -232,7 +227,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 	enum ucode_state state = UCODE_OK;
 
 	offset = install_equiv_cpu_table(ucode_ptr);
-	if (!offset) {
+	if (offset < 0) {
 		pr_err("failed to create equivalent cpu table\n");
 		return UCODE_ERROR;
 	}
-- 
cgit v0.10.2


From 7cc27349cbfec271eecec9488b4bf3f3fadb2ce4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Fri, 31 Dec 2010 16:57:48 +0100
Subject: x86, microcode, AMD: Simplify get_next_ucode

Do not copy the section header but look at it directly through the
pointer. Also, make it return a ptr to a ucode header directly
thus dropping a bunch of unneeded casts. Finally, simplify
generic_load_microcode(), while at it.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Andreas Herrmann <Andreas.Herrmann3@amd.com>

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 9a451d7..8b58fc1 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -88,9 +88,9 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
 	return 0;
 }
 
-static int get_matching_microcode(int cpu, void *mc, int rev)
+static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
+				  int rev)
 {
-	struct microcode_header_amd *mc_header = mc;
 	unsigned int current_cpu_id;
 	u16 equiv_cpu_id = 0;
 	unsigned int i = 0;
@@ -109,17 +109,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
 	if (!equiv_cpu_id)
 		return 0;
 
-	if (mc_header->processor_rev_id != equiv_cpu_id)
+	if (mc_hdr->processor_rev_id != equiv_cpu_id)
 		return 0;
 
 	/* ucode might be chipset specific -- currently we don't support this */
-	if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
+	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
 		pr_err("CPU%d: loading of chipset specific code not yet supported\n",
 		       cpu);
 		return 0;
 	}
 
-	if (mc_header->patch_id <= rev)
+	if (mc_hdr->patch_id <= rev)
 		return 0;
 
 	return 1;
@@ -155,21 +155,18 @@ static int apply_microcode_amd(int cpu)
 	return 0;
 }
 
-static void *
+static struct microcode_header_amd *
 get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
 {
+	struct microcode_header_amd *mc;
 	unsigned int total_size;
-	u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
-	void *mc;
 
-	get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
-
-	if (section_hdr[0] != UCODE_UCODE_TYPE) {
+	if (buf[0] != UCODE_UCODE_TYPE) {
 		pr_err("error: invalid type field in container file section header\n");
 		return NULL;
 	}
 
-	total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
+	total_size = buf[4] + (buf[5] << 8);
 
 	if (total_size > size || total_size > UCODE_MAX_SIZE) {
 		pr_err("error: size mismatch\n");
@@ -218,12 +215,12 @@ static enum ucode_state
 generic_load_microcode(int cpu, const u8 *data, size_t size)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct microcode_header_amd *mc_hdr = NULL;
+	unsigned int mc_size, leftover;
+	unsigned long offset;
 	const u8 *ucode_ptr = data;
 	void *new_mc = NULL;
-	void *mc;
 	int new_rev = uci->cpu_sig.rev;
-	unsigned int leftover;
-	unsigned long offset;
 	enum ucode_state state = UCODE_OK;
 
 	offset = install_equiv_cpu_table(ucode_ptr);
@@ -236,38 +233,37 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 	leftover = size - offset;
 
 	while (leftover) {
-		unsigned int uninitialized_var(mc_size);
-		struct microcode_header_amd *mc_header;
-
-		mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
-		if (!mc)
+		mc_hdr = get_next_ucode(ucode_ptr, leftover, &mc_size);
+		if (!mc_hdr)
 			break;
 
-		mc_header = (struct microcode_header_amd *)mc;
-		if (get_matching_microcode(cpu, mc, new_rev)) {
+		if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
 			vfree(new_mc);
-			new_rev = mc_header->patch_id;
-			new_mc  = mc;
+			new_rev = mc_hdr->patch_id;
+			new_mc  = mc_hdr;
 		} else
-			vfree(mc);
+			vfree(mc_hdr);
 
 		ucode_ptr += mc_size;
 		leftover  -= mc_size;
 	}
 
-	if (new_mc) {
-		if (!leftover) {
-			vfree(uci->mc);
-			uci->mc = new_mc;
-			pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
-				 cpu, new_rev, uci->cpu_sig.rev);
-		} else {
-			vfree(new_mc);
-			state = UCODE_ERROR;
-		}
-	} else
+	if (!new_mc) {
 		state = UCODE_NFOUND;
+		goto free_table;
+	}
+
+	if (!leftover) {
+		vfree(uci->mc);
+		uci->mc = new_mc;
+		pr_debug("CPU%d update ucode to version 0x%x (from 0x%x)\n",
+			 cpu, new_rev, uci->cpu_sig.rev);
+	} else {
+		vfree(new_mc);
+		state = UCODE_ERROR;
+	}
 
+free_table:
 	free_equiv_cpu_table();
 
 	return state;
-- 
cgit v0.10.2


From 05ff02e4c0686051fcb074aec92df03f2c184fd1 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 5 Jan 2011 18:04:11 +0100
Subject: x86, microcode, AMD: Remove unneeded memset call

collect_cpu_info_amd() clears its csig arg but this is done in the
microcode_core's collect_cpu_info() by clearing the embedding struct
ucode_cpu_info. Drop it.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Andreas Herrmann <Andreas.Herrmann3@amd.com>

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 8b58fc1..274fa40 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -77,7 +77,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	u32 dummy;
 
-	memset(csig, 0, sizeof(*csig));
 	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
 		pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
 			   "supported\n", cpu, c->x86);
-- 
cgit v0.10.2


From 258721ef34fce97a7a6ca9cebebb303827645868 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 5 Jan 2011 18:13:19 +0100
Subject: x86, microcode, AMD: Cleanup dmesg output

Unify pr_* to use pr_fmt, shorten messages, correct type formatting.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Andreas Herrmann <Andreas.Herrmann3@amd.com>

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 274fa40..adbcef4 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -78,12 +78,13 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
 	u32 dummy;
 
 	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
-		pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
-			   "supported\n", cpu, c->x86);
+		pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
 		return -1;
 	}
+
 	rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
-	pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
+	pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
+
 	return 0;
 }
 
@@ -113,7 +114,7 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
 
 	/* ucode might be chipset specific -- currently we don't support this */
 	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
-		pr_err("CPU%d: loading of chipset specific code not yet supported\n",
+		pr_err("CPU%d: chipset specific code not yet supported\n",
 		       cpu);
 		return 0;
 	}
@@ -143,12 +144,12 @@ static int apply_microcode_amd(int cpu)
 
 	/* check current patch id and patch's id for match */
 	if (rev != mc_amd->hdr.patch_id) {
-		pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
+		pr_err("CPU%d: update failed for patch_level=0x%08x\n",
 		       cpu, mc_amd->hdr.patch_id);
 		return -1;
 	}
 
-	pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
+	pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
 	uci->cpu_sig.rev = rev;
 
 	return 0;
@@ -161,14 +162,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
 	unsigned int total_size;
 
 	if (buf[0] != UCODE_UCODE_TYPE) {
-		pr_err("error: invalid type field in container file section header\n");
+		pr_err("invalid type field in container file section header\n");
 		return NULL;
 	}
 
 	total_size = buf[4] + (buf[5] << 8);
 
 	if (total_size > size || total_size > UCODE_MAX_SIZE) {
-		pr_err("error: size mismatch\n");
+		pr_err("section size mismatch\n");
 		return NULL;
 	}
 
@@ -189,7 +190,8 @@ static int install_equiv_cpu_table(const u8 *buf)
 	unsigned int size = ibuf[2];
 
 	if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
-		pr_err("error: invalid type field in container file section header\n");
+		pr_err("empty section/"
+		       "invalid type field in container file section header\n");
 		return -EINVAL;
 	}
 
@@ -219,7 +221,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 	unsigned long offset;
 	const u8 *ucode_ptr = data;
 	void *new_mc = NULL;
-	int new_rev = uci->cpu_sig.rev;
+	unsigned int new_rev = uci->cpu_sig.rev;
 	enum ucode_state state = UCODE_OK;
 
 	offset = install_equiv_cpu_table(ucode_ptr);
@@ -255,8 +257,8 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 	if (!leftover) {
 		vfree(uci->mc);
 		uci->mc = new_mc;
-		pr_debug("CPU%d update ucode to version 0x%x (from 0x%x)\n",
-			 cpu, new_rev, uci->cpu_sig.rev);
+		pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
+			 cpu, uci->cpu_sig.rev, new_rev);
 	} else {
 		vfree(new_mc);
 		state = UCODE_ERROR;
@@ -275,13 +277,13 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
 	enum ucode_state ret = UCODE_NFOUND;
 
 	if (request_firmware(&fw, fw_name, device)) {
-		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
+		pr_err("failed to load file %s\n", fw_name);
 		goto out;
 	}
 
 	ret = UCODE_ERROR;
 	if (*(u32 *)fw->data != UCODE_MAGIC) {
-		pr_err("Invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)fw->data);
+		pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
 		goto fw_release;
 	}
 
-- 
cgit v0.10.2


From 22b7fcdae562b6792b3f5517e89fd7e0337180ae Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 27 Jan 2011 16:00:12 +0100
Subject: mn10300: Switch do_timer() to xtimer_update()

Only one CPU gets the timer interrupt so mn10300_last_tsc does not
need to be protected by xtime lock. Remove xtime lovking and use
xtime_update() which does the locking itself.

Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
LKML-Reference: <20110127150011.23248.62040.stgit@localhost>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c
index 75da468..5b95500 100644
--- a/arch/mn10300/kernel/time.c
+++ b/arch/mn10300/kernel/time.c
@@ -104,8 +104,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
 	unsigned tsc, elapse;
 	irqreturn_t ret;
 
-	write_seqlock(&xtime_lock);
-
 	while (tsc = get_cycles(),
 	       elapse = tsc - mn10300_last_tsc, /* time elapsed since last
 						 * tick */
@@ -114,11 +112,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
 		mn10300_last_tsc += MN10300_TSC_PER_HZ;
 
 		/* advance the kernel's time tracking system */
-		do_timer(1);
+		xtime_update(1);
 	}
 
-	write_sequnlock(&xtime_lock);
-
 	ret = local_timer_interrupt();
 #ifdef CONFIG_SMP
 	send_IPI_allbutself(LOCAL_TIMER_IPI);
-- 
cgit v0.10.2


From 44d60c0f5c58c2168f31df9a481761451840eb54 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 10 Feb 2011 12:19:47 +0100
Subject: x86, microcode, AMD: Extend ucode size verification

The different families have a different max size for the ucode patch,
adjust size checking to the family we're running on. Also, do not
vzalloc the max size of the ucode but only the actual size that is
passed on from the firmware loader.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index adbcef4..9fb8405 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,7 +66,6 @@ struct microcode_amd {
 	unsigned int			mpb[0];
 };
 
-#define UCODE_MAX_SIZE			2048
 #define UCODE_CONTAINER_SECTION_HDR	8
 #define UCODE_CONTAINER_HEADER_SIZE	12
 
@@ -155,31 +154,60 @@ static int apply_microcode_amd(int cpu)
 	return 0;
 }
 
+static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	unsigned int max_size, actual_size;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+#define F15H_MPB_MAX_SIZE 4096
+
+	switch (c->x86) {
+	case 0x14:
+		max_size = F14H_MPB_MAX_SIZE;
+		break;
+	case 0x15:
+		max_size = F15H_MPB_MAX_SIZE;
+		break;
+	default:
+		max_size = F1XH_MPB_MAX_SIZE;
+		break;
+	}
+
+	actual_size = buf[4] + (buf[5] << 8);
+
+	if (actual_size > size || actual_size > max_size) {
+		pr_err("section size mismatch\n");
+		return 0;
+	}
+
+	return actual_size;
+}
+
 static struct microcode_header_amd *
-get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
+get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
 {
-	struct microcode_header_amd *mc;
-	unsigned int total_size;
+	struct microcode_header_amd *mc = NULL;
+	unsigned int actual_size = 0;
 
 	if (buf[0] != UCODE_UCODE_TYPE) {
 		pr_err("invalid type field in container file section header\n");
-		return NULL;
+		goto out;
 	}
 
-	total_size = buf[4] + (buf[5] << 8);
-
-	if (total_size > size || total_size > UCODE_MAX_SIZE) {
-		pr_err("section size mismatch\n");
-		return NULL;
-	}
+	actual_size = verify_ucode_size(cpu, buf, size);
+	if (!actual_size)
+		goto out;
 
-	mc = vzalloc(UCODE_MAX_SIZE);
+	mc = vzalloc(actual_size);
 	if (!mc)
-		return NULL;
+		goto out;
 
-	get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
-	*mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
+	get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
+	*mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
 
+out:
 	return mc;
 }
 
@@ -234,7 +262,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 	leftover = size - offset;
 
 	while (leftover) {
-		mc_hdr = get_next_ucode(ucode_ptr, leftover, &mc_size);
+		mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
 		if (!mc_hdr)
 			break;
 
-- 
cgit v0.10.2


From 94d1ac8b55799be10487fff9766cce6d6628462a Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 9 Feb 2011 08:22:46 +0000
Subject: x86: Reduce back the alignment of the per-CPU data section

This complements commit:

  47f19a0814e8: percpu: Remove the multi-page alignment facility

reverting one leftover of:

  fe8e0c25cad2: x86, 32-bit: Align percpu area and irq stacks to THREAD_SIZE

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Acked-by: Alexander van Heukelum <heukelum@fastmail.fm>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4D525CE60200007800030EE5@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Alexander van Heukelum <heukelum@fastmail.fm>

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf47007..e9f7a3c 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -305,7 +305,7 @@ SECTIONS
 	}
 
 #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
-	PERCPU(THREAD_SIZE)
+	PERCPU(PAGE_SIZE)
 #endif
 
 	. = ALIGN(PAGE_SIZE);
-- 
cgit v0.10.2


From b82fef82d56789439e6be638a87a1a5bba1e6e75 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 9 Feb 2011 08:24:34 +0000
Subject: x86: Partly unify asm-offsets_{32,64}.c

Just consolidating the common parts. Full unification would seem
straight forward, but it's not clear the necessary #ifdef-s would
be acceptable.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4D525D520200007800030EE9@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c8..2b141c1 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,75 @@
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/crypto.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/hardirq.h>
+#include <linux/suspend.h>
+#include <linux/kbuild.h>
+#include <asm/processor.h>
+#include <asm/thread_info.h>
+#include <asm/sigframe.h>
+#include <asm/bootparam.h>
+#include <asm/suspend.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#endif
+
 #ifdef CONFIG_X86_32
 # include "asm-offsets_32.c"
 #else
 # include "asm-offsets_64.c"
 #endif
+
+void common(void) {
+	BLANK();
+	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+	DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
+	DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
+
+	BLANK();
+	OFFSET(TI_flags, thread_info, flags);
+	OFFSET(TI_status, thread_info, status);
+	OFFSET(TI_addr_limit, thread_info, addr_limit);
+	OFFSET(TI_preempt_count, thread_info, preempt_count);
+
+	BLANK();
+	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+
+	BLANK();
+	OFFSET(pbe_address, pbe, address);
+	OFFSET(pbe_orig_address, pbe, orig_address);
+	OFFSET(pbe_next, pbe, next);
+
+#ifdef CONFIG_PARAVIRT
+	BLANK();
+	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
+	OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
+	OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
+	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
+	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
+	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
+#endif
+
+#ifdef CONFIG_XEN
+	BLANK();
+	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
+
+	BLANK();
+	OFFSET(BP_scratch, boot_params, scratch);
+	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
+	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
+	OFFSET(BP_version, boot_params, hdr.version);
+	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a4088d..c29d631 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,26 +1,4 @@
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed
- * to extract and format the required data.
- */
-
-#include <linux/crypto.h>
-#include <linux/sched.h>
-#include <linux/signal.h>
-#include <linux/personality.h>
-#include <linux/suspend.h>
-#include <linux/kbuild.h>
 #include <asm/ucontext.h>
-#include <asm/sigframe.h>
-#include <asm/pgtable.h>
-#include <asm/fixmap.h>
-#include <asm/processor.h>
-#include <asm/thread_info.h>
-#include <asm/bootparam.h>
-#include <asm/elf.h>
-#include <asm/suspend.h>
-
-#include <xen/interface/xen.h>
 
 #include <linux/lguest.h>
 #include "../../../drivers/lguest/lg.h"
@@ -51,21 +29,10 @@ void foo(void)
 	OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
 	BLANK();
 
-	OFFSET(TI_task, thread_info, task);
-	OFFSET(TI_exec_domain, thread_info, exec_domain);
-	OFFSET(TI_flags, thread_info, flags);
-	OFFSET(TI_status, thread_info, status);
-	OFFSET(TI_preempt_count, thread_info, preempt_count);
-	OFFSET(TI_addr_limit, thread_info, addr_limit);
-	OFFSET(TI_restart_block, thread_info, restart_block);
 	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
 	OFFSET(TI_cpu, thread_info, cpu);
 	BLANK();
 
-	OFFSET(GDS_size, desc_ptr, size);
-	OFFSET(GDS_address, desc_ptr, address);
-	BLANK();
-
 	OFFSET(PT_EBX, pt_regs, bx);
 	OFFSET(PT_ECX, pt_regs, cx);
 	OFFSET(PT_EDX, pt_regs, dx);
@@ -85,42 +52,13 @@ void foo(void)
 	OFFSET(PT_OLDSS,  pt_regs, ss);
 	BLANK();
 
-	OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
 	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
 	BLANK();
 
-	OFFSET(pbe_address, pbe, address);
-	OFFSET(pbe_orig_address, pbe, orig_address);
-	OFFSET(pbe_next, pbe, next);
-
 	/* Offset from the sysenter stack to tss.sp0 */
 	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
 		 sizeof(struct tss_struct));
 
-	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-	DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
-	DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
-
-	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
-
-#ifdef CONFIG_PARAVIRT
-	BLANK();
-	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
-	OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
-	OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
-	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
-	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
-	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
-	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
-#endif
-
-#ifdef CONFIG_XEN
-	BLANK();
-	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
-	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
-#endif
-
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
@@ -139,11 +77,4 @@ void foo(void)
 	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
 	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
 #endif
-
-	BLANK();
-	OFFSET(BP_scratch, boot_params, scratch);
-	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
-	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
-	OFFSET(BP_version, boot_params, hdr.version);
-	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeed..e72a119 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,27 +1,4 @@
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed to extract
- * and format the required data.
- */
-#define COMPILE_OFFSETS
-
-#include <linux/crypto.h>
-#include <linux/sched.h> 
-#include <linux/stddef.h>
-#include <linux/errno.h> 
-#include <linux/hardirq.h>
-#include <linux/suspend.h>
-#include <linux/kbuild.h>
-#include <asm/processor.h>
-#include <asm/segment.h>
-#include <asm/thread_info.h>
 #include <asm/ia32.h>
-#include <asm/bootparam.h>
-#include <asm/suspend.h>
-
-#include <xen/interface/xen.h>
-
-#include <asm/sigframe.h>
 
 #define __NO_STUBS 1
 #undef __SYSCALL
@@ -33,41 +10,19 @@ static char syscalls[] = {
 
 int main(void)
 {
-#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
-	ENTRY(state);
-	ENTRY(flags); 
-	ENTRY(pid);
-	BLANK();
-#undef ENTRY
-#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
-	ENTRY(flags);
-	ENTRY(addr_limit);
-	ENTRY(preempt_count);
-	ENTRY(status);
-#ifdef CONFIG_IA32_EMULATION
-	ENTRY(sysenter_return);
-#endif
-	BLANK();
-#undef ENTRY
 #ifdef CONFIG_PARAVIRT
-	BLANK();
-	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
-	OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
-	OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
-	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
-	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
-	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
 	OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
 	OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
-	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
 	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
-	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
+	BLANK();
 #endif
 
-
 #ifdef CONFIG_IA32_EMULATION
-#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
+	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+	BLANK();
+
+#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
 	ENTRY(ax);
 	ENTRY(bx);
 	ENTRY(cx);
@@ -79,15 +34,12 @@ int main(void)
 	ENTRY(ip);
 	BLANK();
 #undef ENTRY
-	DEFINE(IA32_RT_SIGFRAME_sigcontext,
-	       offsetof (struct rt_sigframe_ia32, uc.uc_mcontext));
+
+	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
 	BLANK();
 #endif
-	DEFINE(pbe_address, offsetof(struct pbe, address));
-	DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
-	DEFINE(pbe_next, offsetof(struct pbe, next));
-	BLANK();
-#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
+
+#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
 	ENTRY(bx);
 	ENTRY(bx);
 	ENTRY(cx);
@@ -107,7 +59,8 @@ int main(void)
 	ENTRY(flags);
 	BLANK();
 #undef ENTRY
-#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry))
+
+#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
 	ENTRY(cr0);
 	ENTRY(cr2);
 	ENTRY(cr3);
@@ -115,26 +68,11 @@ int main(void)
 	ENTRY(cr8);
 	BLANK();
 #undef ENTRY
-	DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
-	BLANK();
-	DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
-	BLANK();
-	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
 
+	OFFSET(TSS_ist, tss_struct, x86_tss.ist);
 	BLANK();
-	OFFSET(BP_scratch, boot_params, scratch);
-	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
-	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
-	OFFSET(BP_version, boot_params, hdr.version);
-	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
 
-	BLANK();
-	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-#ifdef CONFIG_XEN
-	BLANK();
-	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
-	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
-#undef ENTRY
-#endif
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+
 	return 0;
 }
-- 
cgit v0.10.2


From 986c011ddbb3ed44b35e1bfd67f6aa60b293b495 Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Wed, 9 Feb 2011 16:04:25 -0800
Subject: genirq: Call bus_lock/unlock functions in setup_irq()

irq_chips that supply .irq_bus_lock/.irq_bus_sync_unlock functions,
expect that the other chip methods will be called inside of calls to
the pair.  If this expectation is not met, things tend to not work.

Make setup_irq() call chip_bus_lock()/chip_bus_sync_unlock() too.

For the vast majority of irq_chips, this will be a NOP as most don't
have these bus lock functions.

[ tglx: No we don't want to call that in __setup_irq(). Way too many
  	error exit pathes. ]

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
LKML-Reference: <1297296265-18680-1-git-send-email-ddaney@caviumnetworks.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f..a00bf2c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -871,9 +871,14 @@ out_thread:
  */
 int setup_irq(unsigned int irq, struct irqaction *act)
 {
+	int retval;
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	return __setup_irq(irq, desc, act);
+	chip_bus_lock(desc);
+	retval = __setup_irq(irq, desc, act);
+	chip_bus_sync_unlock(desc);
+
+	return retval;
 }
 EXPORT_SYMBOL_GPL(setup_irq);
 
-- 
cgit v0.10.2


From 0849327d13a0bd7f6512b7c21f4b3e79efb2076d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 11 Feb 2011 12:09:54 -0200
Subject: perf report: Fix initializion of annotate symbol priv area

We only allocate it when in TUI mode. In --stdio mode unconditionally
initializing this area leads to memory corruption.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f403ace..f9a99a1 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -44,6 +44,7 @@ static const char	default_pretty_printing_style[] = "normal";
 static const char	*pretty_printing_style = default_pretty_printing_style;
 
 static char		callchain_default_opt[] = "fractal,0.5";
+static symbol_filter_t	annotate_init;
 
 static struct hists *perf_session__hists_findnew(struct perf_session *self,
 						 u64 event_stream, u32 type,
@@ -167,7 +168,7 @@ static int process_sample_event(union perf_event *event,
 	struct perf_event_attr *attr;
 
 	if (perf_event__preprocess_sample(event, session, &al, sample,
-					  symbol__annotate_init) < 0) {
+					  annotate_init) < 0) {
 		fprintf(stderr, "problem processing %d event, skipping it.\n",
 			event->header.type);
 		return -1;
@@ -520,6 +521,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	 */
 	if (use_browser > 0) {
 		symbol_conf.priv_size = sizeof(struct annotation);
+		annotate_init	      = symbol__annotate_init;
 		/*
  		 * For searching by name on the "Browse map details".
  		 * providing it only in verbose mode not to bloat too
-- 
cgit v0.10.2


From b052181a985592f81767f631f9f42accb4b436cd Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 11 Feb 2011 15:23:56 +0000
Subject: xen: events: mark cpu_evtchn_mask_p as __refdata

This variable starts out pointing at init_evtchn_mask which is marked
__initdata but is set to point to a non-init data region in xen_init_IRQ
which is itself an __init function so this is safe.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Tested-and-acked-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 7468147..a313890 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -114,7 +114,7 @@ struct cpu_evtchn_s {
 static __initdata struct cpu_evtchn_s init_evtchn_mask = {
 	.bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
 };
-static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+static struct cpu_evtchn_s __refdata *cpu_evtchn_mask_p = &init_evtchn_mask;
 
 static inline unsigned long *cpu_evtchn_mask(int cpu)
 {
-- 
cgit v0.10.2


From 6b08cfebd3bd346d8a2fd68a2265fc7736849802 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 11 Feb 2011 15:23:58 +0000
Subject: xen p2m: annotate variable which appears unused

 CC      arch/x86/xen/p2m.o
arch/x86/xen/p2m.c: In function 'm2p_remove_override':
arch/x86/xen/p2m.c:460: warning: 'address' may be used uninitialized in this function
arch/x86/xen/p2m.c: In function 'm2p_add_override':
arch/x86/xen/p2m.c:426: warning: 'address' may be used uninitialized in this function

In actual fact address is inialised in one "if (!PageHighMem(page))"
statement and used in a second and so is always initialised before
use.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index fd12d7c..89342e5 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -421,7 +421,7 @@ int m2p_add_override(unsigned long mfn, struct page *page)
 {
 	unsigned long flags;
 	unsigned long pfn;
-	unsigned long address;
+	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;
 
@@ -455,7 +455,7 @@ int m2p_remove_override(struct page *page)
 	unsigned long flags;
 	unsigned long mfn;
 	unsigned long pfn;
-	unsigned long address;
+	unsigned long uninitialized_var(address);
 	unsigned level;
 	pte_t *ptep = NULL;
 
-- 
cgit v0.10.2


From 44b46c3ef805793ab3a7730dc71c72d0f258ea8e Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@eu.citrix.com>
Date: Fri, 11 Feb 2011 16:37:41 +0000
Subject: xen: annotate functions which only call into __init at start of day

Both xen_hvm_init_shared_info and xen_build_mfn_list_list can be
called at resume time as well as at start of day but only reference
__init functions (extend_brk) at start of day. Hence annotate with
__ref.

    WARNING: arch/x86/built-in.o(.text+0x4f1): Section mismatch in reference
        from the function xen_hvm_init_shared_info() to the function
        .init.text:extend_brk()
    The function xen_hvm_init_shared_info() references
    the function __init extend_brk().
    This is often because xen_hvm_init_shared_info lacks a __init
    annotation or the annotation of extend_brk is wrong.

xen_hvm_init_shared_info calls extend_brk() iff !shared_info_page and
initialises shared_info_page with the result. This happens at start of
day only.

    WARNING: arch/x86/built-in.o(.text+0x599b): Section mismatch in reference
        from the function xen_build_mfn_list_list() to the function
        .init.text:extend_brk()
    The function xen_build_mfn_list_list() references
    the function __init extend_brk().
    This is often because xen_build_mfn_list_list lacks a __init
    annotation or the annotation of extend_brk is wrong.

(this warning occurs multiple times)

xen_build_mfn_list_list only calls extend_brk() at boot time, while
building the initial mfn list list

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542ef..28e6d42 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1292,7 +1292,7 @@ static int init_hvm_pv_info(int *major, int *minor)
 	return 0;
 }
 
-void xen_hvm_init_shared_info(void)
+void __ref xen_hvm_init_shared_info(void)
 {
 	int cpu;
 	struct xen_add_to_physmap xatp;
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 89342e5..05cfc6a 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -136,7 +136,7 @@ static void p2m_init(unsigned long *p2m)
  * - After resume we're called from within stop_machine, but the mfn
  *   tree should alreay be completely allocated.
  */
-void xen_build_mfn_list_list(void)
+void __ref xen_build_mfn_list_list(void)
 {
 	unsigned long pfn;
 
-- 
cgit v0.10.2


From 868baf07b1a259f5f3803c1dc2777b6c358f83cf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 10 Feb 2011 21:26:13 -0500
Subject: ftrace: Fix memory leak with function graph and cpu hotplug

When the fuction graph tracer starts, it needs to make a special
stack for each task to save the real return values of the tasks.
All running tasks have this stack created, as well as any new
tasks.

On CPU hot plug, the new idle task will allocate a stack as well
when init_idle() is called. The problem is that cpu hotplug does
not create a new idle_task. Instead it uses the idle task that
existed when the cpu went down.

ftrace_graph_init_task() will add a new ret_stack to the task
that is given to it. Because a clone will make the task
have a stack of its parent it does not check if the task's
ret_stack is already NULL or not. When the CPU hotplug code
starts a CPU up again, it will allocate a new stack even
though one already existed for it.

The solution is to treat the idle_task specially. In fact, the
function_graph code already does, just not at init_idle().
Instead of using the ftrace_graph_init_task() for the idle task,
which that function expects the task to be a clone, have a
separate ftrace_graph_init_idle_task(). Also, we will create a
per_cpu ret_stack that is used by the idle task. When we call
ftrace_graph_init_idle_task() it will check if the idle task's
ret_stack is NULL, if it is, then it will assign it the per_cpu
ret_stack.

Reported-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stable Tree <stable@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index dcd6a7c..ca29e03 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -428,6 +428,7 @@ extern void unregister_ftrace_graph(void);
 
 extern void ftrace_graph_init_task(struct task_struct *t);
 extern void ftrace_graph_exit_task(struct task_struct *t);
+extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu);
 
 static inline int task_curr_ret_stack(struct task_struct *t)
 {
@@ -451,6 +452,7 @@ static inline void unpause_graph_tracing(void)
 
 static inline void ftrace_graph_init_task(struct task_struct *t) { }
 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
+static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { }
 
 static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 			  trace_func_graph_ent_t entryfunc)
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4..fbe86cb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5571,7 +5571,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
-	ftrace_graph_init_task(idle);
+	ftrace_graph_init_idle_task(idle, cpu);
 }
 
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae..888b611 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3328,7 +3328,7 @@ static int start_graph_tracing(void)
 	/* The cpu_boot init_task->ret_stack will never be freed */
 	for_each_online_cpu(cpu) {
 		if (!idle_task(cpu)->ret_stack)
-			ftrace_graph_init_task(idle_task(cpu));
+			ftrace_graph_init_idle_task(idle_task(cpu), cpu);
 	}
 
 	do {
@@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void)
 	mutex_unlock(&ftrace_lock);
 }
 
+static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+
+static void
+graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+{
+	atomic_set(&t->tracing_graph_pause, 0);
+	atomic_set(&t->trace_overrun, 0);
+	t->ftrace_timestamp = 0;
+	/* make curr_ret_stack visable before we add the ret_stack */
+	smp_wmb();
+	t->ret_stack = ret_stack;
+}
+
+/*
+ * Allocate a return stack for the idle task. May be the first
+ * time through, or it may be done by CPU hotplug online.
+ */
+void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
+{
+	t->curr_ret_stack = -1;
+	/*
+	 * The idle task has no parent, it either has its own
+	 * stack or no stack at all.
+	 */
+	if (t->ret_stack)
+		WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
+
+	if (ftrace_graph_active) {
+		struct ftrace_ret_stack *ret_stack;
+
+		ret_stack = per_cpu(idle_ret_stack, cpu);
+		if (!ret_stack) {
+			ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+					    * sizeof(struct ftrace_ret_stack),
+					    GFP_KERNEL);
+			if (!ret_stack)
+				return;
+			per_cpu(idle_ret_stack, cpu) = ret_stack;
+		}
+		graph_init_task(t, ret_stack);
+	}
+}
+
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
@@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 				GFP_KERNEL);
 		if (!ret_stack)
 			return;
-		atomic_set(&t->tracing_graph_pause, 0);
-		atomic_set(&t->trace_overrun, 0);
-		t->ftrace_timestamp = 0;
-		/* make curr_ret_stack visable before we add the ret_stack */
-		smp_wmb();
-		t->ret_stack = ret_stack;
+		graph_init_task(t, ret_stack);
 	}
 }
 
-- 
cgit v0.10.2


From 0de4b34d466bae571b50f41c7296b85248205e35 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Mon, 14 Feb 2011 14:48:07 +0900
Subject: tracing/kprobe: Fix NULL pointer deref check

Add NULL check for avoiding NULL pointer deref.
This bug has been introduced by:

  1ff511e35ed8: tracing/kprobes: Add bitfield type

which causes a null pointer dereference bug when kprobe-tracer
parses an argument without type.

Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110214054807.8919.69740.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ccdc542..8435b43 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -935,7 +935,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
 	parg->offset = tp->size;
 	tp->size += parg->type->size;
 	ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
-	if (ret >= 0)
+	if (ret >= 0 && t != NULL)
 		ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
 	if (ret >= 0) {
 		parg->fetch_size.fn = get_fetch_size_function(parg->type,
-- 
cgit v0.10.2


From 60f6e65d7887c257392313755f95540ef5e7ea89 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 17 Jan 2011 10:52:02 +0800
Subject: x86: Cleanup vector usage

Cleanup the vector usage and make them continuous if possible.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
LKML-Reference: <1295232722.1949.707.camel@sli10-conroe>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6af0894..42f0d4a 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_IRQ_VECTORS_H
 #define _ASM_X86_IRQ_VECTORS_H
 
+#include <linux/threads.h>
 /*
  * Linux IRQ vector layout.
  *
@@ -16,8 +17,8 @@
  *  Vectors   0 ...  31 : system traps and exceptions - hardcoded events
  *  Vectors  32 ... 127 : device interrupts
  *  Vector  128         : legacy int80 syscall interface
- *  Vectors 129 ... 237 : device interrupts
- *  Vectors 238 ... 255 : special interrupts
+ *  Vectors 129 ... 229 : device interrupts
+ *  Vectors 230 ... 255 : special interrupts
  *
  * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
  *
@@ -96,37 +97,38 @@
 #define THRESHOLD_APIC_VECTOR		0xf9
 #define REBOOT_VECTOR			0xf8
 
-/* f0-f7 used for spreading out TLB flushes: */
-#define INVALIDATE_TLB_VECTOR_END	0xf7
-#define INVALIDATE_TLB_VECTOR_START	0xf0
-#define NUM_INVALIDATE_TLB_VECTORS	   8
-
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR		0xef
-
 /*
  * Generic system vector for platform specific use
  */
-#define X86_PLATFORM_IPI_VECTOR		0xed
+#define X86_PLATFORM_IPI_VECTOR		0xf7
 
 /*
  * IRQ work vector:
  */
-#define IRQ_WORK_VECTOR			0xec
+#define IRQ_WORK_VECTOR			0xf6
 
-#define UV_BAU_MESSAGE			0xea
+#define UV_BAU_MESSAGE			0xf5
 
 /*
  * Self IPI vector for machine checks
  */
-#define MCE_SELF_VECTOR			0xeb
+#define MCE_SELF_VECTOR			0xf4
 
 /* Xen vector callback to receive events in a HVM domain */
-#define XEN_HVM_EVTCHN_CALLBACK		0xe9
+#define XEN_HVM_EVTCHN_CALLBACK		0xf3
+
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR		0xef
+
+/* f0-f7 used for spreading out TLB flushes: */
+#define NUM_INVALIDATE_TLB_VECTORS	   8
+#define INVALIDATE_TLB_VECTOR_END	0xee
+#define INVALIDATE_TLB_VECTOR_START	\
+	(INVALIDATE_TLB_VECTOR_END - NUM_INVALIDATE_TLB_VECTORS + 1)
 
 #define NR_VECTORS			 256
 
-- 
cgit v0.10.2


From 3a09fb4570a1cce11472b8e5da3f6ee409f529d5 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 17 Jan 2011 10:52:05 +0800
Subject: x86: Allocate 32 tlb_invalidate_interrupt handler stubs

Add up to 32 invalidate_interrupt handlers. How many handlers are
added depends on NUM_INVALIDATE_TLB_VECTORS. So if
NUM_INVALIDATE_TLB_VECTORS is smaller than 32, we reduce code
size.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
LKML-Reference: <1295232725.1949.708.camel@sli10-conroe>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 57650ab..1cd6d26 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,10 +16,13 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
 BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 
-.irpc idx, "01234567"
+.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if NUM_INVALIDATE_TLB_VECTORS > \idx
 BUILD_INTERRUPT3(invalidate_interrupt\idx,
 		 (INVALIDATE_TLB_VECTOR_START)+\idx,
 		 smp_invalidate_interrupt)
+.endif
 .endr
 #endif
 
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 0274ec5..bb9efe8 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void);
 extern void invalidate_interrupt5(void);
 extern void invalidate_interrupt6(void);
 extern void invalidate_interrupt7(void);
+extern void invalidate_interrupt8(void);
+extern void invalidate_interrupt9(void);
+extern void invalidate_interrupt10(void);
+extern void invalidate_interrupt11(void);
+extern void invalidate_interrupt12(void);
+extern void invalidate_interrupt13(void);
+extern void invalidate_interrupt14(void);
+extern void invalidate_interrupt15(void);
+extern void invalidate_interrupt16(void);
+extern void invalidate_interrupt17(void);
+extern void invalidate_interrupt18(void);
+extern void invalidate_interrupt19(void);
+extern void invalidate_interrupt20(void);
+extern void invalidate_interrupt21(void);
+extern void invalidate_interrupt22(void);
+extern void invalidate_interrupt23(void);
+extern void invalidate_interrupt24(void);
+extern void invalidate_interrupt25(void);
+extern void invalidate_interrupt26(void);
+extern void invalidate_interrupt27(void);
+extern void invalidate_interrupt28(void);
+extern void invalidate_interrupt29(void);
+extern void invalidate_interrupt30(void);
+extern void invalidate_interrupt31(void);
 
 extern void irq_move_cleanup_interrupt(void);
 extern void reboot_interrupt(void);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffb..891268c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -975,9 +975,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
 	x86_platform_ipi smp_x86_platform_ipi
 
 #ifdef CONFIG_SMP
-.irpc idx, "01234567"
+.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if NUM_INVALIDATE_TLB_VECTORS > \idx
 apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
 	invalidate_interrupt\idx smp_invalidate_interrupt
+.endif
 .endr
 #endif
 
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c752e97..7aad10a 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -164,14 +164,77 @@ static void __init smp_intr_init(void)
 	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
 	/* IPIs for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+#define ALLOC_INVTLB_VEC(NR) \
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
+		invalidate_interrupt##NR)
+
+	switch (NUM_INVALIDATE_TLB_VECTORS) {
+	default:
+		ALLOC_INVTLB_VEC(31);
+	case 31:
+		ALLOC_INVTLB_VEC(30);
+	case 30:
+		ALLOC_INVTLB_VEC(29);
+	case 29:
+		ALLOC_INVTLB_VEC(28);
+	case 28:
+		ALLOC_INVTLB_VEC(27);
+	case 27:
+		ALLOC_INVTLB_VEC(26);
+	case 26:
+		ALLOC_INVTLB_VEC(25);
+	case 25:
+		ALLOC_INVTLB_VEC(24);
+	case 24:
+		ALLOC_INVTLB_VEC(23);
+	case 23:
+		ALLOC_INVTLB_VEC(22);
+	case 22:
+		ALLOC_INVTLB_VEC(21);
+	case 21:
+		ALLOC_INVTLB_VEC(20);
+	case 20:
+		ALLOC_INVTLB_VEC(19);
+	case 19:
+		ALLOC_INVTLB_VEC(18);
+	case 18:
+		ALLOC_INVTLB_VEC(17);
+	case 17:
+		ALLOC_INVTLB_VEC(16);
+	case 16:
+		ALLOC_INVTLB_VEC(15);
+	case 15:
+		ALLOC_INVTLB_VEC(14);
+	case 14:
+		ALLOC_INVTLB_VEC(13);
+	case 13:
+		ALLOC_INVTLB_VEC(12);
+	case 12:
+		ALLOC_INVTLB_VEC(11);
+	case 11:
+		ALLOC_INVTLB_VEC(10);
+	case 10:
+		ALLOC_INVTLB_VEC(9);
+	case 9:
+		ALLOC_INVTLB_VEC(8);
+	case 8:
+		ALLOC_INVTLB_VEC(7);
+	case 7:
+		ALLOC_INVTLB_VEC(6);
+	case 6:
+		ALLOC_INVTLB_VEC(5);
+	case 5:
+		ALLOC_INVTLB_VEC(4);
+	case 4:
+		ALLOC_INVTLB_VEC(3);
+	case 3:
+		ALLOC_INVTLB_VEC(2);
+	case 2:
+		ALLOC_INVTLB_VEC(1);
+	case 1:
+		ALLOC_INVTLB_VEC(0);
+		break;
+	}
 
 	/* IPI for generic function call */
 	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-- 
cgit v0.10.2


From 70e4a369733a21e3d16b059a6ccdad22a344bf57 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 17 Jan 2011 10:52:07 +0800
Subject: x86: Scale up the number of TLB invalidate vectors with NR_CPUs, up
 to 32

Make the maxium TLB invalidate vectors depend on NR_CPUS linearly,
with a maximum of 32 vectors.

We currently only have 8 vectors for TLB invalidation and that is clearly
inadequate. If we have a lot of CPUs, the CPUs need share the 8 vectors and
tlbstate_lock is used to protect them. flush_tlb_page() is
heavily used in page reclaim, which will cause a lot of lock
contention for tlbstate_lock.

Andi Kleen suggested increasing the vectors number to 32, which should be
good for current typical systems to reduce the tlbstate_lock contention.

My test system has 4 sockets and 64G memory, and 64 CPUs. My
workload creates 64 processes. Each process mmap reads a big
empty sparse file. The total size of the files are 2*total_mem,
so this will cause a lot of page reclaim.

Below is the result I get from perf call-graph profiling:

 without the patch:
 ------------------

    24.25%           usemem  [kernel]                                   [k] _raw_spin_lock
                     |
                     --- _raw_spin_lock
                        |
                        |--42.15%-- native_flush_tlb_others

 with the patch:
 ------------------

    14.96%           usemem  [kernel]                                   [k] _raw_spin_lock
                     |
                     --- _raw_spin_lock
                        |--13.89%-- native_flush_tlb_others

So this heavily reduces the tlbstate_lock contention.

Suggested-by: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1295232727.1949.709.camel@sli10-conroe>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 42f0d4a..4980f48 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -17,8 +17,8 @@
  *  Vectors   0 ...  31 : system traps and exceptions - hardcoded events
  *  Vectors  32 ... 127 : device interrupts
  *  Vector  128         : legacy int80 syscall interface
- *  Vectors 129 ... 229 : device interrupts
- *  Vectors 230 ... 255 : special interrupts
+ *  Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
+ *  Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
  *
  * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
  *
@@ -124,8 +124,13 @@
  */
 #define LOCAL_TIMER_VECTOR		0xef
 
-/* f0-f7 used for spreading out TLB flushes: */
-#define NUM_INVALIDATE_TLB_VECTORS	   8
+/* up to 32 vectors used for spreading out TLB flushes: */
+#if NR_CPUS <= 32
+# define NUM_INVALIDATE_TLB_VECTORS NR_CPUS
+#else
+# define NUM_INVALIDATE_TLB_VECTORS 32
+#endif
+
 #define INVALIDATE_TLB_VECTOR_END	0xee
 #define INVALIDATE_TLB_VECTOR_START	\
 	(INVALIDATE_TLB_VECTOR_END - NUM_INVALIDATE_TLB_VECTORS + 1)
-- 
cgit v0.10.2


From 7064d865af804b9b841e7b9a3e9b653e40c3e5ca Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 17 Jan 2011 10:52:10 +0800
Subject: x86: Avoid tlbstate lock if not enough cpus

This one isn't related to previous patch. If online cpus are
below NUM_INVALIDATE_TLB_VECTORS, we don't need the lock. The
comments in the code declares we don't need the check, but a hot
lock still needs an atomic operation and expensive, so add the
check here.

Uses nr_cpu_ids here as suggested by Eric Dumazet.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andi Kleen <andi@firstfloor.org>
LKML-Reference: <1295232730.1949.710.camel@sli10-conroe>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6acc724..55272d7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 	sender = this_cpu_read(tlb_vector_offset);
 	f = &flush_state[sender];
 
-	/*
-	 * Could avoid this lock when
-	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
-	 * probably not worth checking this for a cache-hot lock.
-	 */
-	raw_spin_lock(&f->tlbstate_lock);
+	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
+		raw_spin_lock(&f->tlbstate_lock);
 
 	f->flush_mm = mm;
 	f->flush_va = va;
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 
 	f->flush_mm = NULL;
 	f->flush_va = 0;
-	raw_spin_unlock(&f->tlbstate_lock);
+	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
+		raw_spin_unlock(&f->tlbstate_lock);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
-- 
cgit v0.10.2


From 77eed821accf5dd962b1f13bed0680e217e49112 Mon Sep 17 00:00:00 2001
From: Kamal Mostafa <kamal@canonical.com>
Date: Thu, 3 Feb 2011 17:38:04 -0800
Subject: x86: Fix panic when handling "mem={invalid}" param

Avoid removing all of memory and panicing when "mem={invalid}"
is specified, e.g. mem=blahblah, mem=0, or mem=nopentium (on
platforms other than x86_32).

Signed-off-by: Kamal Mostafa <kamal@canonical.com>
BugLink: http://bugs.launchpad.net/bugs/553464
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: <stable@kernel.org> # .3x: as far back as it applies
LKML-Reference: <1296783486-23033-1-git-send-email-kamal@canonical.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 294f26d..55a59d8 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -856,6 +856,9 @@ static int __init parse_memopt(char *p)
 
 	userdef = 1;
 	mem_size = memparse(p, &p);
+	/* don't remove all of memory when handling "mem={invalid}" param */
+	if (mem_size == 0)
+		return -EINVAL;
 	e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
 	return 0;
-- 
cgit v0.10.2


From 9a6d44b9adb777ca9549e88cd55bd8f2673c52a2 Mon Sep 17 00:00:00 2001
From: Kamal Mostafa <kamal@canonical.com>
Date: Thu, 3 Feb 2011 17:38:05 -0800
Subject: x86: Emit "mem=nopentium ignored" warning when not supported

Emit warning when "mem=nopentium" is specified on any arch other
than x86_32 (the only that arch supports it).

Signed-off-by: Kamal Mostafa <kamal@canonical.com>
BugLink: http://bugs.launchpad.net/bugs/553464
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
LKML-Reference: <1296783486-23033-2-git-send-email-kamal@canonical.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 55a59d8..0b5e2b5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -847,12 +847,15 @@ static int __init parse_memopt(char *p)
 	if (!p)
 		return -EINVAL;
 
-#ifdef CONFIG_X86_32
 	if (!strcmp(p, "nopentium")) {
+#ifdef CONFIG_X86_32
 		setup_clear_cpu_cap(X86_FEATURE_PSE);
 		return 0;
-	}
+#else
+		printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
+		return -EINVAL;
 #endif
+	}
 
 	userdef = 1;
 	mem_size = memparse(p, &p);
-- 
cgit v0.10.2


From e5fea868e6c04343e501176a373d568c1c0094aa Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 8 Feb 2011 23:22:17 -0800
Subject: x86: Fix and clean up generic_processor_info()

One of the error printouts in generic_processor_info() prints out
the APIC version instead of the cpu index the warning text describes.

Move version validation down, after we get the right cpu index.

-v2: add comments about reason why we can have cpu=0 there.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D5240A9.4080703@kernel.org>
[ Cleaned up and made the BIOS bug printouts more consistent ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 06c196d..628dcdb 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1925,17 +1925,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
 {
 	int cpu;
 
-	/*
-	 * Validate version
-	 */
-	if (version == 0x0) {
-		pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
-			   "fixing up to 0x10. (tell your hw vendor)\n",
-				version);
-		version = 0x10;
-	}
-	apic_version[apicid] = version;
-
 	if (num_processors >= nr_cpu_ids) {
 		int max = nr_cpu_ids;
 		int thiscpu = max + disabled_cpus;
@@ -1949,22 +1938,34 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	}
 
 	num_processors++;
-	cpu = cpumask_next_zero(-1, cpu_present_mask);
-
-	if (version != apic_version[boot_cpu_physical_apicid])
-		WARN_ONCE(1,
-			"ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
-			apic_version[boot_cpu_physical_apicid], cpu, version);
-
-	physid_set(apicid, phys_cpu_present_map);
 	if (apicid == boot_cpu_physical_apicid) {
 		/*
 		 * x86_bios_cpu_apicid is required to have processors listed
 		 * in same order as logical cpu numbers. Hence the first
 		 * entry is BSP, and so on.
+		 * boot_cpu_init() already hold bit 0 in cpu_present_mask
+		 * for BSP.
 		 */
 		cpu = 0;
+	} else
+		cpu = cpumask_next_zero(-1, cpu_present_mask);
+
+	/*
+	 * Validate version
+	 */
+	if (version == 0x0) {
+		pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
+			   cpu, apicid);
+		version = 0x10;
 	}
+	apic_version[apicid] = version;
+
+	if (version != apic_version[boot_cpu_physical_apicid]) {
+		pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
+			apic_version[boot_cpu_physical_apicid], cpu, version);
+	}
+
+	physid_set(apicid, phys_cpu_present_map);
 	if (apicid > max_physical_apicid)
 		max_physical_apicid = apicid;
 
-- 
cgit v0.10.2


From 14392fd329eca9b59d51c0aa5d0acfb4965424d1 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 7 Feb 2011 14:08:53 -0800
Subject: x86, numa: Add error handling for bad cpu-to-node mappings

CONFIG_DEBUG_PER_CPU_MAPS may return NUMA_NO_NODE when an
early_cpu_to_node() mapping hasn't been initialized.  In such a
case, it emits a warning and continues without an issue but
callers may try to use the return value to index into an array.

We can catch those errors and fail silently since a warning has
already been emitted.  No current user of numa_add_cpu()
requires this error checking to avoid a crash, but it's better
to be proactive in case a future user happens to have a bug and
a user tries to diagnose it with CONFIG_DEBUG_PER_CPU_MAPS.

Reported-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
LKML-Reference: <alpine.DEB.2.00.1102071407250.7812@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index bf60715..9559d36 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -219,6 +219,10 @@ struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
 	struct cpumask *mask;
 	char buf[64];
 
+	if (node == NUMA_NO_NODE) {
+		/* early_cpu_to_node() already emits a warning and trace */
+		return NULL;
+	}
 	mask = node_to_cpumask_map[node];
 	if (!mask) {
 		pr_err("node_to_cpumask_map[%i] NULL\n", node);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index f548fbf..3f9411e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -709,6 +709,10 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
 	struct cpumask *mask;
 	int i;
 
+	if (node == NUMA_NO_NODE) {
+		/* early_cpu_to_node() already emits a warning and trace */
+		return;
+	}
 	for_each_online_node(i) {
 		unsigned long addr;
 
-- 
cgit v0.10.2


From 6b617e224dfac0b64ed70dacdac50be6eb78a6a1 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Tue, 15 Feb 2011 00:13:31 +0800
Subject: x86/platform: Add a wallclock_init func to x86_init.timers ops

Some wall clock devices use MMIO based HW register, this new
function will give them a chance to do some initialization work
before their get/set_time service get called, which is usually
in early kernel boot phase.

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 64642ad..643ebf2 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -83,11 +83,13 @@ struct x86_init_paging {
  *				boot cpu
  * @tsc_pre_init:		platform function called before TSC init
  * @timer_init:			initialize the platform timer (default PIT/HPET)
+ * @wallclock_init:		init the wallclock device
  */
 struct x86_init_timers {
 	void (*setup_percpu_clockev)(void);
 	void (*tsc_pre_init)(void);
 	void (*timer_init)(void);
+	void (*wallclock_init)(void);
 };
 
 /**
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 21c6746..68d535a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1059,6 +1059,8 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	x86_init.oem.banner();
 
+	x86_init.timers.wallclock_init();
+
 	mcheck_init();
 
 	local_irq_save(flags);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ceb2911..c11514e9 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -70,6 +70,7 @@ struct x86_init_ops x86_init __initdata = {
 		.setup_percpu_clockev	= setup_boot_APIC_clock,
 		.tsc_pre_init		= x86_init_noop,
 		.timer_init		= hpet_time_init,
+		.wallclock_init		= x86_init_noop,
 	},
 
 	.iommu = {
-- 
cgit v0.10.2


From 168202c7bf89d7a2abaf8deaf4bbed18a1f7b3a3 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Tue, 15 Feb 2011 00:13:32 +0800
Subject: mrst/vrtc: Avoid using cmos rtc ops

If we don't assign Moorestown specific wallclock init and ops function
the rtc/persisent clock code will use cmos rtc for access, this will
crash Moorestown in that the ioports are not present.

Also in vrtc driver, should avoid using cmos access to check UIP status.

[feng.tang@intel.com: use set_fixmap_offset_nocache() to simplify code]
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index fee0b49..4c542c7 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -31,6 +31,7 @@
 #include <asm/apic.h>
 #include <asm/io_apic.h>
 #include <asm/mrst.h>
+#include <asm/mrst-vrtc.h>
 #include <asm/io.h>
 #include <asm/i8259.h>
 #include <asm/intel_scu_ipc.h>
@@ -294,6 +295,7 @@ void __init x86_mrst_early_setup(void)
 
 	x86_platform.calibrate_tsc = mrst_calibrate_tsc;
 	x86_platform.i8042_detect = mrst_i8042_detect;
+	x86_init.timers.wallclock_init = mrst_rtc_init;
 	x86_init.pci.init = pci_mrst_init;
 	x86_init.pci.fixup_irqs = x86_init_noop;
 
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
index 32cd7ed..04cf645 100644
--- a/arch/x86/platform/mrst/vrtc.c
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -100,22 +100,14 @@ int vrtc_set_mmss(unsigned long nowtime)
 
 void __init mrst_rtc_init(void)
 {
-	unsigned long rtc_paddr;
-	void __iomem *virt_base;
+	unsigned long vrtc_paddr = sfi_mrtc_array[0].phys_addr;
 
 	sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
-	if (!sfi_mrtc_num)
+	if (!sfi_mrtc_num || !vrtc_paddr)
 		return;
 
-	rtc_paddr = sfi_mrtc_array[0].phys_addr;
-
-	/* vRTC's register address may not be page aligned */
-	set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr);
-
-	virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
-	virt_base += rtc_paddr & ~PAGE_MASK;
-	vrtc_virt_base = virt_base;
-
+	vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC,
+								vrtc_paddr);
 	x86_platform.get_wallclock = vrtc_get_time;
 	x86_platform.set_wallclock = vrtc_set_mmss;
 }
diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c
index bcd0cf6..28e02e7 100644
--- a/drivers/rtc/rtc-mrst.c
+++ b/drivers/rtc/rtc-mrst.c
@@ -62,6 +62,17 @@ static inline int is_intr(u8 rtc_intr)
 	return rtc_intr & RTC_IRQMASK;
 }
 
+static inline unsigned char vrtc_is_updating(void)
+{
+	unsigned char uip;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	uip = (vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+	return uip;
+}
+
 /*
  * rtc_time's year contains the increment over 1900, but vRTC's YEAR
  * register can't be programmed to value larger than 0x64, so vRTC
@@ -76,7 +87,7 @@ static int mrst_read_time(struct device *dev, struct rtc_time *time)
 {
 	unsigned long flags;
 
-	if (rtc_is_updating())
+	if (vrtc_is_updating())
 		mdelay(20);
 
 	spin_lock_irqsave(&rtc_lock, flags);
-- 
cgit v0.10.2


From 6ea96e7e4946f790330557e4b7c4c8a174c1c6d2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 6 Feb 2011 22:45:31 +0000
Subject: um: Remove stale irq_chip.end

irq_chip.end got obsolete with the remnoval of __do_IRQ().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20110206224515.135703209@linutronix.de>

diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 3f0ac9e..c916748 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -374,7 +374,6 @@ static struct irq_chip normal_irq_type = {
 	.disable = dummy,
 	.enable = dummy,
 	.ack = dummy,
-	.end = dummy
 };
 
 static struct irq_chip SIGVTALRM_irq_type = {
@@ -384,7 +383,6 @@ static struct irq_chip SIGVTALRM_irq_type = {
 	.disable = dummy,
 	.enable = dummy,
 	.ack = dummy,
-	.end = dummy
 };
 
 void __init init_IRQ(void)
-- 
cgit v0.10.2


From 1d119aa06fb2b2608151a162f15c480d46694b65 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 6 Feb 2011 22:45:34 +0000
Subject: um: Convert irq_chips to new functions

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20110206224515.224027758@linutronix.de>

diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index c916748..f771b85 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -360,10 +360,10 @@ EXPORT_SYMBOL(um_request_irq);
 EXPORT_SYMBOL(reactivate_fd);
 
 /*
- * irq_chip must define (startup || enable) &&
- * (shutdown || disable) && end
+ * irq_chip must define at least enable/disable and ack when
+ * the edge handler is used.
  */
-static void dummy(unsigned int irq)
+static void dummy(struct irq_data *d)
 {
 }
 
@@ -371,18 +371,17 @@ static void dummy(unsigned int irq)
 static struct irq_chip normal_irq_type = {
 	.name = "SIGIO",
 	.release = free_irq_by_irq_and_dev,
-	.disable = dummy,
-	.enable = dummy,
-	.ack = dummy,
+	.irq_disable = dummy,
+	.irq_enable = dummy,
+	.irq_ack = dummy,
 };
 
 static struct irq_chip SIGVTALRM_irq_type = {
 	.name = "SIGVTALRM",
 	.release = free_irq_by_irq_and_dev,
-	.shutdown = dummy, /* never called */
-	.disable = dummy,
-	.enable = dummy,
-	.ack = dummy,
+	.irq_disable = dummy,
+	.irq_enable = dummy,
+	.irq_ack = dummy,
 };
 
 void __init init_IRQ(void)
-- 
cgit v0.10.2


From d5b4eea1c575b78448fd5dc54d250ff302ca22f9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 6 Feb 2011 22:45:36 +0000
Subject: um: Use proper accessors in show_interrupts()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20110206224515.322707425@linutronix.de>

diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index f771b85..64cfea8 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -35,8 +35,10 @@ int show_interrupts(struct seq_file *p, void *v)
 	}
 
 	if (i < NR_IRQS) {
-		raw_spin_lock_irqsave(&irq_desc[i].lock, flags);
-		action = irq_desc[i].action;
+		struct irq_desc *desc = irq_to_desc(i);
+
+		raw_spin_lock_irqsave(&desc->lock, flags);
+		action = desc->action;
 		if (!action)
 			goto skip;
 		seq_printf(p, "%3d: ",i);
@@ -46,7 +48,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
 #endif
-		seq_printf(p, " %14s", irq_desc[i].chip->name);
+		seq_printf(p, " %14s", get_irq_desc_chip(desc)->name);
 		seq_printf(p, "  %s", action->name);
 
 		for (action=action->next; action; action = action->next)
@@ -54,7 +56,7 @@ int show_interrupts(struct seq_file *p, void *v)
 
 		seq_putc(p, '\n');
 skip:
-		raw_spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+		raw_spin_unlock_irqrestore(&desc->lock, flags);
 	} else if (i == NR_IRQS)
 		seq_putc(p, '\n');
 
-- 
cgit v0.10.2


From 53c39ce56d203d80ba8217a16bb024b25185fb7e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 6 Feb 2011 22:45:38 +0000
Subject: um: Select GENERIC_HARDIRQS_NO_DEPRECATED

irq chips converted and proper accessor functions used.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20110206224515.430825903@linutronix.de>

diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common
index e351e14..1e78940 100644
--- a/arch/um/Kconfig.common
+++ b/arch/um/Kconfig.common
@@ -7,6 +7,7 @@ config UML
 	bool
 	default y
 	select HAVE_GENERIC_HARDIRQS
+	select GENERIC_HARDIRQS_NO_DEPRECATED
 
 config MMU
 	bool
-- 
cgit v0.10.2


From 7d36b7bc9022f35f95cd85cdf441846298e8f9fb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:06 +0100
Subject: x86-64, NUMA: Make dummy node initialization path similar to
 non-dummy ones

Dummy node initialization in initmem_init() didn't initialize apicid
to node mapping and set cpu to node mapping directly by caling
numa_set_node(), which is different from non-dummy init paths.

Update it such that they behave similarly.  Initialize apicid to node
mapping and call numa_init_array().  The actual cpu to node mapping is
handled by init_cpu_to_node() later.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 43ad327..b7d78d7 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -624,11 +624,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	memnodemap[0] = 0;
 	node_set_online(0);
 	node_set(0, node_possible_map);
-	for (i = 0; i < nr_cpu_ids; i++)
-		numa_set_node(i, 0);
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		set_apicid_to_node(i, NUMA_NO_NODE);
 	memblock_x86_register_active_regions(0, start_pfn, last_pfn);
 	init_memory_mapping_high();
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
+	numa_init_array();
 }
 
 unsigned long __init numa_free_all_bootmem(void)
-- 
cgit v0.10.2


From 13081df5dd6eae1951a3c398fa17d3ed2037a78f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:06 +0100
Subject: x86-64, NUMA: Simplify hotplug node handling in
 acpi_numa_memory_affinity_init()

Hotplug node handling in acpi_numa_memory_affinity_init() was
unnecessarily complicated with storing the original nodes[] entry and
restoring it afterwards.  Simplify it by not modifying the nodes[]
entry for hotplug nodes from the beginning.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 23498f8..988b0b70 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -251,7 +251,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
 void __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
-	struct bootnode *nd, oldnode;
+	struct bootnode *nd;
 	unsigned long start, end;
 	int node, pxm;
 	int i;
@@ -289,28 +289,23 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		bad_srat();
 		return;
 	}
-	nd = &nodes[node];
-	oldnode = *nd;
-	if (!node_test_and_set(node, nodes_parsed)) {
-		nd->start = start;
-		nd->end = end;
-	} else {
-		if (start < nd->start)
-			nd->start = start;
-		if (nd->end < end)
-			nd->end = end;
-	}
 
 	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
 	       start, end);
 
-	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
+		nd = &nodes[node];
+		if (!node_test_and_set(node, nodes_parsed)) {
+			nd->start = start;
+			nd->end = end;
+		} else {
+			if (start < nd->start)
+				nd->start = start;
+			if (nd->end < end)
+				nd->end = end;
+		}
+	} else
 		update_nodes_add(node, start, end);
-		/* restore nodes[node] */
-		*nd = oldnode;
-		if ((nd->start | nd->end) == 0)
-			node_clear(node, nodes_parsed);
-	}
 
 	node_memblk_range[num_node_memblks].start = start;
 	node_memblk_range[num_node_memblks].end = end;
-- 
cgit v0.10.2


From 86ef4dbf1f736bb1a4d567e043e3dd81b8b7860c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:06 +0100
Subject: x86, NUMA: Drop @start/last_pfn from initmem_init()

initmem_init() extensively accesses and modifies global data
structures and the parameters aren't even followed depending on which
path is being used.  Drop @start/last_pfn and let it deal with
@max_pfn directly.  This is in preparation for further NUMA init
cleanups.

- v2: x86-32 initmem_init() weren't updated breaking 32bit builds.
  Fixed.  Found by Yinghai.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 731d211..eb9ed00 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -56,8 +56,7 @@ extern unsigned long init_memory_mapping(unsigned long start,
 
 void init_memory_mapping_high(void);
 
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-				int acpi, int k8);
+extern void initmem_init(int acpi, int k8);
 extern void free_initmem(void);
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ac909ba..756d640 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1003,7 +1003,7 @@ void __init setup_arch(char **cmdline_p)
 		amd = !amd_numa_init(0, max_pfn);
 #endif
 
-	initmem_init(0, max_pfn, acpi, amd);
+	initmem_init(acpi, amd);
 	memblock_find_dma_reserve();
 	dma32_reserve_bootmem();
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c821074..16adb66 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-				int acpi, int k8)
+void __init initmem_init(int acpi, int k8)
 {
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 194f273..04cc027 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -603,10 +603,9 @@ kernel_physical_mapping_init(unsigned long start,
 }
 
 #ifndef CONFIG_NUMA
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-				int acpi, int k8)
+void __init initmem_init(int acpi, int k8)
 {
-	memblock_x86_register_active_regions(0, start_pfn, end_pfn);
+	memblock_x86_register_active_regions(0, 0, max_pfn);
 	init_memory_mapping_high();
 }
 #endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 505bb04..3249b37 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -352,8 +352,7 @@ static void init_remap_allocator(int nid)
 		(ulong) node_remap_end_vaddr[nid]);
 }
 
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-				int acpi, int k8)
+void __init initmem_init(int acpi, int k8)
 {
 	int nid;
 	long kva_target_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index b7d78d7..d7e4aaf 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -579,8 +579,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
-				int acpi, int amd)
+void __init initmem_init(int acpi, int amd)
 {
 	int i;
 
@@ -588,19 +587,16 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
-			acpi, amd);
-	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
+	setup_physnodes(0, max_pfn << PAGE_SHIFT, acpi, amd);
+	if (cmdline && !numa_emulation(0, max_pfn, acpi, amd))
 		return;
-	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
-			acpi, amd);
+	setup_physnodes(0, max_pfn << PAGE_SHIFT, acpi, amd);
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
-	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
-						  last_pfn << PAGE_SHIFT))
+	if (!numa_off && acpi && !acpi_scan_nodes(0, max_pfn << PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
@@ -616,8 +612,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
 
 	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
-	       start_pfn << PAGE_SHIFT,
-	       last_pfn << PAGE_SHIFT);
+	       0LU, max_pfn << PAGE_SHIFT);
 	/* setup dummy node covering all memory */
 	memnode_shift = 63;
 	memnodemap = memnode.embedded_map;
@@ -626,9 +621,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	node_set(0, node_possible_map);
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		set_apicid_to_node(i, NUMA_NO_NODE);
-	memblock_x86_register_active_regions(0, start_pfn, last_pfn);
+	memblock_x86_register_active_regions(0, 0, max_pfn);
 	init_memory_mapping_high();
-	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
+	setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT);
 	numa_init_array();
 }
 
-- 
cgit v0.10.2


From 940fed2e79a15cf0d006c860d7811adbe5c19882 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:06 +0100
Subject: x86-64, NUMA: Unify {acpi|amd}_{numa_init|scan_nodes}() arguments and
 return values

The functions used during NUMA initialization - *_numa_init() and
*_scan_nodes() - have different arguments and return values.  Unify
them such that they all take no argument and return 0 on success and
-errno on failure.  This is in preparation for further NUMA init
cleanups.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 211ca3f..4e5dff9 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -187,7 +187,7 @@ struct bootnode;
 extern int acpi_numa;
 extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
 				unsigned long end);
-extern int acpi_scan_nodes(unsigned long start, unsigned long end);
+extern int acpi_scan_nodes(void);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 
 #ifdef CONFIG_NUMA_EMU
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 2b33c4d..dc3c6e3 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -16,7 +16,7 @@ struct bootnode;
 extern int early_is_amd_nb(u32 value);
 extern int amd_cache_northbridges(void);
 extern void amd_flush_garts(void);
-extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int amd_numa_init(void);
 extern int amd_scan_nodes(void);
 extern int amd_get_subcaches(int);
 extern int amd_set_subcaches(int, int);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 756d640..96810a3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -995,12 +995,12 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi = acpi_numa_init();
+	acpi = !acpi_numa_init();
 #endif
 
 #ifdef CONFIG_AMD_NUMA
 	if (!acpi)
-		amd = !amd_numa_init(0, max_pfn);
+		amd = !amd_numa_init();
 #endif
 
 	initmem_init(acpi, amd);
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 2523c35..655ccff 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -51,7 +51,7 @@ static __init int find_northbridge(void)
 		return num;
 	}
 
-	return -1;
+	return -ENOENT;
 }
 
 static __init void early_get_boot_cpu_id(void)
@@ -69,17 +69,17 @@ static __init void early_get_boot_cpu_id(void)
 #endif
 }
 
-int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+int __init amd_numa_init(void)
 {
-	unsigned long start = PFN_PHYS(start_pfn);
-	unsigned long end = PFN_PHYS(end_pfn);
+	unsigned long start = PFN_PHYS(0);
+	unsigned long end = PFN_PHYS(max_pfn);
 	unsigned numnodes;
 	unsigned long prevbase;
 	int i, nb, found = 0;
 	u32 nodeid, reg;
 
 	if (!early_pci_allowed())
-		return -1;
+		return -EINVAL;
 
 	nb = find_northbridge();
 	if (nb < 0)
@@ -90,7 +90,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
-		return -1;
+		return -ENOENT;
 
 	pr_info("Number of physical nodes %d\n", numnodes);
 
@@ -121,7 +121,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 		if ((base >> 8) & 3 || (limit >> 8) & 3) {
 			pr_err("Node %d using interleaving mode %lx/%lx\n",
 			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
-			return -1;
+			return -EINVAL;
 		}
 		if (node_isset(nodeid, nodes_parsed)) {
 			pr_info("Node %d already present, skipping\n",
@@ -160,7 +160,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 		if (prevbase > base) {
 			pr_err("Node map not sorted %lx,%lx\n",
 			       prevbase, base);
-			return -1;
+			return -EINVAL;
 		}
 
 		pr_info("Node %d MemBase %016lx Limit %016lx\n",
@@ -177,7 +177,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 	}
 
 	if (!found)
-		return -1;
+		return -ENOENT;
 	return 0;
 }
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d7e4aaf..a083f51 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -596,7 +596,7 @@ void __init initmem_init(int acpi, int amd)
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
-	if (!numa_off && acpi && !acpi_scan_nodes(0, max_pfn << PAGE_SHIFT))
+	if (!numa_off && acpi && !acpi_scan_nodes())
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 988b0b70..4f9dbf0 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -359,7 +359,7 @@ void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
 #endif /* CONFIG_NUMA_EMU */
 
 /* Use the information discovered above to actually set up the nodes. */
-int __init acpi_scan_nodes(unsigned long start, unsigned long end)
+int __init acpi_scan_nodes(void)
 {
 	int i;
 
@@ -368,7 +368,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 
 	/* First clean up the node list */
 	for (i = 0; i < MAX_NUMNODES; i++)
-		cutoff_node(i, start, end);
+		cutoff_node(i, 0, max_pfn << PAGE_SHIFT);
 
 	/*
 	 * Join together blocks on the same node, holes between
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 5eb25eb..3b5c318 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -274,7 +274,7 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 
 int __init acpi_numa_init(void)
 {
-	int ret = 0;
+	int cnt = 0;
 
 	/*
 	 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
@@ -288,7 +288,7 @@ int __init acpi_numa_init(void)
 				     acpi_parse_x2apic_affinity, 0);
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
 				     acpi_parse_processor_affinity, 0);
-		ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
+		cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
 					    acpi_parse_memory_affinity,
 					    NR_NODE_MEMBLKS);
 	}
@@ -297,7 +297,10 @@ int __init acpi_numa_init(void)
 	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
 
 	acpi_numa_arch_fixup();
-	return ret;
+
+	if (cnt <= 0)
+		return cnt ?: -ENOENT;
+	return 0;
 }
 
 int acpi_get_pxm(acpi_handle h)
-- 
cgit v0.10.2


From a9aec56afac238e4ed3980bd10b22121b83866dd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:06 +0100
Subject: x86-64, NUMA: Wrap acpi_numa_init() so that failure can be indicated
 by return value

Because of the way ACPI tables are parsed, the generic
acpi_numa_init() couldn't return failure when error was detected by
arch hooks.  Instead, the failure state was recorded and later arch
dependent init hook - acpi_scan_nodes() - would fail.

Wrap acpi_numa_init() with x86_acpi_numa_init() so that failure can be
indicated as return value immediately.  This is in preparation for
further NUMA init cleanups.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4e5dff9..06fb786 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -187,6 +187,7 @@ struct bootnode;
 extern int acpi_numa;
 extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
 				unsigned long end);
+extern int x86_acpi_numa_init(void);
 extern int acpi_scan_nodes(void);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 96810a3..c9a139c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -995,7 +995,7 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi = !acpi_numa_init();
+	acpi = !x86_acpi_numa_init();
 #endif
 
 #ifdef CONFIG_AMD_NUMA
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 4f9dbf0..56b9263 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -358,6 +358,16 @@ void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
 }
 #endif /* CONFIG_NUMA_EMU */
 
+int __init x86_acpi_numa_init(void)
+{
+	int ret;
+
+	ret = acpi_numa_init();
+	if (ret < 0)
+		return ret;
+	return srat_disabled() ? -EINVAL : 0;
+}
+
 /* Use the information discovered above to actually set up the nodes. */
 int __init acpi_scan_nodes(void)
 {
-- 
cgit v0.10.2


From d8fc3afc49bb226c20e37f48a4ddd493cd092837 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:06 +0100
Subject: x86, NUMA: Move *_numa_init() invocations into initmem_init()

There's no reason for these to live in setup_arch().  Move them inside
initmem_init().

- v2: x86-32 initmem_init() weren't updated breaking 32bit builds.
  Fixed.  Found by Ankita.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index eb9ed00..97e6007 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -56,7 +56,7 @@ extern unsigned long init_memory_mapping(unsigned long start,
 
 void init_memory_mapping_high(void);
 
-extern void initmem_init(int acpi, int k8);
+extern void initmem_init(void);
 extern void free_initmem(void);
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c9a139c..46e684f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -719,8 +719,6 @@ early_param("reservelow", parse_reservelow);
 
 void __init setup_arch(char **cmdline_p)
 {
-	int acpi = 0;
-	int amd = 0;
 	unsigned long flags;
 
 #ifdef CONFIG_X86_32
@@ -991,19 +989,7 @@ void __init setup_arch(char **cmdline_p)
 
 	early_acpi_boot_init();
 
-#ifdef CONFIG_ACPI_NUMA
-	/*
-	 * Parse SRAT to discover nodes.
-	 */
-	acpi = !x86_acpi_numa_init();
-#endif
-
-#ifdef CONFIG_AMD_NUMA
-	if (!acpi)
-		amd = !amd_numa_init();
-#endif
-
-	initmem_init(acpi, amd);
+	initmem_init();
 	memblock_find_dma_reserve();
 	dma32_reserve_bootmem();
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 16adb66..5d43fa5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -644,7 +644,7 @@ void __init find_low_pfn_range(void)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(int acpi, int k8)
+void __init initmem_init(void)
 {
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 04cc027..4f1f461 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -603,7 +603,7 @@ kernel_physical_mapping_init(unsigned long start,
 }
 
 #ifndef CONFIG_NUMA
-void __init initmem_init(int acpi, int k8)
+void __init initmem_init(void)
 {
 	memblock_x86_register_active_regions(0, 0, max_pfn);
 	init_memory_mapping_high();
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 3249b37..bde3906 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -352,7 +352,7 @@ static void init_remap_allocator(int nid)
 		(ulong) node_remap_end_vaddr[nid]);
 }
 
-void __init initmem_init(int acpi, int k8)
+void __init initmem_init(void)
 {
 	int nid;
 	long kva_target_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a083f51..656b0cf 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/nodemask.h>
 #include <linux/sched.h>
+#include <linux/acpi.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -579,10 +580,23 @@ static int __init numa_emulation(unsigned long start_pfn,
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init initmem_init(int acpi, int amd)
+void __init initmem_init(void)
 {
+	int acpi = 0, amd = 0;
 	int i;
 
+#ifdef CONFIG_ACPI_NUMA
+	/*
+	 * Parse SRAT to discover nodes.
+	 */
+	acpi = !x86_acpi_numa_init();
+#endif
+
+#ifdef CONFIG_AMD_NUMA
+	if (!acpi)
+		amd = !amd_numa_init();
+#endif
+
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 
-- 
cgit v0.10.2


From ffe77a4605fb2588f8666850ad3e3b196241658f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:06 +0100
Subject: x86-64, NUMA: Restructure initmem_init()

Reorganize initmem_init() such that,

* Different NUMA init methods are iterated in a consistent way.

* Each iteration re-initializes all the parameters and different
  method can be tried after a failure.

* Dummy init is handled the same as other methods.

Apart from how retry after failure, this patch doesn't change the
behavior.  The call sequences are kept equivalent across the
conversion.

After the change, bad_srat() doesn't need to clear apic to node
mapping or worry about numa_off.  Simplified accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 656b0cf..c984e34 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -580,65 +580,73 @@ static int __init numa_emulation(unsigned long start_pfn,
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init initmem_init(void)
+static int dummy_numa_init(void)
 {
-	int acpi = 0, amd = 0;
-	int i;
-
-#ifdef CONFIG_ACPI_NUMA
-	/*
-	 * Parse SRAT to discover nodes.
-	 */
-	acpi = !x86_acpi_numa_init();
-#endif
-
-#ifdef CONFIG_AMD_NUMA
-	if (!acpi)
-		amd = !amd_numa_init();
-#endif
-
-	nodes_clear(node_possible_map);
-	nodes_clear(node_online_map);
-
-#ifdef CONFIG_NUMA_EMU
-	setup_physnodes(0, max_pfn << PAGE_SHIFT, acpi, amd);
-	if (cmdline && !numa_emulation(0, max_pfn, acpi, amd))
-		return;
-	setup_physnodes(0, max_pfn << PAGE_SHIFT, acpi, amd);
-	nodes_clear(node_possible_map);
-	nodes_clear(node_online_map);
-#endif
-
-#ifdef CONFIG_ACPI_NUMA
-	if (!numa_off && acpi && !acpi_scan_nodes())
-		return;
-	nodes_clear(node_possible_map);
-	nodes_clear(node_online_map);
-#endif
+	return 0;
+}
 
-#ifdef CONFIG_AMD_NUMA
-	if (!numa_off && amd && !amd_scan_nodes())
-		return;
-	nodes_clear(node_possible_map);
-	nodes_clear(node_online_map);
-#endif
+static int dummy_scan_nodes(void)
+{
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
-
 	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
 	       0LU, max_pfn << PAGE_SHIFT);
+
 	/* setup dummy node covering all memory */
 	memnode_shift = 63;
 	memnodemap = memnode.embedded_map;
 	memnodemap[0] = 0;
 	node_set_online(0);
 	node_set(0, node_possible_map);
-	for (i = 0; i < MAX_LOCAL_APIC; i++)
-		set_apicid_to_node(i, NUMA_NO_NODE);
 	memblock_x86_register_active_regions(0, 0, max_pfn);
 	init_memory_mapping_high();
 	setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT);
 	numa_init_array();
+
+	return 0;
+}
+
+void __init initmem_init(void)
+{
+	int (*numa_init[])(void) = { [2] = dummy_numa_init };
+	int (*scan_nodes[])(void) = { [2] = dummy_scan_nodes };
+	int i, j;
+
+	if (!numa_off) {
+#ifdef CONFIG_ACPI_NUMA
+		numa_init[0] = x86_acpi_numa_init;
+		scan_nodes[0] = acpi_scan_nodes;
+#endif
+#ifdef CONFIG_AMD_NUMA
+		numa_init[1] = amd_numa_init;
+		scan_nodes[1] = amd_scan_nodes;
+#endif
+	}
+
+	for (i = 0; i < ARRAY_SIZE(numa_init); i++) {
+		if (!numa_init[i])
+			continue;
+
+		for (j = 0; j < MAX_LOCAL_APIC; j++)
+			set_apicid_to_node(j, NUMA_NO_NODE);
+
+		nodes_clear(node_possible_map);
+		nodes_clear(node_online_map);
+
+		if (numa_init[i]() < 0)
+			continue;
+#ifdef CONFIG_NUMA_EMU
+		setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1);
+		if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1))
+			return;
+		setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1);
+		nodes_clear(node_possible_map);
+		nodes_clear(node_online_map);
+#endif
+		if (!scan_nodes[i]())
+			return;
+	}
+	BUG();
 }
 
 unsigned long __init numa_free_all_bootmem(void)
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 56b9263..597e011 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -78,8 +78,6 @@ static __init void bad_srat(void)
 	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
-	for (i = 0; i < MAX_LOCAL_APIC; i++)
-		set_apicid_to_node(i, NUMA_NO_NODE);
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		nodes[i].start = nodes[i].end = 0;
 		nodes_add[i].start = nodes_add[i].end = 0;
@@ -89,7 +87,7 @@ static __init void bad_srat(void)
 
 static __init inline int srat_disabled(void)
 {
-	return numa_off || acpi_numa < 0;
+	return acpi_numa < 0;
 }
 
 /* Callback for SLIT parsing */
-- 
cgit v0.10.2


From ec8cf29b1d39aeb6ef98bc589f0c9a33a8f94c49 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:07 +0100
Subject: x86-64, NUMA: Use common {cpu|mem}_nodes_parsed

ACPI and amd are using separate nodes_parsed masks.  Add
{cpu|mem}_nodes_parsed and use them in all NUMA init methods.
Initialization of the masks and building node_possible_map are now
handled commonly by initmem_init().

dummy_numa_init() is updated to set node 0 on both masks.  While at
it, move the info messages from scan to init.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 2819afa..de45936 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -27,6 +27,9 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
  */
 #define NODE_MIN_SIZE (4*1024*1024)
 
+extern nodemask_t cpu_nodes_parsed __initdata;
+extern nodemask_t mem_nodes_parsed __initdata;
+
 extern int __cpuinit numa_cpu_node(int cpu);
 
 #ifdef CONFIG_NUMA_EMU
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 655ccff..4f822a2 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -28,7 +28,6 @@
 
 static struct bootnode __initdata nodes[8];
 static unsigned char __initdata nodeids[8];
-static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
 
 static __init int find_northbridge(void)
 {
@@ -123,7 +122,7 @@ int __init amd_numa_init(void)
 			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
 			return -EINVAL;
 		}
-		if (node_isset(nodeid, nodes_parsed)) {
+		if (node_isset(nodeid, mem_nodes_parsed)) {
 			pr_info("Node %d already present, skipping\n",
 				nodeid);
 			continue;
@@ -173,7 +172,8 @@ int __init amd_numa_init(void)
 
 		prevbase = base;
 
-		node_set(nodeid, nodes_parsed);
+		node_set(nodeid, mem_nodes_parsed);
+		node_set(nodeid, cpu_nodes_parsed);
 	}
 
 	if (!found)
@@ -190,7 +190,7 @@ void __init amd_get_nodes(struct bootnode *physnodes)
 {
 	int i;
 
-	for_each_node_mask(i, nodes_parsed) {
+	for_each_node_mask(i, mem_nodes_parsed) {
 		physnodes[i].start = nodes[i].start;
 		physnodes[i].end = nodes[i].end;
 	}
@@ -258,8 +258,6 @@ int __init amd_scan_nodes(void)
 	unsigned int apicid_base;
 	int i;
 
-	BUG_ON(nodes_empty(nodes_parsed));
-	node_possible_map = nodes_parsed;
 	memnode_shift = compute_hash_shift(nodes, 8, NULL);
 	if (memnode_shift < 0) {
 		pr_err("No NUMA node hash function found. Contact maintainer\n");
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c984e34..4404e1d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -25,6 +25,9 @@
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
+nodemask_t cpu_nodes_parsed __initdata;
+nodemask_t mem_nodes_parsed __initdata;
+
 struct memnode memnode;
 
 static unsigned long __initdata nodemap_addr;
@@ -582,22 +585,23 @@ static int __init numa_emulation(unsigned long start_pfn,
 
 static int dummy_numa_init(void)
 {
-	return 0;
-}
-
-static int dummy_scan_nodes(void)
-{
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
 	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
 	       0LU, max_pfn << PAGE_SHIFT);
 
+	node_set(0, cpu_nodes_parsed);
+	node_set(0, mem_nodes_parsed);
+
+	return 0;
+}
+
+static int dummy_scan_nodes(void)
+{
 	/* setup dummy node covering all memory */
 	memnode_shift = 63;
 	memnodemap = memnode.embedded_map;
 	memnodemap[0] = 0;
-	node_set_online(0);
-	node_set(0, node_possible_map);
 	memblock_x86_register_active_regions(0, 0, max_pfn);
 	init_memory_mapping_high();
 	setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT);
@@ -630,6 +634,8 @@ void __init initmem_init(void)
 		for (j = 0; j < MAX_LOCAL_APIC; j++)
 			set_apicid_to_node(j, NUMA_NO_NODE);
 
+		nodes_clear(cpu_nodes_parsed);
+		nodes_clear(mem_nodes_parsed);
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
 
@@ -643,6 +649,11 @@ void __init initmem_init(void)
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
 #endif
+		/* Account for nodes with cpus and no memory */
+		nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed);
+		if (WARN_ON(nodes_empty(node_possible_map)))
+			continue;
+
 		if (!scan_nodes[i]())
 			return;
 	}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 597e011..33e72ec 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -28,8 +28,6 @@ int acpi_numa __initdata;
 
 static struct acpi_table_slit *acpi_slit;
 
-static nodemask_t nodes_parsed __initdata;
-static nodemask_t cpu_nodes_parsed __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
 
@@ -293,7 +291,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
 	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
 		nd = &nodes[node];
-		if (!node_test_and_set(node, nodes_parsed)) {
+		if (!node_test_and_set(node, mem_nodes_parsed)) {
 			nd->start = start;
 			nd->end = end;
 		} else {
@@ -319,7 +317,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 	unsigned long pxmram, e820ram;
 
 	pxmram = 0;
-	for_each_node_mask(i, nodes_parsed) {
+	for_each_node_mask(i, mem_nodes_parsed) {
 		unsigned long s = nodes[i].start >> PAGE_SHIFT;
 		unsigned long e = nodes[i].end >> PAGE_SHIFT;
 		pxmram += e - s;
@@ -348,7 +346,7 @@ void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
 {
 	int i;
 
-	for_each_node_mask(i, nodes_parsed) {
+	for_each_node_mask(i, mem_nodes_parsed) {
 		cutoff_node(i, start, end);
 		physnodes[i].start = nodes[i].start;
 		physnodes[i].end = nodes[i].end;
@@ -449,9 +447,6 @@ int __init acpi_scan_nodes(void)
 
 	init_memory_mapping_high();
 
-	/* Account for nodes with cpus and no memory */
-	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
-
 	/* Finally register nodes */
 	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
@@ -485,7 +480,7 @@ static int __init find_node_by_addr(unsigned long addr)
 	int ret = NUMA_NO_NODE;
 	int i;
 
-	for_each_node_mask(i, nodes_parsed) {
+	for_each_node_mask(i, mem_nodes_parsed) {
 		/*
 		 * Find the real node that this emulated node appears on.  For
 		 * the sake of simplicity, we only use a real node's starting
@@ -545,10 +540,10 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
 	memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node));
 
-	nodes_clear(nodes_parsed);
+	nodes_clear(mem_nodes_parsed);
 	for (i = 0; i < num_nodes; i++)
 		if (fake_nodes[i].start != fake_nodes[i].end)
-			node_set(i, nodes_parsed);
+			node_set(i, mem_nodes_parsed);
 }
 
 static int null_slit_node_compare(int a, int b)
-- 
cgit v0.10.2


From 99df738cd28cc39054cd1a77685d4a94ed2193a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:07 +0100
Subject: x86-64, NUMA: Remove local variable found from amd_numa_init()

Use weight count on mem_nodes_parsed instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 4f822a2..7d85cf7 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -74,7 +74,7 @@ int __init amd_numa_init(void)
 	unsigned long end = PFN_PHYS(max_pfn);
 	unsigned numnodes;
 	unsigned long prevbase;
-	int i, nb, found = 0;
+	int i, nb;
 	u32 nodeid, reg;
 
 	if (!early_pci_allowed())
@@ -165,8 +165,6 @@ int __init amd_numa_init(void)
 		pr_info("Node %d MemBase %016lx Limit %016lx\n",
 			nodeid, base, limit);
 
-		found++;
-
 		nodes[nodeid].start = base;
 		nodes[nodeid].end = limit;
 
@@ -176,7 +174,7 @@ int __init amd_numa_init(void)
 		node_set(nodeid, cpu_nodes_parsed);
 	}
 
-	if (!found)
+	if (!nodes_weight(mem_nodes_parsed))
 		return -ENOENT;
 	return 0;
 }
-- 
cgit v0.10.2


From 45fe6c78c4ccc384044d1b4877eebe7acf359e76 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:07 +0100
Subject: x86-64, NUMA: Move apicid to numa mapping initialization from
 amd_scan_nodes() to amd_numa_init()

This brings amd initialization behavior closer to that of acpi.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 7d85cf7..b6029a6 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -74,8 +74,9 @@ int __init amd_numa_init(void)
 	unsigned long end = PFN_PHYS(max_pfn);
 	unsigned numnodes;
 	unsigned long prevbase;
-	int i, nb;
+	int i, j, nb;
 	u32 nodeid, reg;
+	unsigned int bits, cores, apicid_base;
 
 	if (!early_pci_allowed())
 		return -EINVAL;
@@ -176,6 +177,26 @@ int __init amd_numa_init(void)
 
 	if (!nodes_weight(mem_nodes_parsed))
 		return -ENOENT;
+
+	/*
+	 * We seem to have valid NUMA configuration.  Map apicids to nodes
+	 * using the coreid bits from early_identify_cpu.
+	 */
+	bits = boot_cpu_data.x86_coreid_bits;
+	cores = 1 << bits;
+	apicid_base = 0;
+
+	/* get the APIC ID of the BSP early for systems with apicid lifting */
+	early_get_boot_cpu_id();
+	if (boot_cpu_physical_apicid > 0) {
+		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
+		apicid_base = boot_cpu_physical_apicid;
+	}
+
+	for_each_node_mask(i, cpu_nodes_parsed)
+		for (j = apicid_base; j < cores + apicid_base; j++)
+			set_apicid_to_node((i << bits) + j, i);
+
 	return 0;
 }
 
@@ -251,9 +272,6 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
 
 int __init amd_scan_nodes(void)
 {
-	unsigned int bits;
-	unsigned int cores;
-	unsigned int apicid_base;
 	int i;
 
 	memnode_shift = compute_hash_shift(nodes, 8, NULL);
@@ -264,28 +282,13 @@ int __init amd_scan_nodes(void)
 	pr_info("Using node hash shift of %d\n", memnode_shift);
 
 	/* use the coreid bits from early_identify_cpu */
-	bits = boot_cpu_data.x86_coreid_bits;
-	cores = (1<<bits);
-	apicid_base = 0;
-	/* get the APIC ID of the BSP early for systems with apicid lifting */
-	early_get_boot_cpu_id();
-	if (boot_cpu_physical_apicid > 0) {
-		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
-		apicid_base = boot_cpu_physical_apicid;
-	}
-
 	for_each_node_mask(i, node_possible_map)
 		memblock_x86_register_active_regions(i,
 				nodes[i].start >> PAGE_SHIFT,
 				nodes[i].end >> PAGE_SHIFT);
 	init_memory_mapping_high();
-	for_each_node_mask(i, node_possible_map) {
-		int j;
-
-		for (j = apicid_base; j < cores + apicid_base; j++)
-			set_apicid_to_node((i << bits) + j, i);
+	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	}
 
 	numa_init_array();
 	return 0;
-- 
cgit v0.10.2


From 206e42087a037fa3adca8908fd318a0cb64d4dee Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:07 +0100
Subject: x86-64, NUMA: Use common numa_nodes[]

ACPI and amd are using separate nodes[] array.  Add numa_nodes[] and
use them in all NUMA init methods.  cutoff_node() cleanup is moved
from srat_64.c to numa_64.c and applied in initmem_init() regardless
of init methods.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index de45936..d3a4514 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -29,6 +29,7 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 
 extern nodemask_t cpu_nodes_parsed __initdata;
 extern nodemask_t mem_nodes_parsed __initdata;
+extern struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
 
 extern int __cpuinit numa_cpu_node(int cpu);
 
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index b6029a6..f049fa6 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -26,7 +26,6 @@
 #include <asm/apic.h>
 #include <asm/amd_nb.h>
 
-static struct bootnode __initdata nodes[8];
 static unsigned char __initdata nodeids[8];
 
 static __init int find_northbridge(void)
@@ -166,8 +165,8 @@ int __init amd_numa_init(void)
 		pr_info("Node %d MemBase %016lx Limit %016lx\n",
 			nodeid, base, limit);
 
-		nodes[nodeid].start = base;
-		nodes[nodeid].end = limit;
+		numa_nodes[nodeid].start = base;
+		numa_nodes[nodeid].end = limit;
 
 		prevbase = base;
 
@@ -210,8 +209,8 @@ void __init amd_get_nodes(struct bootnode *physnodes)
 	int i;
 
 	for_each_node_mask(i, mem_nodes_parsed) {
-		physnodes[i].start = nodes[i].start;
-		physnodes[i].end = nodes[i].end;
+		physnodes[i].start = numa_nodes[i].start;
+		physnodes[i].end = numa_nodes[i].end;
 	}
 }
 
@@ -221,7 +220,7 @@ static int __init find_node_by_addr(unsigned long addr)
 	int i;
 
 	for (i = 0; i < 8; i++)
-		if (addr >= nodes[i].start && addr < nodes[i].end) {
+		if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) {
 			ret = i;
 			break;
 		}
@@ -274,7 +273,7 @@ int __init amd_scan_nodes(void)
 {
 	int i;
 
-	memnode_shift = compute_hash_shift(nodes, 8, NULL);
+	memnode_shift = compute_hash_shift(numa_nodes, 8, NULL);
 	if (memnode_shift < 0) {
 		pr_err("No NUMA node hash function found. Contact maintainer\n");
 		return -1;
@@ -284,11 +283,11 @@ int __init amd_scan_nodes(void)
 	/* use the coreid bits from early_identify_cpu */
 	for_each_node_mask(i, node_possible_map)
 		memblock_x86_register_active_regions(i,
-				nodes[i].start >> PAGE_SHIFT,
-				nodes[i].end >> PAGE_SHIFT);
+				numa_nodes[i].start >> PAGE_SHIFT,
+				numa_nodes[i].end >> PAGE_SHIFT);
 	init_memory_mapping_high();
 	for_each_node_mask(i, node_possible_map)
-		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+		setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end);
 
 	numa_init_array();
 	return 0;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 4404e1d..a6b899f 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -33,6 +33,8 @@ struct memnode memnode;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
+struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
+
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -182,6 +184,22 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 	return NULL;
 }
 
+static __init void cutoff_node(int i, unsigned long start, unsigned long end)
+{
+	struct bootnode *nd = &numa_nodes[i];
+
+	if (nd->start < start) {
+		nd->start = start;
+		if (nd->end < nd->start)
+			nd->start = nd->end;
+	}
+	if (nd->end > end) {
+		nd->end = end;
+		if (nd->start > nd->end)
+			nd->start = nd->end;
+	}
+}
+
 /* Initialize bootmem allocator for a node */
 void __init
 setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -638,9 +656,15 @@ void __init initmem_init(void)
 		nodes_clear(mem_nodes_parsed);
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
+		memset(numa_nodes, 0, sizeof(numa_nodes));
 
 		if (numa_init[i]() < 0)
 			continue;
+
+		/* clean up the node list */
+		for (j = 0; j < MAX_NUMNODES; j++)
+			cutoff_node(j, 0, max_pfn << PAGE_SHIFT);
+
 #ifdef CONFIG_NUMA_EMU
 		setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1);
 		if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1))
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 33e72ec..bfa4a6af 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -28,7 +28,6 @@ int acpi_numa __initdata;
 
 static struct acpi_table_slit *acpi_slit;
 
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
 
 static int num_node_memblks __initdata;
@@ -55,29 +54,13 @@ static __init int conflicting_memblks(unsigned long start, unsigned long end)
 	return -1;
 }
 
-static __init void cutoff_node(int i, unsigned long start, unsigned long end)
-{
-	struct bootnode *nd = &nodes[i];
-
-	if (nd->start < start) {
-		nd->start = start;
-		if (nd->end < nd->start)
-			nd->start = nd->end;
-	}
-	if (nd->end > end) {
-		nd->end = end;
-		if (nd->start > nd->end)
-			nd->start = nd->end;
-	}
-}
-
 static __init void bad_srat(void)
 {
 	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		nodes[i].start = nodes[i].end = 0;
+		numa_nodes[i].start = numa_nodes[i].end = 0;
 		nodes_add[i].start = nodes_add[i].end = 0;
 	}
 	remove_all_active_ranges();
@@ -276,12 +259,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	if (i == node) {
 		printk(KERN_WARNING
 		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
-			pxm, start, end, nodes[i].start, nodes[i].end);
+		       pxm, start, end, numa_nodes[i].start, numa_nodes[i].end);
 	} else if (i >= 0) {
 		printk(KERN_ERR
 		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
 		       pxm, start, end, node_to_pxm(i),
-			nodes[i].start, nodes[i].end);
+		       numa_nodes[i].start, numa_nodes[i].end);
 		bad_srat();
 		return;
 	}
@@ -290,7 +273,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	       start, end);
 
 	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
-		nd = &nodes[node];
+		nd = &numa_nodes[node];
 		if (!node_test_and_set(node, mem_nodes_parsed)) {
 			nd->start = start;
 			nd->end = end;
@@ -347,9 +330,8 @@ void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
 	int i;
 
 	for_each_node_mask(i, mem_nodes_parsed) {
-		cutoff_node(i, start, end);
-		physnodes[i].start = nodes[i].start;
-		physnodes[i].end = nodes[i].end;
+		physnodes[i].start = numa_nodes[i].start;
+		physnodes[i].end = numa_nodes[i].end;
 	}
 }
 #endif /* CONFIG_NUMA_EMU */
@@ -372,10 +354,6 @@ int __init acpi_scan_nodes(void)
 	if (acpi_numa <= 0)
 		return -1;
 
-	/* First clean up the node list */
-	for (i = 0; i < MAX_NUMNODES; i++)
-		cutoff_node(i, 0, max_pfn << PAGE_SHIFT);
-
 	/*
 	 * Join together blocks on the same node, holes between
 	 * which don't overlap with memory on other nodes.
@@ -440,7 +418,7 @@ int __init acpi_scan_nodes(void)
 
 	/* for out of order entries in SRAT */
 	sort_node_map();
-	if (!nodes_cover_memory(nodes)) {
+	if (!nodes_cover_memory(numa_nodes)) {
 		bad_srat();
 		return -1;
 	}
@@ -449,12 +427,13 @@ int __init acpi_scan_nodes(void)
 
 	/* Finally register nodes */
 	for_each_node_mask(i, node_possible_map)
-		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+		setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end);
 	/* Try again in case setup_node_bootmem missed one due
 	   to missing bootmem */
 	for_each_node_mask(i, node_possible_map)
 		if (!node_online(i))
-			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+			setup_node_bootmem(i, numa_nodes[i].start,
+					   numa_nodes[i].end);
 
 	for (i = 0; i < nr_cpu_ids; i++) {
 		int node = early_cpu_to_node(i);
@@ -486,7 +465,7 @@ static int __init find_node_by_addr(unsigned long addr)
 		 * the sake of simplicity, we only use a real node's starting
 		 * address to determine which emulated node it appears on.
 		 */
-		if (addr >= nodes[i].start && addr < nodes[i].end) {
+		if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) {
 			ret = i;
 			break;
 		}
-- 
cgit v0.10.2


From 19095548704ecd0f32fd5deba01d56430ad7a344 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 12:13:07 +0100
Subject: x86-64, NUMA: Kill {acpi|amd}_get_nodes()

With common numa_nodes[], common code in numa_64.c can access it
directly.  Copy directly and kill {acpi|amd}_get_nodes().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 06fb786..446a5b9 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -185,8 +185,6 @@ struct bootnode;
 
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
-extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
-				unsigned long end);
 extern int x86_acpi_numa_init(void);
 extern int acpi_scan_nodes(void);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index dc3c6e3..246cdc6 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -23,7 +23,6 @@ extern int amd_set_subcaches(int, int);
 
 #ifdef CONFIG_NUMA_EMU
 extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
-extern void amd_get_nodes(struct bootnode *nodes);
 #endif
 
 struct amd_northbridge {
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f049fa6..cf29527 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -204,16 +204,6 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
-void __init amd_get_nodes(struct bootnode *physnodes)
-{
-	int i;
-
-	for_each_node_mask(i, mem_nodes_parsed) {
-		physnodes[i].start = numa_nodes[i].start;
-		physnodes[i].end = numa_nodes[i].end;
-	}
-}
-
 static int __init find_node_by_addr(unsigned long addr)
 {
 	int ret = NUMA_NO_NODE;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a6b899f..82ee308 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -257,21 +257,18 @@ void __init numa_emu_cmdline(char *str)
 	cmdline = str;
 }
 
-static int __init setup_physnodes(unsigned long start, unsigned long end,
-					int acpi, int amd)
+static int __init setup_physnodes(unsigned long start, unsigned long end)
 {
 	int ret = 0;
 	int i;
 
 	memset(physnodes, 0, sizeof(physnodes));
-#ifdef CONFIG_ACPI_NUMA
-	if (acpi)
-		acpi_get_nodes(physnodes, start, end);
-#endif
-#ifdef CONFIG_AMD_NUMA
-	if (amd)
-		amd_get_nodes(physnodes);
-#endif
+
+	for_each_node_mask(i, mem_nodes_parsed) {
+		physnodes[i].start = numa_nodes[i].start;
+		physnodes[i].end = numa_nodes[i].end;
+	}
+
 	/*
 	 * Basic sanity checking on the physical node map: there may be errors
 	 * if the SRAT or AMD code incorrectly reported the topology or the mem=
@@ -594,7 +591,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 	init_memory_mapping_high();
 	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	setup_physnodes(addr, max_addr, acpi, amd);
+	setup_physnodes(addr, max_addr);
 	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
 	return 0;
@@ -666,10 +663,10 @@ void __init initmem_init(void)
 			cutoff_node(j, 0, max_pfn << PAGE_SHIFT);
 
 #ifdef CONFIG_NUMA_EMU
-		setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1);
+		setup_physnodes(0, max_pfn << PAGE_SHIFT);
 		if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1))
 			return;
-		setup_physnodes(0, max_pfn << PAGE_SHIFT, i == 0, i == 1);
+		setup_physnodes(0, max_pfn << PAGE_SHIFT);
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
 #endif
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index bfa4a6af..82b1087 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -323,19 +323,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 
 void __init acpi_numa_arch_fixup(void) {}
 
-#ifdef CONFIG_NUMA_EMU
-void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
-				unsigned long end)
-{
-	int i;
-
-	for_each_node_mask(i, mem_nodes_parsed) {
-		physnodes[i].start = numa_nodes[i].start;
-		physnodes[i].end = numa_nodes[i].end;
-	}
-}
-#endif /* CONFIG_NUMA_EMU */
-
 int __init x86_acpi_numa_init(void)
 {
 	int ret;
-- 
cgit v0.10.2


From d41d5a01631af821d3a3447e6613a316f5ee6c25 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 7 Feb 2011 17:02:20 +0100
Subject: cgroup: Fix cgroup_subsys::exit callback

Make the ::exit method act like ::attach, it is after all very nearly
the same thing.

The bug had no effect on correctness - fixing it is an optimization for
the scheduler. Also, later perf-cgroups patches rely on it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Menage <menage@google.com>
LKML-Reference: <1297160655.13327.92.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ce104e3..38117d9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -474,7 +474,8 @@ struct cgroup_subsys {
 			struct cgroup *old_cgrp, struct task_struct *tsk,
 			bool threadgroup);
 	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
-	void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
+	void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			struct cgroup *old_cgrp, struct task_struct *task);
 	int (*populate)(struct cgroup_subsys *ss,
 			struct cgroup *cgrp);
 	void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d702..f6495f3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child)
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
-	int i;
 	struct css_set *cg;
-
-	if (run_callbacks && need_forkexit_callback) {
-		/*
-		 * modular subsystems can't use callbacks, so no need to lock
-		 * the subsys array
-		 */
-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss->exit)
-				ss->exit(ss, tsk);
-		}
-	}
+	int i;
 
 	/*
 	 * Unlink from the css_set task list if necessary.
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	task_lock(tsk);
 	cg = tsk->cgroups;
 	tsk->cgroups = &init_css_set;
+
+	if (run_callbacks && need_forkexit_callback) {
+		/*
+		 * modular subsystems can't use callbacks, so no need to lock
+		 * the subsys array
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			if (ss->exit) {
+				struct cgroup *old_cgrp =
+					rcu_dereference_raw(cg->subsys[i])->cgroup;
+				struct cgroup *cgrp = task_cgroup(tsk, i);
+				ss->exit(ss, cgrp, old_cgrp, tsk);
+			}
+		}
+	}
 	task_unlock(tsk);
+
 	if (cg)
 		put_css_set_taskexit(cg);
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index e142e92..79e611c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
-	if (p->flags & PF_EXITING)
-		return &root_task_group;
-
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
@@ -8863,7 +8860,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
 {
 	/*
 	 * cgroup_exit() is called in the copy_process() failure path.
-- 
cgit v0.10.2


From e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 14 Feb 2011 11:20:01 +0200
Subject: perf: Add cgroup support

This kernel patch adds the ability to filter monitoring based on
container groups (cgroups). This is for use in per-cpu mode only.

The cgroup to monitor is passed as a file descriptor in the pid
argument to the syscall. The file descriptor must be opened to
the cgroup name in the cgroup filesystem. For instance, if the
cgroup name is foo and cgroupfs is mounted in /cgroup, then the
file descriptor is opened to /cgroup/foo. Cgroup mode is
activated by passing PERF_FLAG_PID_CGROUP in the flags argument
to the syscall.

For instance to measure in cgroup foo on CPU1 assuming
cgroupfs is mounted under /cgroup:

struct perf_event_attr attr;
int cgroup_fd, fd;

cgroup_fd = open("/cgroup/foo", O_RDONLY);
fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP);
close(cgroup_fd);

Signed-off-by: Stephane Eranian <eranian@google.com>
[ added perf_cgroup_{exit,attach} ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d590250.114ddf0a.689e.4482@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 38117d9..e654fa2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
 unsigned short css_depth(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
 
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff0..cdbfcb8 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,4 +65,8 @@ SUBSYS(net_cls)
 SUBSYS(blkio)
 #endif
 
+#ifdef CONFIG_CGROUP_PERF
+SUBSYS(perf)
+#endif
+
 /* */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dda5b0a..38c8b25 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -464,6 +464,7 @@ enum perf_callchain_context {
 
 #define PERF_FLAG_FD_NO_GROUP	(1U << 0)
 #define PERF_FLAG_FD_OUTPUT	(1U << 1)
+#define PERF_FLAG_PID_CGROUP	(1U << 2) /* pid=cgroup id, per-cpu mode only */
 
 #ifdef __KERNEL__
 /*
@@ -471,6 +472,7 @@ enum perf_callchain_context {
  */
 
 #ifdef CONFIG_PERF_EVENTS
+# include <linux/cgroup.h>
 # include <asm/perf_event.h>
 # include <asm/local64.h>
 #endif
@@ -716,6 +718,22 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
 
+#ifdef CONFIG_CGROUP_PERF
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+	u64 time;
+	u64 timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state css;
+	struct perf_cgroup_info *info;	/* timing info, one per cpu */
+};
+#endif
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -832,6 +850,11 @@ struct perf_event {
 	struct event_filter		*filter;
 #endif
 
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup		*cgrp; /* cgroup event is attach to */
+	int				cgrp_defer_enabled;
+#endif
+
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -886,6 +909,7 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	struct rcu_head			rcu_head;
+	int				nr_cgroups; /* cgroup events present */
 };
 
 /*
@@ -905,6 +929,9 @@ struct perf_cpu_context {
 	struct list_head		rotation_list;
 	int				jiffies_interval;
 	struct pmu			*active_pmu;
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup		*cgrp;
+#endif
 };
 
 struct perf_output_handle {
@@ -1040,11 +1067,11 @@ have_event:
 	__perf_sw_event(event_id, nr, nmi, regs, addr);
 }
 
-extern atomic_t perf_task_events;
+extern atomic_t perf_sched_events;
 
 static inline void perf_event_task_sched_in(struct task_struct *task)
 {
-	COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
+	COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));
 }
 
 static inline
@@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex
 {
 	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
 
-	COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
+	COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next));
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
diff --git a/init/Kconfig b/init/Kconfig
index be788c0..20d6bd9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
 	  select this option (if, for some reason, they need to disable it
 	  then noswapaccount does the trick).
 
+config CGROUP_PERF
+	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
+	depends on PERF_EVENTS && CGROUPS
+	help
+	  This option extends the per-cpu mode to restrict monitoring to
+	  threads which belong to the cgroup specificied and run on the
+	  designated cpu.
+
+	  Say N if unsure.
+
 menuconfig CGROUP_SCHED
 	bool "Group CPU scheduler"
 	depends on EXPERIMENTAL
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f6495f3..95362d1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
 	return ret;
 }
 
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+	struct cgroup *cgrp;
+	struct inode *inode;
+	struct cgroup_subsys_state *css;
+
+	inode = f->f_dentry->d_inode;
+	/* check in cgroup filesystem dir */
+	if (inode->i_op != &cgroup_dir_inode_operations)
+		return ERR_PTR(-EBADF);
+
+	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+		return ERR_PTR(-EINVAL);
+
+	/* get cgroup */
+	cgrp = __d_cgrp(f->f_dentry);
+	css = cgrp->subsys[id];
+	return css ? css : ERR_PTR(-ENOENT);
+}
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
 						   struct cgroup *cont)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d3f282..65dcdc7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 	return data.ret;
 }
 
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+		       PERF_FLAG_FD_OUTPUT  |\
+		       PERF_FLAG_PID_CGROUP)
+
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
-atomic_t perf_task_events __read_mostly;
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+atomic_t perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
@@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			      enum event_type_t event_type);
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type);
+			     enum event_type_t event_type,
+			     struct task_struct *task);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
 
 void __weak perf_event_print_debug(void)	{ }
 
@@ -162,6 +176,338 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+#ifdef CONFIG_CGROUP_PERF
+
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, perf_subsys_id),
+			struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+	return !event->cgrp || event->cgrp == cpuctx->cgrp;
+}
+
+static inline void perf_get_cgroup(struct perf_event *event)
+{
+	css_get(&event->cgrp->css);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+	css_put(&event->cgrp->css);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+	perf_put_cgroup(event);
+	event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	struct perf_cgroup_info *t;
+
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+	struct perf_cgroup_info *info;
+	u64 now;
+
+	now = perf_clock();
+
+	info = this_cpu_ptr(cgrp->info);
+
+	info->time += now - info->timestamp;
+	info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
+	if (cgrp_out)
+		__update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+	struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
+	/*
+	 * do not update time when cgroup is not active
+	 */
+	if (!event->cgrp || cgrp != event->cgrp)
+		return;
+
+	__update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{
+	struct perf_cgroup *cgrp;
+	struct perf_cgroup_info *info;
+
+	if (!task)
+		return;
+
+	cgrp = perf_cgroup_from_task(task);
+	info = this_cpu_ptr(cgrp->info);
+	info->timestamp = now;
+}
+
+#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/*
+	 * disable interrupts to avoid geting nr_cgroup
+	 * changes via __perf_event_disable(). Also
+	 * avoids preemption.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * we reschedule only in the presence of cgroup
+	 * constrained events.
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		perf_pmu_disable(cpuctx->ctx.pmu);
+
+		/*
+		 * perf_cgroup_events says at least one
+		 * context on this CPU has cgroup events.
+		 *
+		 * ctx->nr_cgroups reports the number of cgroup
+		 * events for a context.
+		 */
+		if (cpuctx->ctx.nr_cgroups > 0) {
+
+			if (mode & PERF_CGROUP_SWOUT) {
+				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+				/*
+				 * must not be done before ctxswout due
+				 * to event_filter_match() in event_sched_out()
+				 */
+				cpuctx->cgrp = NULL;
+			}
+
+			if (mode & PERF_CGROUP_SWIN) {
+				/* set cgrp before ctxsw in to
+				 * allow event_filter_match() to not
+				 * have to pass task around
+				 */
+				cpuctx->cgrp = perf_cgroup_from_task(task);
+				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+			}
+		}
+
+		perf_pmu_enable(cpuctx->ctx.pmu);
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+	perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+	perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	struct perf_cgroup *cgrp;
+	struct cgroup_subsys_state *css;
+	struct file *file;
+	int ret = 0, fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+
+	css = cgroup_css_from_dir(file, perf_subsys_id);
+	if (IS_ERR(css))
+		return PTR_ERR(css);
+
+	cgrp = container_of(css, struct perf_cgroup, css);
+	event->cgrp = cgrp;
+
+	/*
+	 * all events in a group must monitor
+	 * the same cgroup because a task belongs
+	 * to only one perf cgroup at a time
+	 */
+	if (group_leader && group_leader->cgrp != cgrp) {
+		perf_detach_cgroup(event);
+		ret = -EINVAL;
+	} else {
+		/* must be done before we fput() the file */
+		perf_get_cgroup(event);
+	}
+	fput_light(file, fput_needed);
+	return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+	struct perf_cgroup_info *t;
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+	/*
+	 * when the current task's perf cgroup does not match
+	 * the event's, we need to remember to call the
+	 * perf_mark_enable() function the first time a task with
+	 * a matching perf cgroup is scheduled in.
+	 */
+	if (is_cgroup_event(event) && !perf_cgroup_match(event))
+		event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+	struct perf_event *sub;
+	u64 tstamp = perf_event_time(event);
+
+	if (!event->cgrp_defer_enabled)
+		return;
+
+	event->cgrp_defer_enabled = 0;
+
+	event->tstamp_enabled = tstamp - event->total_time_enabled;
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+			sub->cgrp_defer_enabled = 0;
+		}
+	}
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+	return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{
+}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+}
+#endif
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -343,6 +689,10 @@ static void update_context_time(struct perf_event_context *ctx)
 static u64 perf_event_time(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
+
+	if (is_cgroup_event(event))
+		return perf_cgroup_event_time(event);
+
 	return ctx ? ctx->time : 0;
 }
 
@@ -357,9 +707,20 @@ static void update_event_times(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
-
-	if (ctx->is_active)
+	/*
+	 * in cgroup mode, time_enabled represents
+	 * the time the event was enabled AND active
+	 * tasks were in the monitored cgroup. This is
+	 * independent of the activity of the context as
+	 * there may be a mix of cgroup and non-cgroup events.
+	 *
+	 * That is why we treat cgroup events differently
+	 * here.
+	 */
+	if (is_cgroup_event(event))
 		run_end = perf_event_time(event);
+	else if (ctx->is_active)
+		run_end = ctx->time;
 	else
 		run_end = event->tstamp_stopped;
 
@@ -371,6 +732,7 @@ static void update_event_times(struct perf_event *event)
 		run_end = perf_event_time(event);
 
 	event->total_time_running = run_end - event->tstamp_running;
+
 }
 
 /*
@@ -419,6 +781,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
+	if (is_cgroup_event(event)) {
+		ctx->nr_cgroups++;
+		/*
+		 * one more event:
+		 * - that has cgroup constraint on event->cpu
+		 * - that may need work on context switch
+		 */
+		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_inc(&perf_sched_events);
+	}
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
 		perf_pmu_rotate_start(ctx->pmu);
@@ -545,6 +918,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
+	if (is_cgroup_event(event)) {
+		ctx->nr_cgroups--;
+		atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_dec(&perf_sched_events);
+	}
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -616,7 +995,8 @@ out:
 static inline int
 event_filter_match(struct perf_event *event)
 {
-	return event->cpu == -1 || event->cpu == smp_processor_id();
+	return (event->cpu == -1 || event->cpu == smp_processor_id())
+	    && perf_cgroup_match(event);
 }
 
 static void
@@ -634,7 +1014,7 @@ event_sched_out(struct perf_event *event,
 	 */
 	if (event->state == PERF_EVENT_STATE_INACTIVE
 	    && !event_filter_match(event)) {
-		delta = ctx->time - event->tstamp_stopped;
+		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
 	}
@@ -678,12 +1058,6 @@ group_sched_out(struct perf_event *group_event,
 		cpuctx->exclusive = 0;
 }
 
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
-	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
 /*
  * Cross CPU call to remove a performance event
  *
@@ -783,6 +1157,7 @@ static int __perf_event_disable(void *info)
 	 */
 	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
 		update_group_times(event);
 		if (event == event->group_leader)
 			group_sched_out(event, cpuctx, ctx);
@@ -851,6 +1226,41 @@ retry:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+static void perf_set_shadow_time(struct perf_event *event,
+				 struct perf_event_context *ctx,
+				 u64 tstamp)
+{
+	/*
+	 * use the correct time source for the time snapshot
+	 *
+	 * We could get by without this by leveraging the
+	 * fact that to get to this function, the caller
+	 * has most likely already called update_context_time()
+	 * and update_cgrp_time_xx() and thus both timestamp
+	 * are identical (or very close). Given that tstamp is,
+	 * already adjusted for cgroup, we could say that:
+	 *    tstamp - ctx->timestamp
+	 * is equivalent to
+	 *    tstamp - cgrp->timestamp.
+	 *
+	 * Then, in perf_output_read(), the calculation would
+	 * work with no changes because:
+	 * - event is guaranteed scheduled in
+	 * - no scheduled out in between
+	 * - thus the timestamp would be the same
+	 *
+	 * But this is a bit hairy.
+	 *
+	 * So instead, we have an explicit cgroup call to remain
+	 * within the time time source all along. We believe it
+	 * is cleaner and simpler to understand.
+	 */
+	if (is_cgroup_event(event))
+		perf_cgroup_set_shadow_time(event, tstamp);
+	else
+		event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
@@ -891,7 +1301,7 @@ event_sched_in(struct perf_event *event,
 
 	event->tstamp_running += tstamp - event->tstamp_stopped;
 
-	event->shadow_ctx_time = tstamp - ctx->timestamp;
+	perf_set_shadow_time(event, ctx, tstamp);
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
@@ -1012,7 +1422,8 @@ static void add_event_to_ctx(struct perf_event *event,
 	event->tstamp_stopped = tstamp;
 }
 
-static void perf_event_context_sched_in(struct perf_event_context *ctx);
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *tsk);
 
 /*
  * Cross CPU call to install and enable a performance event
@@ -1033,11 +1444,17 @@ static int  __perf_install_in_context(void *info)
 	 * which do context switches with IRQs enabled.
 	 */
 	if (ctx->task && !cpuctx->task_ctx)
-		perf_event_context_sched_in(ctx);
+		perf_event_context_sched_in(ctx, ctx->task);
 
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
+	/*
+	 * update cgrp time only if current cgrp
+	 * matches event->cgrp. Must be done before
+	 * calling add_event_to_ctx()
+	 */
+	update_cgrp_time_from_event(event);
 
 	add_event_to_ctx(event, ctx);
 
@@ -1175,10 +1592,19 @@ static int __perf_event_enable(void *info)
 
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		goto unlock;
+
+	/*
+	 * set current task's cgroup time reference point
+	 */
+	perf_cgroup_set_timestamp(current, perf_clock());
+
 	__perf_event_mark_enabled(event, ctx);
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event)) {
+		if (is_cgroup_event(event))
+			perf_cgroup_defer_enabled(event);
 		goto unlock;
+	}
 
 	/*
 	 * If the event is in a group and isn't the group leader,
@@ -1307,6 +1733,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	if (likely(!ctx->nr_events))
 		goto out;
 	update_context_time(ctx);
+	update_cgrp_time_from_cpuctx(cpuctx);
 
 	if (!ctx->nr_active)
 		goto out;
@@ -1496,6 +1923,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
 
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
+
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch out PMU state.
+	 * cgroup event are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_out(task);
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1534,6 +1969,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, 1))
 			group_sched_in(event, cpuctx, ctx);
 
@@ -1566,6 +2005,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
@@ -1576,15 +2019,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type)
+	     enum event_type_t event_type,
+	     struct task_struct *task)
 {
+	u64 now;
+
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	if (likely(!ctx->nr_events))
 		goto out;
 
-	ctx->timestamp = perf_clock();
-
+	now = perf_clock();
+	ctx->timestamp = now;
+	perf_cgroup_set_timestamp(task, now);
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
@@ -1601,11 +2048,12 @@ out:
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type)
+			     enum event_type_t event_type,
+			     struct task_struct *task)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, task);
 }
 
 static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1617,11 +2065,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
 	if (cpuctx->task_ctx == ctx)
 		return;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, NULL);
 	cpuctx->task_ctx = ctx;
 }
 
-static void perf_event_context_sched_in(struct perf_event_context *ctx)
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *task)
 {
 	struct perf_cpu_context *cpuctx;
 
@@ -1637,9 +2086,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx)
 	 */
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
-	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 
 	cpuctx->task_ctx = ctx;
 
@@ -1672,8 +2121,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
 		if (likely(!ctx))
 			continue;
 
-		perf_event_context_sched_in(ctx);
+		perf_event_context_sched_in(ctx, task);
 	}
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch in PMU state.
+	 * cgroup event are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_in(task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1873,7 +2329,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 	if (ctx)
 		rotate_ctx(ctx);
 
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
 	if (ctx)
 		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
 
@@ -1952,7 +2408,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_context_sched_in(ctx);
+	perf_event_context_sched_in(ctx, ctx->task);
 out:
 	local_irq_restore(flags);
 }
@@ -1977,8 +2433,10 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	if (ctx->is_active)
+	if (ctx->is_active) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
 	update_event_times(event);
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
 		event->pmu->read(event);
@@ -2009,8 +2467,10 @@ static u64 perf_event_read(struct perf_event *event)
 		 * (e.g., thread is blocked), in that case
 		 * we cannot update context time
 		 */
-		if (ctx->is_active)
+		if (ctx->is_active) {
 			update_context_time(ctx);
+			update_cgrp_time_from_event(event);
+		}
 		update_event_times(event);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
@@ -2395,7 +2855,7 @@ static void free_event(struct perf_event *event)
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_dec(&perf_task_events);
+			jump_label_dec(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -2411,6 +2871,9 @@ static void free_event(struct perf_event *event)
 		event->buffer = NULL;
 	}
 
+	if (is_cgroup_event(event))
+		perf_detach_cgroup(event);
+
 	if (event->destroy)
 		event->destroy(event);
 
@@ -5300,6 +5763,7 @@ static void task_clock_event_read(struct perf_event *event)
 
 	if (!in_nmi()) {
 		update_context_time(event->ctx);
+		update_cgrp_time_from_event(event);
 		time = event->ctx->time;
 	} else {
 		u64 now = perf_clock();
@@ -5725,7 +6189,7 @@ done:
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_inc(&perf_task_events);
+			jump_label_inc(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5900,7 +6364,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	int err;
 
 	/* for future expandability... */
-	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+	if (flags & ~PERF_FLAG_ALL)
 		return -EINVAL;
 
 	err = perf_copy_attr(attr_uptr, &attr);
@@ -5917,6 +6381,15 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/*
+	 * In cgroup mode, the pid argument is used to pass the fd
+	 * opened to the cgroup directory in cgroupfs. The cpu argument
+	 * designates the cpu on which to monitor threads from that
+	 * cgroup.
+	 */
+	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+		return -EINVAL;
+
 	event_fd = get_unused_fd_flags(O_RDWR);
 	if (event_fd < 0)
 		return event_fd;
@@ -5934,7 +6407,7 @@ SYSCALL_DEFINE5(perf_event_open,
 			group_leader = NULL;
 	}
 
-	if (pid != -1) {
+	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
 		task = find_lively_task_by_vpid(pid);
 		if (IS_ERR(task)) {
 			err = PTR_ERR(task);
@@ -5948,6 +6421,12 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_task;
 	}
 
+	if (flags & PERF_FLAG_PID_CGROUP) {
+		err = perf_cgroup_connect(pid, event, &attr, group_leader);
+		if (err)
+			goto err_alloc;
+	}
+
 	/*
 	 * Special case software events and allow them to be part of
 	 * any hardware group.
@@ -6808,3 +7287,92 @@ unlock:
 	return ret;
 }
 device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *perf_cgroup_create(
+	struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	struct perf_cgroup_info *t;
+	int c;
+
+	jc = kmalloc(sizeof(*jc), GFP_KERNEL);
+	if (!jc)
+		return ERR_PTR(-ENOMEM);
+
+	memset(jc, 0, sizeof(*jc));
+
+	jc->info = alloc_percpu(struct perf_cgroup_info);
+	if (!jc->info) {
+		kfree(jc);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for_each_possible_cpu(c) {
+		t = per_cpu_ptr(jc->info, c);
+		t->time = 0;
+		t->timestamp = 0;
+	}
+	return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
+			  struct perf_cgroup, css);
+	free_percpu(jc->info);
+	kfree(jc);
+}
+
+static int __perf_cgroup_move(void *info)
+{
+	struct task_struct *task = info;
+	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	return 0;
+}
+
+static void perf_cgroup_move(struct task_struct *task)
+{
+	task_function_call(task, __perf_cgroup_move, task);
+}
+
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task,
+		bool threadgroup)
+{
+	perf_cgroup_move(task);
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+			perf_cgroup_move(c);
+		}
+		rcu_read_unlock();
+	}
+}
+
+static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
+{
+	/*
+	 * cgroup_exit() is called in the copy_process() failure path.
+	 * Ignore this case since the task hasn't ran yet, this avoids
+	 * trying to poke a half freed task state from generic code.
+	 */
+	if (!(task->flags & PF_EXITING))
+		return;
+
+	perf_cgroup_move(task);
+}
+
+struct cgroup_subsys perf_subsys = {
+	.name = "perf_event",
+	.subsys_id = perf_subsys_id,
+	.create = perf_cgroup_create,
+	.destroy = perf_cgroup_destroy,
+	.exit = perf_cgroup_exit,
+	.attach = perf_cgroup_attach,
+};
+#endif /* CONFIG_CGROUP_PERF */
-- 
cgit v0.10.2


From 023695d96ee06f36cf5014e286edcd623e9fb847 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 14 Feb 2011 11:20:01 +0200
Subject: perf tool: Add cgroup support

This patch adds the ability to filter monitoring based on container groups
(cgroups) for both perf stat and perf record. It is possible to monitor
multiple cgroup in parallel. There is one cgroup per event. The cgroups to
monitor are passed via a new -G option followed by a comma separated list of
cgroup names.

The cgroup filesystem has to be mounted. Given a cgroup name, the perf tool
finds the corresponding directory in the cgroup filesystem and opens it. It
then passes that file descriptor to the kernel.

Example:

$ perf stat -B -a -e cycles:u,cycles:u,cycles:u -G test1,,test2 -- sleep 1
 Performance counter stats for 'sleep 1':

      2,368,667,414  cycles                   test1
      2,369,661,459  cycles
      <not counted>  cycles                   test2

        1.001856890  seconds time elapsed

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d590290.825bdf0a.7d0a.4890@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index e032716..5a520f8 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -137,6 +137,17 @@ Do not update the builid cache. This saves some overhead in situations
 where the information in the perf.data file (which includes buildids)
 is sufficient.
 
+-G name,...::
+--cgroup name,...::
+monitor only in the container (cgroup) called "name". This option is available only
+in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to
+container "name" are monitored when they run on the monitored CPUs. Multiple cgroups
+can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup
+to first event, second cgroup to second event and so on. It is possible to provide
+an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have
+corresponding events, i.e., they always refer to events defined earlier on the command
+line.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index b6da7af..918cc38 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -83,6 +83,17 @@ This option is only valid in system-wide mode.
 print counts using a CSV-style output to make it easy to import directly into
 spreadsheets. Columns are separated by the string specified in SEP.
 
+-G name::
+--cgroup name::
+monitor only in the container (cgroup) called "name". This option is available only
+in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to
+container "name" are monitored when they run on the monitored CPUs. Multiple cgroups
+can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup
+to first event, second cgroup to second event and so on. It is possible to provide
+an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have
+corresponding events, i.e., they always refer to events defined earlier on the command
+line.
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 94f73ab..bc4d9bf 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -442,6 +442,7 @@ LIB_H += util/pstack.h
 LIB_H += util/cpumap.h
 LIB_H += util/top.h
 LIB_H += $(ARCH_INCLUDE)
+LIB_H += util/cgroup.h
 
 LIB_OBJS += $(OUTPUT)util/abspath.o
 LIB_OBJS += $(OUTPUT)util/alias.o
@@ -496,6 +497,7 @@ LIB_OBJS += $(OUTPUT)util/probe-event.o
 LIB_OBJS += $(OUTPUT)util/util.o
 LIB_OBJS += $(OUTPUT)util/xyarray.o
 LIB_OBJS += $(OUTPUT)util/cpumap.o
+LIB_OBJS += $(OUTPUT)util/cgroup.o
 
 BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 12e0e41..a4aaadc 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -807,6 +807,9 @@ const struct option record_options[] = {
 		    "do not update the buildid cache"),
 	OPT_BOOLEAN('B', "no-buildid", &no_buildid,
 		    "do not collect buildids in perf.data"),
+	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
+		     "monitor event in cgroup name only",
+		     parse_cgroups),
 	OPT_END()
 };
 
@@ -835,6 +838,12 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		write_mode = WRITE_FORCE;
 	}
 
+	if (nr_cgroups && !system_wide) {
+		fprintf(stderr, "cgroup monitoring only available in"
+			" system-wide mode\n");
+		usage_with_options(record_usage, record_options);
+	}
+
 	symbol__init();
 
 	if (no_buildid_cache || no_buildid)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 806a999..21c0252 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -390,6 +390,9 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
 
 	fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));
 
+	if (evsel->cgrp)
+		fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
+
 	if (csv_output)
 		return;
 
@@ -420,6 +423,9 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
 
 	fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));
 
+	if (evsel->cgrp)
+		fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
+
 	if (csv_output)
 		return;
 
@@ -460,9 +466,17 @@ static void print_counter_aggr(struct perf_evsel *counter)
 	int scaled = counter->counts->scaled;
 
 	if (scaled == -1) {
-		fprintf(stderr, "%*s%s%-24s\n",
+		fprintf(stderr, "%*s%s%*s",
 			csv_output ? 0 : 18,
-			"<not counted>", csv_sep, event_name(counter));
+			"<not counted>",
+			csv_sep,
+			csv_output ? 0 : -24,
+			event_name(counter));
+
+		if (counter->cgrp)
+			fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
+
+		fputc('\n', stderr);
 		return;
 	}
 
@@ -487,7 +501,6 @@ static void print_counter_aggr(struct perf_evsel *counter)
 		fprintf(stderr, "  (scaled from %.2f%%)",
 				100 * avg_running / avg_enabled);
 	}
-
 	fprintf(stderr, "\n");
 }
 
@@ -505,14 +518,18 @@ static void print_counter(struct perf_evsel *counter)
 		ena = counter->counts->cpu[cpu].ena;
 		run = counter->counts->cpu[cpu].run;
 		if (run == 0 || ena == 0) {
-			fprintf(stderr, "CPU%*d%s%*s%s%-24s",
+			fprintf(stderr, "CPU%*d%s%*s%s%*s",
 				csv_output ? 0 : -4,
 				evsel_list->cpus->map[cpu], csv_sep,
 				csv_output ? 0 : 18,
 				"<not counted>", csv_sep,
+				csv_output ? 0 : -24,
 				event_name(counter));
 
-			fprintf(stderr, "\n");
+			if (counter->cgrp)
+				fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
+
+			fputc('\n', stderr);
 			continue;
 		}
 
@@ -529,7 +546,7 @@ static void print_counter(struct perf_evsel *counter)
 					100.0 * run / ena);
 			}
 		}
-		fprintf(stderr, "\n");
+		fputc('\n', stderr);
 	}
 }
 
@@ -642,6 +659,9 @@ static const struct option options[] = {
 		    "disable CPU count aggregation"),
 	OPT_STRING('x', "field-separator", &csv_sep, "separator",
 		   "print counts with custom separator"),
+	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
+		     "monitor event in cgroup name only",
+		     parse_cgroups),
 	OPT_END()
 };
 
@@ -682,9 +702,13 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 	if (run_count <= 0)
 		usage_with_options(stat_usage, options);
 
-	/* no_aggr is for system-wide only */
-	if (no_aggr && !system_wide)
+	/* no_aggr, cgroup are for system-wide only */
+	if ((no_aggr || nr_cgroups) && !system_wide) {
+		fprintf(stderr, "both cgroup and no-aggregation "
+			"modes only available in system-wide mode\n");
+
 		usage_with_options(stat_usage, options);
+	}
 
 	/* Set attrs and nr_counters if no event is selected and !null_run */
 	if (!null_run && !evsel_list->nr_entries) {
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
new file mode 100644
index 0000000..9fea755
--- /dev/null
+++ b/tools/perf/util/cgroup.c
@@ -0,0 +1,178 @@
+#include "util.h"
+#include "../perf.h"
+#include "parse-options.h"
+#include "evsel.h"
+#include "cgroup.h"
+#include "debugfs.h" /* MAX_PATH, STR() */
+#include "evlist.h"
+
+int nr_cgroups;
+
+static int
+cgroupfs_find_mountpoint(char *buf, size_t maxlen)
+{
+	FILE *fp;
+	char mountpoint[MAX_PATH+1], tokens[MAX_PATH+1], type[MAX_PATH+1];
+	char *token, *saved_ptr;
+	int found = 0;
+
+	fp = fopen("/proc/mounts", "r");
+	if (!fp)
+		return -1;
+
+	/*
+	 * in order to handle split hierarchy, we need to scan /proc/mounts
+	 * and inspect every cgroupfs mount point to find one that has
+	 * perf_event subsystem
+	 */
+	while (fscanf(fp, "%*s %"STR(MAX_PATH)"s %"STR(MAX_PATH)"s %"
+				STR(MAX_PATH)"s %*d %*d\n",
+				mountpoint, type, tokens) == 3) {
+
+		if (!strcmp(type, "cgroup")) {
+
+			token = strtok_r(tokens, ",", &saved_ptr);
+
+			while (token != NULL) {
+				if (!strcmp(token, "perf_event")) {
+					found = 1;
+					break;
+				}
+				token = strtok_r(NULL, ",", &saved_ptr);
+			}
+		}
+		if (found)
+			break;
+	}
+	fclose(fp);
+	if (!found)
+		return -1;
+
+	if (strlen(mountpoint) < maxlen) {
+		strcpy(buf, mountpoint);
+		return 0;
+	}
+	return -1;
+}
+
+static int open_cgroup(char *name)
+{
+	char path[MAX_PATH+1];
+	char mnt[MAX_PATH+1];
+	int fd;
+
+
+	if (cgroupfs_find_mountpoint(mnt, MAX_PATH+1))
+		return -1;
+
+	snprintf(path, MAX_PATH, "%s/%s", mnt, name);
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1)
+		fprintf(stderr, "no access to cgroup %s\n", path);
+
+	return fd;
+}
+
+static int add_cgroup(struct perf_evlist *evlist, char *str)
+{
+	struct perf_evsel *counter;
+	struct cgroup_sel *cgrp = NULL;
+	int n;
+	/*
+	 * check if cgrp is already defined, if so we reuse it
+	 */
+	list_for_each_entry(counter, &evlist->entries, node) {
+		cgrp = counter->cgrp;
+		if (!cgrp)
+			continue;
+		if (!strcmp(cgrp->name, str))
+			break;
+
+		cgrp = NULL;
+	}
+
+	if (!cgrp) {
+		cgrp = zalloc(sizeof(*cgrp));
+		if (!cgrp)
+			return -1;
+
+		cgrp->name = str;
+
+		cgrp->fd = open_cgroup(str);
+		if (cgrp->fd == -1) {
+			free(cgrp);
+			return -1;
+		}
+	}
+
+	/*
+	 * find corresponding event
+	 * if add cgroup N, then need to find event N
+	 */
+	n = 0;
+	list_for_each_entry(counter, &evlist->entries, node) {
+		if (n == nr_cgroups)
+			goto found;
+		n++;
+	}
+	if (cgrp->refcnt == 0)
+		free(cgrp);
+
+	return -1;
+found:
+	cgrp->refcnt++;
+	counter->cgrp = cgrp;
+	return 0;
+}
+
+void close_cgroup(struct cgroup_sel *cgrp)
+{
+	if (!cgrp)
+		return;
+
+	/* XXX: not reentrant */
+	if (--cgrp->refcnt == 0) {
+		close(cgrp->fd);
+		free(cgrp->name);
+		free(cgrp);
+	}
+}
+
+int parse_cgroups(const struct option *opt __used, const char *str,
+		  int unset __used)
+{
+	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
+	const char *p, *e, *eos = str + strlen(str);
+	char *s;
+	int ret;
+
+	if (list_empty(&evlist->entries)) {
+		fprintf(stderr, "must define events before cgroups\n");
+		return -1;
+	}
+
+	for (;;) {
+		p = strchr(str, ',');
+		e = p ? p : eos;
+
+		/* allow empty cgroups, i.e., skip */
+		if (e - str) {
+			/* termination added */
+			s = strndup(str, e - str);
+			if (!s)
+				return -1;
+			ret = add_cgroup(evlist, s);
+			if (ret) {
+				free(s);
+				return -1;
+			}
+		}
+		/* nr_cgroups is increased een for empty cgroups */
+		nr_cgroups++;
+		if (!p)
+			break;
+		str = p+1;
+	}
+	return 0;
+}
diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h
new file mode 100644
index 0000000..89acd6d
--- /dev/null
+++ b/tools/perf/util/cgroup.h
@@ -0,0 +1,17 @@
+#ifndef __CGROUP_H__
+#define __CGROUP_H__
+
+struct option;
+
+struct cgroup_sel {
+	char *name;
+	int fd;
+	int refcnt;
+};
+
+
+extern int nr_cgroups; /* number of explicit cgroups defined */
+extern void close_cgroup(struct cgroup_sel *cgrp);
+extern int parse_cgroups(const struct option *opt, const char *str, int unset);
+
+#endif /* __CGROUP_H__ */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 211063e..c974e08 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -85,6 +85,7 @@ void perf_evsel__exit(struct perf_evsel *evsel)
 void perf_evsel__delete(struct perf_evsel *evsel)
 {
 	perf_evsel__exit(evsel);
+	close_cgroup(evsel->cgrp);
 	free(evsel);
 }
 
@@ -163,21 +164,32 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 			      struct thread_map *threads, bool group, bool inherit)
 {
 	int cpu, thread;
+	unsigned long flags = 0;
+	int pid = -1;
 
 	if (evsel->fd == NULL &&
 	    perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0)
 		return -1;
 
+	if (evsel->cgrp) {
+		flags = PERF_FLAG_PID_CGROUP;
+		pid = evsel->cgrp->fd;
+	}
+
 	for (cpu = 0; cpu < cpus->nr; cpu++) {
 		int group_fd = -1;
 
 		evsel->attr.inherit = (cpus->map[cpu] < 0) && inherit;
 
 		for (thread = 0; thread < threads->nr; thread++) {
+
+			if (!evsel->cgrp)
+				pid = threads->map[thread];
+
 			FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
-								     threads->map[thread],
+								     pid,
 								     cpus->map[cpu],
-								     group_fd, 0);
+								     group_fd, flags);
 			if (FD(evsel, cpu, thread) < 0)
 				goto out_close;
 
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index eecdc3a..1d3d5a3 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -6,6 +6,7 @@
 #include "../../../include/linux/perf_event.h"
 #include "types.h"
 #include "xyarray.h"
+#include "cgroup.h"
  
 struct perf_counts_values {
 	union {
@@ -45,6 +46,7 @@ struct perf_evsel {
 	struct perf_counts	*counts;
 	int			idx;
 	void			*priv;
+	struct cgroup_sel	*cgrp;
 };
 
 struct cpu_map;
-- 
cgit v0.10.2


From d45dd923fcc620c948bd1eda16cc61426ac31646 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 2 Feb 2011 17:40:56 +0100
Subject: perf, x86: Use helper function in x86_pmu_enable_all()

Use helper function in x86_pmu_enable_all() to minimize access to
x86_pmu.eventsel in the fast path. The counter's msr address is now
calculated using struct hw_perf_event. Later we add code that
calculates the msr addresses with a table lookup which shouldn't be
done in the fast path.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1296664860-10886-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4d98789..70d6d8f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -642,21 +642,24 @@ static void x86_pmu_disable(struct pmu *pmu)
 	x86_pmu.disable_all();
 }
 
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+					  u64 enable_mask)
+{
+	wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
+}
+
 static void x86_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		struct perf_event *event = cpuc->events[idx];
-		u64 val;
+		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
 
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 
-		val = event->hw.config;
-		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-		wrmsrl(x86_pmu.eventsel + idx, val);
+		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
 	}
 }
 
@@ -915,12 +918,6 @@ static void x86_pmu_enable(struct pmu *pmu)
 	x86_pmu.enable_all(added);
 }
 
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
-					  u64 enable_mask)
-{
-	wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
-}
-
 static inline void x86_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-- 
cgit v0.10.2


From 41bf498949a263fa0b2d32524b89d696ac330e94 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 2 Feb 2011 17:40:57 +0100
Subject: perf, x86: Calculate perfctr msr addresses in helper functions

This patch adds helper functions to calculate perfctr msr addresses.
We need this to later add support for AMD family 15h cpus. For this we
have to change the algorithms to generate the perfctr's msr addresses.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1296664860-10886-3-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 70d6d8f..ee40c1ad 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -321,6 +321,16 @@ again:
 	return new_raw_count;
 }
 
+static inline unsigned int x86_pmu_config_addr(int index)
+{
+	return x86_pmu.eventsel + index;
+}
+
+static inline unsigned int x86_pmu_event_addr(int index)
+{
+	return x86_pmu.perfctr + index;
+}
+
 static atomic_t active_events;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
@@ -331,12 +341,12 @@ static bool reserve_pmc_hardware(void)
 	int i;
 
 	for (i = 0; i < x86_pmu.num_counters; i++) {
-		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
+		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
 			goto perfctr_fail;
 	}
 
 	for (i = 0; i < x86_pmu.num_counters; i++) {
-		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
+		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
 			goto eventsel_fail;
 	}
 
@@ -344,13 +354,13 @@ static bool reserve_pmc_hardware(void)
 
 eventsel_fail:
 	for (i--; i >= 0; i--)
-		release_evntsel_nmi(x86_pmu.eventsel + i);
+		release_evntsel_nmi(x86_pmu_config_addr(i));
 
 	i = x86_pmu.num_counters;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
-		release_perfctr_nmi(x86_pmu.perfctr + i);
+		release_perfctr_nmi(x86_pmu_event_addr(i));
 
 	return false;
 }
@@ -360,8 +370,8 @@ static void release_pmc_hardware(void)
 	int i;
 
 	for (i = 0; i < x86_pmu.num_counters; i++) {
-		release_perfctr_nmi(x86_pmu.perfctr + i);
-		release_evntsel_nmi(x86_pmu.eventsel + i);
+		release_perfctr_nmi(x86_pmu_event_addr(i));
+		release_evntsel_nmi(x86_pmu_config_addr(i));
 	}
 }
 
@@ -382,7 +392,7 @@ static bool check_hw_exists(void)
 	 * complain and bail.
 	 */
 	for (i = 0; i < x86_pmu.num_counters; i++) {
-		reg = x86_pmu.eventsel + i;
+		reg = x86_pmu_config_addr(i);
 		ret = rdmsrl_safe(reg, &val);
 		if (ret)
 			goto msr_fail;
@@ -407,8 +417,8 @@ static bool check_hw_exists(void)
 	 * that don't trap on the MSR access and always return 0s.
 	 */
 	val = 0xabcdUL;
-	ret = checking_wrmsrl(x86_pmu.perfctr, val);
-	ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
+	ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
+	ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
 	if (ret || val != val_new)
 		goto msr_fail;
 
@@ -617,11 +627,11 @@ static void x86_pmu_disable_all(void)
 
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
-		rdmsrl(x86_pmu.eventsel + idx, val);
+		rdmsrl(x86_pmu_config_addr(idx), val);
 		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
 			continue;
 		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-		wrmsrl(x86_pmu.eventsel + idx, val);
+		wrmsrl(x86_pmu_config_addr(idx), val);
 	}
 }
 
@@ -1110,8 +1120,8 @@ void perf_event_print_debug(void)
 	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
-		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
+		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
+		rdmsrl(x86_pmu_event_addr(idx), pmc_count);
 
 		prev_left = per_cpu(pmc_prev_left[idx], cpu);
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 008835c..084b383 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -691,8 +691,8 @@ static void intel_pmu_reset(void)
 	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
-		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+		checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
+		checking_wrmsrl(x86_pmu_event_addr(idx),  0ull);
 	}
 	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
 		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-- 
cgit v0.10.2


From 69d8e1e8ac0a7d829f1c0fd5bd07eb3022d9a1a0 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 2 Feb 2011 17:40:58 +0100
Subject: perf, x86: Add new AMD family 15h msrs to perfctr reservation code

This patch allows the reservation of perfctrs with new msr addresses
introduced for AMD cpu family 15h (0xc0010200/0xc0010201, etc).

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1296664860-10886-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d5a2366..966512b 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -46,6 +46,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
 	/* returns the bit offset of the performance counter register */
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_AMD:
+		if (msr >= MSR_F15H_PERF_CTR)
+			return (msr - MSR_F15H_PERF_CTR) >> 1;
 		return msr - MSR_K7_PERFCTR0;
 	case X86_VENDOR_INTEL:
 		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -70,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
 	/* returns the bit offset of the event selection register */
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_AMD:
+		if (msr >= MSR_F15H_PERF_CTL)
+			return (msr - MSR_F15H_PERF_CTL) >> 1;
 		return msr - MSR_K7_EVNTSEL0;
 	case X86_VENDOR_INTEL:
 		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-- 
cgit v0.10.2


From 73d6e52206a20354738418625cedc244cbfd5023 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 2 Feb 2011 17:40:59 +0100
Subject: perf, x86: Store perfctr msr addresses in config_base/event_base

Instead of storing the base addresses we can store the counter's msr
addresses directly in config_base/event_base of struct hw_perf_event.
This avoids recalculating the address with each msr access. The
addresses are configured one time. We also need this change to later
modify the address calculation.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1296664860-10886-5-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ee40c1ad..3161943 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -298,7 +298,7 @@ x86_perf_event_update(struct perf_event *event)
 	 */
 again:
 	prev_raw_count = local64_read(&hwc->prev_count);
-	rdmsrl(hwc->event_base + idx, new_raw_count);
+	rdmsrl(hwc->event_base, new_raw_count);
 
 	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 					new_raw_count) != prev_raw_count)
@@ -655,7 +655,7 @@ static void x86_pmu_disable(struct pmu *pmu)
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
 {
-	wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
+	wrmsrl(hwc->config_base, hwc->config | enable_mask);
 }
 
 static void x86_pmu_enable_all(int added)
@@ -834,15 +834,10 @@ static inline void x86_assign_hw_event(struct perf_event *event,
 		hwc->event_base	= 0;
 	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
 		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-		/*
-		 * We set it so that event_base + idx in wrmsr/rdmsr maps to
-		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
-		 */
-		hwc->event_base =
-			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0;
 	} else {
-		hwc->config_base = x86_pmu.eventsel;
-		hwc->event_base  = x86_pmu.perfctr;
+		hwc->config_base = x86_pmu_config_addr(hwc->idx);
+		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
 	}
 }
 
@@ -932,7 +927,7 @@ static inline void x86_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	wrmsrl(hwc->config_base + hwc->idx, hwc->config);
+	wrmsrl(hwc->config_base, hwc->config);
 }
 
 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -985,7 +980,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	 */
 	local64_set(&hwc->prev_count, (u64)-left);
 
-	wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
+	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
 
 	/*
 	 * Due to erratum on certan cpu we need
@@ -993,7 +988,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	 * is updated properly
 	 */
 	if (x86_pmu.perfctr_second_write) {
-		wrmsrl(hwc->event_base + idx,
+		wrmsrl(hwc->event_base,
 			(u64)(-left) & x86_pmu.cntval_mask);
 	}
 
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ff751a9..3769ac82 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -764,9 +764,9 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
 	u64 v;
 
 	/* an official way for overflow indication */
-	rdmsrl(hwc->config_base + hwc->idx, v);
+	rdmsrl(hwc->config_base, v);
 	if (v & P4_CCCR_OVF) {
-		wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF);
+		wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
 		return 1;
 	}
 
@@ -815,7 +815,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
 	 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
 	 * asserted again and again
 	 */
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+	(void)checking_wrmsrl(hwc->config_base,
 		(u64)(p4_config_unpack_cccr(hwc->config)) &
 			~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
 }
@@ -885,7 +885,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	p4_pmu_enable_pebs(hwc->config);
 
 	(void)checking_wrmsrl(escr_addr, escr_conf);
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+	(void)checking_wrmsrl(hwc->config_base,
 				(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 34ba07b..20c097e 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event)
 	if (cpuc->enabled)
 		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
+	(void)checking_wrmsrl(hwc->config_base, val);
 }
 
 static void p6_pmu_enable_event(struct perf_event *event)
@@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
 	if (cpuc->enabled)
 		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
+	(void)checking_wrmsrl(hwc->config_base, val);
 }
 
 static __initconst const struct x86_pmu p6_pmu = {
-- 
cgit v0.10.2


From 4979d2729af22f6ce8faa325fc60a85a2c2daa02 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 2 Feb 2011 17:36:12 +0100
Subject: perf, x86: Add support for AMD family 15h core counters

This patch adds support for AMD family 15h core counters. There are
major changes compared to family 10h. First, there is a new perfctr
msr range for up to 6 counters. Northbridge counters are separate
now. This patch only adds support for core counters. Second, certain
events may only be scheduled on certain counters. For this we need to
extend the event scheduling and constraints.

We use cpu feature flags to calculate family 15h msr address offsets.
This way we later can implement a faster ALTERNATIVE() version for
this.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110215135210.GB5874@erda.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 220e2ea..91f3e087 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -160,6 +160,7 @@
 #define X86_FEATURE_NODEID_MSR	(6*32+19) /* NodeId MSR */
 #define X86_FEATURE_TBM		(6*32+21) /* trailing bit manipulations */
 #define X86_FEATURE_TOPOEXT	(6*32+22) /* topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -279,6 +280,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_xsave		boot_cpu_has(X86_FEATURE_XSAVE)
 #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq	boot_cpu_has(X86_FEATURE_PCLMULQDQ)
+#define cpu_has_perfctr_core	boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
 
 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3161943..10bfe24 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -321,14 +321,22 @@ again:
 	return new_raw_count;
 }
 
+/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
+static inline int x86_pmu_addr_offset(int index)
+{
+	if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+		return index << 1;
+	return index;
+}
+
 static inline unsigned int x86_pmu_config_addr(int index)
 {
-	return x86_pmu.eventsel + index;
+	return x86_pmu.eventsel + x86_pmu_addr_offset(index);
 }
 
 static inline unsigned int x86_pmu_event_addr(int index)
 {
-	return x86_pmu.perfctr + index;
+	return x86_pmu.perfctr + x86_pmu_addr_offset(index);
 }
 
 static atomic_t active_events;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 67e2202..461f62b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -127,6 +127,11 @@ static int amd_pmu_hw_config(struct perf_event *event)
 /*
  * AMD64 events are detected based on their event codes.
  */
+static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
+{
+	return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+}
+
 static inline int amd_is_nb_event(struct hw_perf_event *hwc)
 {
 	return (hwc->config & 0xe0) == 0xe0;
@@ -385,13 +390,181 @@ static __initconst const struct x86_pmu amd_pmu = {
 	.cpu_dead		= amd_pmu_cpu_dead,
 };
 
+/* AMD Family 15h */
+
+#define AMD_EVENT_TYPE_MASK	0x000000F0ULL
+
+#define AMD_EVENT_FP		0x00000000ULL ... 0x00000010ULL
+#define AMD_EVENT_LS		0x00000020ULL ... 0x00000030ULL
+#define AMD_EVENT_DC		0x00000040ULL ... 0x00000050ULL
+#define AMD_EVENT_CU		0x00000060ULL ... 0x00000070ULL
+#define AMD_EVENT_IC_DE		0x00000080ULL ... 0x00000090ULL
+#define AMD_EVENT_EX_LS		0x000000C0ULL
+#define AMD_EVENT_DE		0x000000D0ULL
+#define AMD_EVENT_NB		0x000000E0ULL ... 0x000000F0ULL
+
+/*
+ * AMD family 15h event code/PMC mappings:
+ *
+ * type = event_code & 0x0F0:
+ *
+ * 0x000	FP	PERF_CTL[5:3]
+ * 0x010	FP	PERF_CTL[5:3]
+ * 0x020	LS	PERF_CTL[5:0]
+ * 0x030	LS	PERF_CTL[5:0]
+ * 0x040	DC	PERF_CTL[5:0]
+ * 0x050	DC	PERF_CTL[5:0]
+ * 0x060	CU	PERF_CTL[2:0]
+ * 0x070	CU	PERF_CTL[2:0]
+ * 0x080	IC/DE	PERF_CTL[2:0]
+ * 0x090	IC/DE	PERF_CTL[2:0]
+ * 0x0A0	---
+ * 0x0B0	---
+ * 0x0C0	EX/LS	PERF_CTL[5:0]
+ * 0x0D0	DE	PERF_CTL[2:0]
+ * 0x0E0	NB	NB_PERF_CTL[3:0]
+ * 0x0F0	NB	NB_PERF_CTL[3:0]
+ *
+ * Exceptions:
+ *
+ * 0x003	FP	PERF_CTL[3]
+ * 0x00B	FP	PERF_CTL[3]
+ * 0x00D	FP	PERF_CTL[3]
+ * 0x023	DE	PERF_CTL[2:0]
+ * 0x02D	LS	PERF_CTL[3]
+ * 0x02E	LS	PERF_CTL[3,0]
+ * 0x043	CU	PERF_CTL[2:0]
+ * 0x045	CU	PERF_CTL[2:0]
+ * 0x046	CU	PERF_CTL[2:0]
+ * 0x054	CU	PERF_CTL[2:0]
+ * 0x055	CU	PERF_CTL[2:0]
+ * 0x08F	IC	PERF_CTL[0]
+ * 0x187	DE	PERF_CTL[0]
+ * 0x188	DE	PERF_CTL[0]
+ * 0x0DB	EX	PERF_CTL[5:0]
+ * 0x0DC	LS	PERF_CTL[5:0]
+ * 0x0DD	LS	PERF_CTL[5:0]
+ * 0x0DE	LS	PERF_CTL[5:0]
+ * 0x0DF	LS	PERF_CTL[5:0]
+ * 0x1D6	EX	PERF_CTL[5:0]
+ * 0x1D8	EX	PERF_CTL[5:0]
+ */
+
+static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
+static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
+static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
+static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
+
+static struct event_constraint *
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+	unsigned int event_code = amd_get_event_code(&event->hw);
+
+	switch (event_code & AMD_EVENT_TYPE_MASK) {
+	case AMD_EVENT_FP:
+		switch (event_code) {
+		case 0x003:
+		case 0x00B:
+		case 0x00D:
+			return &amd_f15_PMC3;
+		default:
+			return &amd_f15_PMC53;
+		}
+	case AMD_EVENT_LS:
+	case AMD_EVENT_DC:
+	case AMD_EVENT_EX_LS:
+		switch (event_code) {
+		case 0x023:
+		case 0x043:
+		case 0x045:
+		case 0x046:
+		case 0x054:
+		case 0x055:
+			return &amd_f15_PMC20;
+		case 0x02D:
+			return &amd_f15_PMC3;
+		case 0x02E:
+			return &amd_f15_PMC30;
+		default:
+			return &amd_f15_PMC50;
+		}
+	case AMD_EVENT_CU:
+	case AMD_EVENT_IC_DE:
+	case AMD_EVENT_DE:
+		switch (event_code) {
+		case 0x08F:
+		case 0x187:
+		case 0x188:
+			return &amd_f15_PMC0;
+		case 0x0DB ... 0x0DF:
+		case 0x1D6:
+		case 0x1D8:
+			return &amd_f15_PMC50;
+		default:
+			return &amd_f15_PMC20;
+		}
+	case AMD_EVENT_NB:
+		/* not yet implemented */
+		return &emptyconstraint;
+	default:
+		return &emptyconstraint;
+	}
+}
+
+static __initconst const struct x86_pmu amd_pmu_f15h = {
+	.name			= "AMD Family 15h",
+	.handle_irq		= x86_pmu_handle_irq,
+	.disable_all		= x86_pmu_disable_all,
+	.enable_all		= x86_pmu_enable_all,
+	.enable			= x86_pmu_enable_event,
+	.disable		= x86_pmu_disable_event,
+	.hw_config		= amd_pmu_hw_config,
+	.schedule_events	= x86_schedule_events,
+	.eventsel		= MSR_F15H_PERF_CTL,
+	.perfctr		= MSR_F15H_PERF_CTR,
+	.event_map		= amd_pmu_event_map,
+	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+	.num_counters		= 6,
+	.cntval_bits		= 48,
+	.cntval_mask		= (1ULL << 48) - 1,
+	.apic			= 1,
+	/* use highest bit to detect overflow */
+	.max_period		= (1ULL << 47) - 1,
+	.get_event_constraints	= amd_get_event_constraints_f15h,
+	/* nortbridge counters not yet implemented: */
+#if 0
+	.put_event_constraints	= amd_put_event_constraints,
+
+	.cpu_prepare		= amd_pmu_cpu_prepare,
+	.cpu_starting		= amd_pmu_cpu_starting,
+	.cpu_dead		= amd_pmu_cpu_dead,
+#endif
+};
+
 static __init int amd_pmu_init(void)
 {
 	/* Performance-monitoring supported from K7 and later: */
 	if (boot_cpu_data.x86 < 6)
 		return -ENODEV;
 
-	x86_pmu = amd_pmu;
+	/*
+	 * If core performance counter extensions exists, it must be
+	 * family 15h, otherwise fail. See x86_pmu_addr_offset().
+	 */
+	switch (boot_cpu_data.x86) {
+	case 0x15:
+		if (!cpu_has_perfctr_core)
+			return -ENODEV;
+		x86_pmu = amd_pmu_f15h;
+		break;
+	default:
+		if (cpu_has_perfctr_core)
+			return -ENODEV;
+		x86_pmu = amd_pmu;
+		break;
+	}
 
 	/* Events are common for all AMDs */
 	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
-- 
cgit v0.10.2


From 163ec4354a5135c6c38c3f4a9b46a31900ebdf48 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 16 Feb 2011 11:22:34 +0100
Subject: perf: Optimize throttling code

By pre-computing the maximum number of samples per tick we can avoid a
multiplication and a conditional since MAX_INTERRUPTS >
max_samples_per_tick.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 38c8b25..8ceb5a6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1110,6 +1110,10 @@ extern int sysctl_perf_event_paranoid;
 extern int sysctl_perf_event_mlock;
 extern int sysctl_perf_event_sample_rate;
 
+extern int perf_proc_update_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
 	return sysctl_perf_event_paranoid > -1;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 65dcdc7..e03be08 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -150,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
 /*
  * max perf event sample rate
  */
-int sysctl_perf_event_sample_rate __read_mostly = 100000;
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int max_samples_per_tick __read_mostly =
+	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+
+int perf_proc_update_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+
+	return 0;
+}
 
 static atomic64_t perf_event_id;
 
@@ -4941,26 +4958,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	if (unlikely(!is_sampling_event(event)))
 		return 0;
 
-	if (!throttle) {
-		hwc->interrupts++;
-	} else {
-		if (hwc->interrupts != MAX_INTERRUPTS) {
-			hwc->interrupts++;
-			if (HZ * hwc->interrupts >
-					(u64)sysctl_perf_event_sample_rate) {
-				hwc->interrupts = MAX_INTERRUPTS;
-				perf_log_throttle(event, 0);
-				ret = 1;
-			}
-		} else {
-			/*
-			 * Keep re-disabling events even though on the previous
-			 * pass we disabled it - just in case we raced with a
-			 * sched-in and the event got enabled again:
-			 */
+	if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
+		if (throttle) {
+			hwc->interrupts = MAX_INTERRUPTS;
+			perf_log_throttle(event, 0);
 			ret = 1;
 		}
-	}
+	} else
+		hwc->interrupts++;
 
 	if (event->attr.freq) {
 		u64 now = perf_clock();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0f1bd83..daef911 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -948,7 +948,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_perf_event_sample_rate,
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= perf_proc_update_handler,
 	},
 #endif
 #ifdef CONFIG_KMEMCHECK
-- 
cgit v0.10.2


From ba3dd36c6775264ee6e7354ba1aabcd6e86d7298 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 15 Feb 2011 12:41:46 +0100
Subject: perf: Optimize hrtimer events

There is no need to re-initialize the hrtimer every time we start it,
so don't do that (shaves a few cycles). Also, since we know hrtimers
run at a fixed rate (nanoseconds) we can pre-compute the desired
frequency at which they tick. This avoids us having to go through the
whole adaptive frequency feedback logic (shaves another few cycles).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1297448589.5226.47.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e03be08..a0a6987 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5602,6 +5602,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	u64 period;
 
 	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+	if (event->state != PERF_EVENT_STATE_ACTIVE)
+		return HRTIMER_NORESTART;
+
 	event->pmu->read(event);
 
 	perf_sample_data_init(&data, 0);
@@ -5628,9 +5632,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
 	if (!is_sampling_event(event))
 		return;
 
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swevent_hrtimer;
-
 	period = local64_read(&hwc->period_left);
 	if (period) {
 		if (period < 0)
@@ -5657,6 +5658,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 	}
 }
 
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!is_sampling_event(event))
+		return;
+
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+
+	/*
+	 * Since hrtimers have a fixed rate, we can do a static freq->period
+	 * mapping and avoid the whole period adjust feedback stuff.
+	 */
+	if (event->attr.freq) {
+		long freq = event->attr.sample_freq;
+
+		event->attr.sample_period = NSEC_PER_SEC / freq;
+		hwc->sample_period = event->attr.sample_period;
+		local64_set(&hwc->period_left, hwc->sample_period);
+		event->attr.freq = 0;
+	}
+}
+
 /*
  * Software event: cpu wall time clock
  */
@@ -5709,6 +5734,8 @@ static int cpu_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
 		return -ENOENT;
 
+	perf_swevent_init_hrtimer(event);
+
 	return 0;
 }
 
@@ -5787,6 +5814,8 @@ static int task_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
 		return -ENOENT;
 
+	perf_swevent_init_hrtimer(event);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 46e49b3836c7cd2ae5b5fe76fa981d0d292a52fe Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Mon, 14 Feb 2011 14:38:50 -0800
Subject: sched: Wholesale removal of sd_idle logic

sd_idle logic was introduced way back in 2005 (commit 5969fe06),
as an HT optimization.

As per the discussion in the thread here:

  lkml - sched: Resolve sd_idle and first_idle_cpu Catch-22 - v1
  https://patchwork.kernel.org/patch/532501/

The capacity based logic in the load balancer right now handles this
in a much cleaner way, handling more than 2 SMT siblings etc, and sd_idle
does not seem to bring any additional benefits. sd_idle logic also has
some bugs that has performance impact. Here is the patch that removes
the sd_idle logic altogether.

Also, there was a dependency of sched_mc_power_savings == 2, with sd_idle
logic.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Acked-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1297723130-693-1-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0270246..d384e73 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2672,7 +2672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
  * @local_group: Does group contain this_cpu.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
@@ -2680,7 +2679,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  */
 static inline void update_sg_lb_stats(struct sched_domain *sd,
 			struct sched_group *group, int this_cpu,
-			enum cpu_idle_type idle, int load_idx, int *sd_idle,
+			enum cpu_idle_type idle, int load_idx,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
 {
@@ -2700,9 +2699,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if (*sd_idle && rq->nr_running)
-			*sd_idle = 0;
-
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
 			if (idle_cpu(i) && !first_idle_cpu) {
@@ -2817,15 +2813,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
-			enum cpu_idle_type idle, int *sd_idle,
-			const struct cpumask *cpus, int *balance,
-			struct sd_lb_stats *sds)
+			enum cpu_idle_type idle, const struct cpumask *cpus,
+			int *balance, struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *sg = sd->groups;
@@ -2843,7 +2837,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
 		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
 				local_group, cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
@@ -3095,7 +3089,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
  * @imbalance: Variable which stores amount of weighted load which should
  *		be moved to restore balance/put a group to idle.
  * @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
  * @cpus: The set of CPUs under consideration for load-balancing.
  * @balance: Pointer to a variable indicating if this_cpu
  *	is the appropriate cpu to perform load balancing at this_level.
@@ -3108,7 +3101,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const struct cpumask *cpus, int *balance)
+		   const struct cpumask *cpus, int *balance)
 {
 	struct sd_lb_stats sds;
 
@@ -3118,8 +3111,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
-					balance, &sds);
+	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
 
 	/* Cases where imbalance does not exist from POV of this_cpu */
 	/* 1) this_cpu is not the appropriate cpu to perform load balancing
@@ -3255,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+static int need_active_balance(struct sched_domain *sd, int idle,
 			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
@@ -3287,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
 		 * move_tasks() will succeed.  ld_moved will be true and this
 		 * active balance code will not be triggered.
 		 */
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-			return 0;
-
 		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
 			return 0;
 	}
@@ -3308,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+	int ld_moved, all_pinned = 0, active_balance = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
@@ -3317,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpumask_copy(cpus, cpu_active_mask);
 
-	/*
-	 * When power savings policy is enabled for the parent domain, idle
-	 * sibling can pick up load irrespective of busy siblings. In this case,
-	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
-	 * portraying it as CPU_NOT_IDLE.
-	 */
-	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		sd_idle = 1;
-
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
 				   cpus, balance);
 
 	if (*balance == 0)
@@ -3392,8 +3370,7 @@ redo:
 		if (idle != CPU_NEWLY_IDLE)
 			sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
-					this_cpu)) {
+		if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
@@ -3448,10 +3425,6 @@ redo:
 			sd->balance_interval *= 2;
 	}
 
-	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
 	goto out;
 
 out_balanced:
@@ -3465,11 +3438,7 @@ out_one_pinned:
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
+	ld_moved = 0;
 out:
 	return ld_moved;
 }
-- 
cgit v0.10.2


From 48228f7b470a74b6469a250d2977a13128d8fe96 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 8 Feb 2011 12:39:54 -0500
Subject: lockdep/timers: Explain in detail the locking problems
 del_timer_sync() may cause

Twice I had to explain the output about why lockdep gives an error with
locks in IRQ context and with del_timer_sync(). Might as well write it
up and place it in the comments above the code in del_timer_sync().
Perhaps the next time this lockdep dump triggers people will understand
the issues.

It is a ticky issue and very subtle, explaining it in detail in the code
may help others understand the issue when they stumble upon the bug
again.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1297186794.23343.19.camel@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/timer.c b/kernel/timer.c
index d645992..5f40c2e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -964,6 +964,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  * add_timer_on(). Upon exit the timer is not queued and the handler is
  * not running on any CPU.
  *
+ * Note: You must not hold locks that are held in interrupt context
+ *   while calling this function. Even if the lock has nothing to do
+ *   with the timer in question.  Here's why:
+ *
+ *    CPU0                             CPU1
+ *    ----                             ----
+ *                                   <SOFTIRQ>
+ *                                   call_timer_fn();
+ *                                     base->running_timer = mytimer;
+ *  spin_lock_irq(somelock);
+ *                                     <IRQ>
+ *                                        spin_lock(somelock);
+ *  del_timer_sync(mytimer);
+ *   while (base->running_timer == mytimer);
+ *
+ * Now del_timer_sync() will never return and never release somelock.
+ * The interrupt on the other CPU is waiting to grab somelock but
+ * it has interrupted the softirq that CPU0 is waiting to finish.
+ *
  * The function returns whether it has deactivated a pending timer or not.
  */
 int del_timer_sync(struct timer_list *timer)
@@ -971,6 +990,10 @@ int del_timer_sync(struct timer_list *timer)
 #ifdef CONFIG_LOCKDEP
 	unsigned long flags;
 
+	/*
+	 * If lockdep gives a backtrace here, please reference
+	 * the synchronization rules above.
+	 */
 	local_irq_save(flags);
 	lock_map_acquire(&timer->lockdep_map);
 	lock_map_release(&timer->lockdep_map);
-- 
cgit v0.10.2


From ef396ec96c1a8ffd2b0bc67f1f79c7274de02b95 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:07 +0100
Subject: x86-64, NUMA: Factor out memblk handling into
 numa_{add|register}_memblk()

Factor out memblk handling from srat_64.c into two functions in
numa_64.c.  This patch doesn't introduce any behavior change.  The
next patch will make all init methods use these functions.

- v2: Fixed build failure on 32bit due to misplaced NR_NODE_MEMBLKS.
      Reported by Ingo.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 446a5b9..12bd1fd 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -187,7 +187,6 @@ struct bootnode;
 extern int acpi_numa;
 extern int x86_acpi_numa_init(void);
 extern int acpi_scan_nodes(void);
-#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 
 #ifdef CONFIG_NUMA_EMU
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 26fc6e2..3d4dab4 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -5,6 +5,9 @@
 #include <asm/apicdef.h>
 
 #ifdef CONFIG_NUMA
+
+#define NR_NODE_MEMBLKS		(MAX_NUMNODES*2)
+
 /*
  * __apicid_to_node[] stores the raw mapping between physical apicid and
  * node and is used to initialize cpu_to_node mapping.
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index d3a4514..3306a2b 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -32,6 +32,8 @@ extern nodemask_t mem_nodes_parsed __initdata;
 extern struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
 
 extern int __cpuinit numa_cpu_node(int cpu);
+extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern int __init numa_register_memblks(void);
 
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 82ee308..a1d702d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -33,6 +33,10 @@ struct memnode memnode;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
+static int num_node_memblks __initdata;
+static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
+static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
+
 struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
 
 /*
@@ -184,6 +188,43 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 	return NULL;
 }
 
+static __init int conflicting_memblks(unsigned long start, unsigned long end)
+{
+	int i;
+	for (i = 0; i < num_node_memblks; i++) {
+		struct bootnode *nd = &node_memblk_range[i];
+		if (nd->start == nd->end)
+			continue;
+		if (nd->end > start && nd->start < end)
+			return memblk_nodeid[i];
+		if (nd->end == end && nd->start == start)
+			return memblk_nodeid[i];
+	}
+	return -1;
+}
+
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+	int i;
+
+	i = conflicting_memblks(start, end);
+	if (i == nid) {
+		printk(KERN_WARNING "NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
+		       nid, start, end, numa_nodes[i].start, numa_nodes[i].end);
+	} else if (i >= 0) {
+		printk(KERN_ERR "NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
+		       nid, start, end, i,
+		       numa_nodes[i].start, numa_nodes[i].end);
+		return -EINVAL;
+	}
+
+	node_memblk_range[num_node_memblks].start = start;
+	node_memblk_range[num_node_memblks].end = end;
+	memblk_nodeid[num_node_memblks] = nid;
+	num_node_memblks++;
+	return 0;
+}
+
 static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 {
 	struct bootnode *nd = &numa_nodes[i];
@@ -246,6 +287,71 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	node_set_online(nodeid);
 }
 
+int __init numa_register_memblks(void)
+{
+	int i;
+
+	/*
+	 * Join together blocks on the same node, holes between
+	 * which don't overlap with memory on other nodes.
+	 */
+	for (i = 0; i < num_node_memblks; ++i) {
+		int j, k;
+
+		for (j = i + 1; j < num_node_memblks; ++j) {
+			unsigned long start, end;
+
+			if (memblk_nodeid[i] != memblk_nodeid[j])
+				continue;
+			start = min(node_memblk_range[i].end,
+			            node_memblk_range[j].end);
+			end = max(node_memblk_range[i].start,
+			          node_memblk_range[j].start);
+			for (k = 0; k < num_node_memblks; ++k) {
+				if (memblk_nodeid[i] == memblk_nodeid[k])
+					continue;
+				if (start < node_memblk_range[k].end &&
+				    end > node_memblk_range[k].start)
+					break;
+			}
+			if (k < num_node_memblks)
+				continue;
+			start = min(node_memblk_range[i].start,
+			            node_memblk_range[j].start);
+			end = max(node_memblk_range[i].end,
+			          node_memblk_range[j].end);
+			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
+			       memblk_nodeid[i],
+			       node_memblk_range[i].start,
+			       node_memblk_range[i].end,
+			       node_memblk_range[j].start,
+			       node_memblk_range[j].end,
+			       start, end);
+			node_memblk_range[i].start = start;
+			node_memblk_range[i].end = end;
+			k = --num_node_memblks - j;
+			memmove(memblk_nodeid + j, memblk_nodeid + j+1,
+				k * sizeof(*memblk_nodeid));
+			memmove(node_memblk_range + j, node_memblk_range + j+1,
+				k * sizeof(*node_memblk_range));
+			--j;
+		}
+	}
+
+	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
+					   memblk_nodeid);
+	if (memnode_shift < 0) {
+		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < num_node_memblks; i++)
+		memblock_x86_register_active_regions(memblk_nodeid[i],
+				node_memblk_range[i].start >> PAGE_SHIFT,
+				node_memblk_range[i].end >> PAGE_SHIFT);
+	return 0;
+}
+
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
@@ -653,6 +759,9 @@ void __init initmem_init(void)
 		nodes_clear(mem_nodes_parsed);
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
+		num_node_memblks = 0;
+		memset(node_memblk_range, 0, sizeof(node_memblk_range));
+		memset(memblk_nodeid, 0, sizeof(memblk_nodeid));
 		memset(numa_nodes, 0, sizeof(numa_nodes));
 
 		if (numa_init[i]() < 0)
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 82b1087..341b371 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -30,30 +30,11 @@ static struct acpi_table_slit *acpi_slit;
 
 static struct bootnode nodes_add[MAX_NUMNODES];
 
-static int num_node_memblks __initdata;
-static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
-static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
-
 static __init int setup_node(int pxm)
 {
 	return acpi_map_pxm_to_node(pxm);
 }
 
-static __init int conflicting_memblks(unsigned long start, unsigned long end)
-{
-	int i;
-	for (i = 0; i < num_node_memblks; i++) {
-		struct bootnode *nd = &node_memblk_range[i];
-		if (nd->start == nd->end)
-			continue;
-		if (nd->end > start && nd->start < end)
-			return memblk_nodeid[i];
-		if (nd->end == end && nd->start == start)
-			return memblk_nodeid[i];
-	}
-	return -1;
-}
-
 static __init void bad_srat(void)
 {
 	int i;
@@ -233,7 +214,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	struct bootnode *nd;
 	unsigned long start, end;
 	int node, pxm;
-	int i;
 
 	if (srat_disabled())
 		return;
@@ -255,16 +235,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		bad_srat();
 		return;
 	}
-	i = conflicting_memblks(start, end);
-	if (i == node) {
-		printk(KERN_WARNING
-		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
-		       pxm, start, end, numa_nodes[i].start, numa_nodes[i].end);
-	} else if (i >= 0) {
-		printk(KERN_ERR
-		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
-		       pxm, start, end, node_to_pxm(i),
-		       numa_nodes[i].start, numa_nodes[i].end);
+
+	if (numa_add_memblk(node, start, end) < 0) {
 		bad_srat();
 		return;
 	}
@@ -285,11 +257,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		}
 	} else
 		update_nodes_add(node, start, end);
-
-	node_memblk_range[num_node_memblks].start = start;
-	node_memblk_range[num_node_memblks].end = end;
-	memblk_nodeid[num_node_memblks] = node;
-	num_node_memblks++;
 }
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -341,68 +308,11 @@ int __init acpi_scan_nodes(void)
 	if (acpi_numa <= 0)
 		return -1;
 
-	/*
-	 * Join together blocks on the same node, holes between
-	 * which don't overlap with memory on other nodes.
-	 */
-	for (i = 0; i < num_node_memblks; ++i) {
-		int j, k;
-
-		for (j = i + 1; j < num_node_memblks; ++j) {
-			unsigned long start, end;
-
-			if (memblk_nodeid[i] != memblk_nodeid[j])
-				continue;
-			start = min(node_memblk_range[i].end,
-			            node_memblk_range[j].end);
-			end = max(node_memblk_range[i].start,
-			          node_memblk_range[j].start);
-			for (k = 0; k < num_node_memblks; ++k) {
-				if (memblk_nodeid[i] == memblk_nodeid[k])
-					continue;
-				if (start < node_memblk_range[k].end &&
-				    end > node_memblk_range[k].start)
-					break;
-			}
-			if (k < num_node_memblks)
-				continue;
-			start = min(node_memblk_range[i].start,
-			            node_memblk_range[j].start);
-			end = max(node_memblk_range[i].end,
-			          node_memblk_range[j].end);
-			printk(KERN_INFO "SRAT: Node %d "
-			       "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
-			       memblk_nodeid[i],
-			       node_memblk_range[i].start,
-			       node_memblk_range[i].end,
-			       node_memblk_range[j].start,
-			       node_memblk_range[j].end,
-			       start, end);
-			node_memblk_range[i].start = start;
-			node_memblk_range[i].end = end;
-			k = --num_node_memblks - j;
-			memmove(memblk_nodeid + j, memblk_nodeid + j+1,
-				k * sizeof(*memblk_nodeid));
-			memmove(node_memblk_range + j, node_memblk_range + j+1,
-				k * sizeof(*node_memblk_range));
-			--j;
-		}
-	}
-
-	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
-					   memblk_nodeid);
-	if (memnode_shift < 0) {
-		printk(KERN_ERR
-		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
+	if (numa_register_memblks() < 0) {
 		bad_srat();
 		return -1;
 	}
 
-	for (i = 0; i < num_node_memblks; i++)
-		memblock_x86_register_active_regions(memblk_nodeid[i],
-				node_memblk_range[i].start >> PAGE_SHIFT,
-				node_memblk_range[i].end >> PAGE_SHIFT);
-
 	/* for out of order entries in SRAT */
 	sort_node_map();
 	if (!nodes_cover_memory(numa_nodes)) {
-- 
cgit v0.10.2


From 43a662f04f731c331706456c9852ef7146ba5d85 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:08 +0100
Subject: x86-64, NUMA: Unify use of memblk in all init methods

Make both amd and dummy use numa_add_memblk() to describe the detected
memory blocks.  This allows initmem_init() to call
numa_register_memblk() regardless of init method in use.  Drop custom
memory registration codes from amd and dummy.

After this change, memblk merge/cleanup in numa_register_memblks() is
applied to all init methods.

As this makes compute_hash_shift() and numa_register_memblks() used
only inside numa_64.c, make them static.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 3306a2b..e925605 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -8,9 +8,6 @@ struct bootnode {
 	u64 end;
 };
 
-extern int compute_hash_shift(struct bootnode *nodes, int numblks,
-			      int *nodeids);
-
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
 extern int numa_off;
@@ -33,7 +30,6 @@ extern struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
 
 extern int __cpuinit numa_cpu_node(int cpu);
 extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
-extern int __init numa_register_memblks(void);
 
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index cf29527..d6d7aa4 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -167,6 +167,7 @@ int __init amd_numa_init(void)
 
 		numa_nodes[nodeid].start = base;
 		numa_nodes[nodeid].end = limit;
+		numa_add_memblk(nodeid, base, limit);
 
 		prevbase = base;
 
@@ -263,18 +264,6 @@ int __init amd_scan_nodes(void)
 {
 	int i;
 
-	memnode_shift = compute_hash_shift(numa_nodes, 8, NULL);
-	if (memnode_shift < 0) {
-		pr_err("No NUMA node hash function found. Contact maintainer\n");
-		return -1;
-	}
-	pr_info("Using node hash shift of %d\n", memnode_shift);
-
-	/* use the coreid bits from early_identify_cpu */
-	for_each_node_mask(i, node_possible_map)
-		memblock_x86_register_active_regions(i,
-				numa_nodes[i].start >> PAGE_SHIFT,
-				numa_nodes[i].end >> PAGE_SHIFT);
 	init_memory_mapping_high();
 	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a1d702d..552080e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -131,8 +131,8 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
 	return i;
 }
 
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
-			      int *nodeids)
+static int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
+				     int *nodeids)
 {
 	int shift;
 
@@ -287,7 +287,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	node_set_online(nodeid);
 }
 
-int __init numa_register_memblks(void)
+static int __init numa_register_memblks(void)
 {
 	int i;
 
@@ -713,17 +713,13 @@ static int dummy_numa_init(void)
 
 	node_set(0, cpu_nodes_parsed);
 	node_set(0, mem_nodes_parsed);
+	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
 
 	return 0;
 }
 
 static int dummy_scan_nodes(void)
 {
-	/* setup dummy node covering all memory */
-	memnode_shift = 63;
-	memnodemap = memnode.embedded_map;
-	memnodemap[0] = 0;
-	memblock_x86_register_active_regions(0, 0, max_pfn);
 	init_memory_mapping_high();
 	setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT);
 	numa_init_array();
@@ -784,6 +780,9 @@ void __init initmem_init(void)
 		if (WARN_ON(nodes_empty(node_possible_map)))
 			continue;
 
+		if (numa_register_memblks() < 0)
+			continue;
+
 		if (!scan_nodes[i]())
 			return;
 	}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 341b371..69f1471 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -308,11 +308,6 @@ int __init acpi_scan_nodes(void)
 	if (acpi_numa <= 0)
 		return -1;
 
-	if (numa_register_memblks() < 0) {
-		bad_srat();
-		return -1;
-	}
-
 	/* for out of order entries in SRAT */
 	sort_node_map();
 	if (!nodes_cover_memory(numa_nodes)) {
-- 
cgit v0.10.2


From fd0435d8fb1d4e5771f9ae3af71f2a77c1f4bd09 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:08 +0100
Subject: x86-64, NUMA: Unify the rest of memblk registration

Move the remaining memblk registration logic from acpi_scan_nodes() to
numa_register_memblks() and initmem_init().

This applies nodes_cover_memory() sanity check, memory node sorting
and node_online() checking, which were only applied to acpi, to all
init methods.

As all memblk registration is moved to common code, active range
clearing is moved to initmem_init() too and removed from bad_srat().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index d6d7aa4..9c9f46a 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -262,12 +262,5 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
 
 int __init amd_scan_nodes(void)
 {
-	int i;
-
-	init_memory_mapping_high();
-	for_each_node_mask(i, node_possible_map)
-		setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end);
-
-	numa_init_array();
 	return 0;
 }
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 552080e..748c6b5 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -287,6 +287,37 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	node_set_online(nodeid);
 }
 
+/*
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common).  Make sure the nodes cover all memory.
+ */
+static int __init nodes_cover_memory(const struct bootnode *nodes)
+{
+	unsigned long numaram, e820ram;
+	int i;
+
+	numaram = 0;
+	for_each_node_mask(i, mem_nodes_parsed) {
+		unsigned long s = nodes[i].start >> PAGE_SHIFT;
+		unsigned long e = nodes[i].end >> PAGE_SHIFT;
+		numaram += e - s;
+		numaram -= __absent_pages_in_range(i, s, e);
+		if ((long)numaram < 0)
+			numaram = 0;
+	}
+
+	e820ram = max_pfn -
+		(memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT) >> PAGE_SHIFT);
+	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+	if ((long)(e820ram - numaram) >= (1<<(20 - PAGE_SHIFT))) {
+		printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
+			(numaram << PAGE_SHIFT) >> 20,
+			(e820ram << PAGE_SHIFT) >> 20);
+		return 0;
+	}
+	return 1;
+}
+
 static int __init numa_register_memblks(void)
 {
 	int i;
@@ -349,6 +380,27 @@ static int __init numa_register_memblks(void)
 		memblock_x86_register_active_regions(memblk_nodeid[i],
 				node_memblk_range[i].start >> PAGE_SHIFT,
 				node_memblk_range[i].end >> PAGE_SHIFT);
+
+	/* for out of order entries */
+	sort_node_map();
+	if (!nodes_cover_memory(numa_nodes))
+		return -EINVAL;
+
+	init_memory_mapping_high();
+
+	/* Finally register nodes. */
+	for_each_node_mask(i, node_possible_map)
+		setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end);
+
+	/*
+	 * Try again in case setup_node_bootmem missed one due to missing
+	 * bootmem.
+	 */
+	for_each_node_mask(i, node_possible_map)
+		if (!node_online(i))
+			setup_node_bootmem(i, numa_nodes[i].start,
+					   numa_nodes[i].end);
+
 	return 0;
 }
 
@@ -714,16 +766,14 @@ static int dummy_numa_init(void)
 	node_set(0, cpu_nodes_parsed);
 	node_set(0, mem_nodes_parsed);
 	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
+	numa_nodes[0].start = 0;
+	numa_nodes[0].end = (u64)max_pfn << PAGE_SHIFT;
 
 	return 0;
 }
 
 static int dummy_scan_nodes(void)
 {
-	init_memory_mapping_high();
-	setup_node_bootmem(0, 0, max_pfn << PAGE_SHIFT);
-	numa_init_array();
-
 	return 0;
 }
 
@@ -759,6 +809,7 @@ void __init initmem_init(void)
 		memset(node_memblk_range, 0, sizeof(node_memblk_range));
 		memset(memblk_nodeid, 0, sizeof(memblk_nodeid));
 		memset(numa_nodes, 0, sizeof(numa_nodes));
+		remove_all_active_ranges();
 
 		if (numa_init[i]() < 0)
 			continue;
@@ -783,8 +834,19 @@ void __init initmem_init(void)
 		if (numa_register_memblks() < 0)
 			continue;
 
-		if (!scan_nodes[i]())
-			return;
+		if (scan_nodes[i]() < 0)
+			continue;
+
+		for (j = 0; j < nr_cpu_ids; j++) {
+			int nid = early_cpu_to_node(j);
+
+			if (nid == NUMA_NO_NODE)
+				continue;
+			if (!node_online(nid))
+				numa_clear_node(j);
+		}
+		numa_init_array();
+		return;
 	}
 	BUG();
 }
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 69f1471..4a2c33b 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -44,7 +44,6 @@ static __init void bad_srat(void)
 		numa_nodes[i].start = numa_nodes[i].end = 0;
 		nodes_add[i].start = nodes_add[i].end = 0;
 	}
-	remove_all_active_ranges();
 }
 
 static __init inline int srat_disabled(void)
@@ -259,35 +258,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		update_nodes_add(node, start, end);
 }
 
-/* Sanity check to catch more bad SRATs (they are amazingly common).
-   Make sure the PXMs cover all memory. */
-static int __init nodes_cover_memory(const struct bootnode *nodes)
-{
-	int i;
-	unsigned long pxmram, e820ram;
-
-	pxmram = 0;
-	for_each_node_mask(i, mem_nodes_parsed) {
-		unsigned long s = nodes[i].start >> PAGE_SHIFT;
-		unsigned long e = nodes[i].end >> PAGE_SHIFT;
-		pxmram += e - s;
-		pxmram -= __absent_pages_in_range(i, s, e);
-		if ((long)pxmram < 0)
-			pxmram = 0;
-	}
-
-	e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
-	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
-	if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
-		printk(KERN_ERR
-	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
-			(pxmram << PAGE_SHIFT) >> 20,
-			(e820ram << PAGE_SHIFT) >> 20);
-		return 0;
-	}
-	return 1;
-}
-
 void __init acpi_numa_arch_fixup(void) {}
 
 int __init x86_acpi_numa_init(void)
@@ -303,39 +273,8 @@ int __init x86_acpi_numa_init(void)
 /* Use the information discovered above to actually set up the nodes. */
 int __init acpi_scan_nodes(void)
 {
-	int i;
-
 	if (acpi_numa <= 0)
 		return -1;
-
-	/* for out of order entries in SRAT */
-	sort_node_map();
-	if (!nodes_cover_memory(numa_nodes)) {
-		bad_srat();
-		return -1;
-	}
-
-	init_memory_mapping_high();
-
-	/* Finally register nodes */
-	for_each_node_mask(i, node_possible_map)
-		setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end);
-	/* Try again in case setup_node_bootmem missed one due
-	   to missing bootmem */
-	for_each_node_mask(i, node_possible_map)
-		if (!node_online(i))
-			setup_node_bootmem(i, numa_nodes[i].start,
-					   numa_nodes[i].end);
-
-	for (i = 0; i < nr_cpu_ids; i++) {
-		int node = early_cpu_to_node(i);
-
-		if (node == NUMA_NO_NODE)
-			continue;
-		if (!node_online(node))
-			numa_clear_node(i);
-	}
-	numa_init_array();
 	return 0;
 }
 
-- 
cgit v0.10.2


From 5d371b08fea80c4fb7450d31e5a4e35b438ef850 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:08 +0100
Subject: x86-64, NUMA: Kill {acpi|amd|dummy}_scan_nodes()

They are empty now.  Kill them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 12bd1fd..cfa3d5c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -186,7 +186,6 @@ struct bootnode;
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
 extern int x86_acpi_numa_init(void);
-extern int acpi_scan_nodes(void);
 
 #ifdef CONFIG_NUMA_EMU
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 246cdc6..384d118 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -17,7 +17,6 @@ extern int early_is_amd_nb(u32 value);
 extern int amd_cache_northbridges(void);
 extern void amd_flush_garts(void);
 extern int amd_numa_init(void);
-extern int amd_scan_nodes(void);
 extern int amd_get_subcaches(int);
 extern int amd_set_subcaches(int, int);
 
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 9c9f46a..90cf297 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -259,8 +259,3 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
 	memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node));
 }
 #endif /* CONFIG_NUMA_EMU */
-
-int __init amd_scan_nodes(void)
-{
-	return 0;
-}
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 748c6b5..e211c00 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -772,25 +772,17 @@ static int dummy_numa_init(void)
 	return 0;
 }
 
-static int dummy_scan_nodes(void)
-{
-	return 0;
-}
-
 void __init initmem_init(void)
 {
 	int (*numa_init[])(void) = { [2] = dummy_numa_init };
-	int (*scan_nodes[])(void) = { [2] = dummy_scan_nodes };
 	int i, j;
 
 	if (!numa_off) {
 #ifdef CONFIG_ACPI_NUMA
 		numa_init[0] = x86_acpi_numa_init;
-		scan_nodes[0] = acpi_scan_nodes;
 #endif
 #ifdef CONFIG_AMD_NUMA
 		numa_init[1] = amd_numa_init;
-		scan_nodes[1] = amd_scan_nodes;
 #endif
 	}
 
@@ -834,9 +826,6 @@ void __init initmem_init(void)
 		if (numa_register_memblks() < 0)
 			continue;
 
-		if (scan_nodes[i]() < 0)
-			continue;
-
 		for (j = 0; j < nr_cpu_ids; j++) {
 			int nid = early_cpu_to_node(j);
 
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 4a2c33b..d56eff8 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -270,14 +270,6 @@ int __init x86_acpi_numa_init(void)
 	return srat_disabled() ? -EINVAL : 0;
 }
 
-/* Use the information discovered above to actually set up the nodes. */
-int __init acpi_scan_nodes(void)
-{
-	if (acpi_numa <= 0)
-		return -1;
-	return 0;
-}
-
 #ifdef CONFIG_NUMA_EMU
 static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
 	[0 ... MAX_NUMNODES-1] = PXM_INVAL
-- 
cgit v0.10.2


From 8968dab8ad90ea16ef92f2406868354ea3ab6bb9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:08 +0100
Subject: x86-64, NUMA: Remove %NULL @nodeids handling from
 compute_hash_shift()

numa_emulation() called compute_hash_shift() with %NULL @nodeids which
meant identity mapping between index and nodeid.  Make
numa_emulation() build identity array and drop %NULL @nodeids handling
from populate_memnodemap() and thus from compute_hash_shift().  This
is to prepare for transition to using memblks instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index e211c00..243d18d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -63,12 +63,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
 		do {
 			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
 				return -1;
-
-			if (!nodeids)
-				memnodemap[addr >> shift] = i;
-			else
-				memnodemap[addr >> shift] = nodeids[i];
-
+			memnodemap[addr >> shift] = nodeids[i];
 			addr += (1UL << shift);
 		} while (addr < end);
 		res = 1;
@@ -706,6 +701,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 static int __init numa_emulation(unsigned long start_pfn,
 			unsigned long last_pfn, int acpi, int amd)
 {
+	static int nodeid[NR_NODE_MEMBLKS] __initdata;
 	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
 	int num_nodes;
@@ -730,7 +726,11 @@ static int __init numa_emulation(unsigned long start_pfn,
 
 	if (num_nodes < 0)
 		return num_nodes;
-	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
+
+	for (i = 0; i < ARRAY_SIZE(nodeid); i++)
+		nodeid[i] = i;
+
+	memnode_shift = compute_hash_shift(nodes, num_nodes, nodeid);
 	if (memnode_shift < 0) {
 		memnode_shift = 0;
 		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
-- 
cgit v0.10.2


From 97e7b78d0674882a0aae043fda428c583dbb225d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:08 +0100
Subject: x86-64, NUMA: Introduce struct numa_meminfo

Arrays for memblks and nodeids and their length lived in separate
variables making things unnecessarily cumbersome.  Introduce struct
numa_meminfo which contains all memory configuration info.  This patch
doesn't cause any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 243d18d..c3496e2 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -22,6 +22,17 @@
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
 
+struct numa_memblk {
+	u64			start;
+	u64			end;
+	int			nid;
+};
+
+struct numa_meminfo {
+	int			nr_blks;
+	struct numa_memblk	blk[NR_NODE_MEMBLKS];
+};
+
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
@@ -33,9 +44,7 @@ struct memnode memnode;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
-static int num_node_memblks __initdata;
-static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
-static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
+static struct numa_meminfo numa_meminfo __initdata;
 
 struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
 
@@ -46,16 +55,15 @@ struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
  * 0 if memnodmap[] too small (of shift too small)
  * -1 if node overlap or lost ram (shift too big)
  */
-static int __init populate_memnodemap(const struct bootnode *nodes,
-				      int numnodes, int shift, int *nodeids)
+static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
 {
 	unsigned long addr, end;
 	int i, res = -1;
 
 	memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
-	for (i = 0; i < numnodes; i++) {
-		addr = nodes[i].start;
-		end = nodes[i].end;
+	for (i = 0; i < mi->nr_blks; i++) {
+		addr = mi->blk[i].start;
+		end = mi->blk[i].end;
 		if (addr >= end)
 			continue;
 		if ((end >> shift) >= memnodemapsize)
@@ -63,7 +71,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
 		do {
 			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
 				return -1;
-			memnodemap[addr >> shift] = nodeids[i];
+			memnodemap[addr >> shift] = mi->blk[i].nid;
 			addr += (1UL << shift);
 		} while (addr < end);
 		res = 1;
@@ -101,16 +109,15 @@ static int __init allocate_cachealigned_memnodemap(void)
  * The LSB of all start and end addresses in the node map is the value of the
  * maximum possible shift.
  */
-static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
-					 int numnodes)
+static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
 {
 	int i, nodes_used = 0;
 	unsigned long start, end;
 	unsigned long bitfield = 0, memtop = 0;
 
-	for (i = 0; i < numnodes; i++) {
-		start = nodes[i].start;
-		end = nodes[i].end;
+	for (i = 0; i < mi->nr_blks; i++) {
+		start = mi->blk[i].start;
+		end = mi->blk[i].end;
 		if (start >= end)
 			continue;
 		bitfield |= start;
@@ -126,18 +133,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
 	return i;
 }
 
-static int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
-				     int *nodeids)
+static int __init compute_hash_shift(const struct numa_meminfo *mi)
 {
 	int shift;
 
-	shift = extract_lsb_from_nodes(nodes, numnodes);
+	shift = extract_lsb_from_nodes(mi);
 	if (allocate_cachealigned_memnodemap())
 		return -1;
 	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 		shift);
 
-	if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
+	if (populate_memnodemap(mi, shift) != 1) {
 		printk(KERN_INFO "Your memory is not aligned you need to "
 		       "rebuild your kernel with a bigger NODEMAPSIZE "
 		       "shift=%d\n", shift);
@@ -185,21 +191,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 
 static __init int conflicting_memblks(unsigned long start, unsigned long end)
 {
+	struct numa_meminfo *mi = &numa_meminfo;
 	int i;
-	for (i = 0; i < num_node_memblks; i++) {
-		struct bootnode *nd = &node_memblk_range[i];
-		if (nd->start == nd->end)
+
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *blk = &mi->blk[i];
+
+		if (blk->start == blk->end)
 			continue;
-		if (nd->end > start && nd->start < end)
-			return memblk_nodeid[i];
-		if (nd->end == end && nd->start == start)
-			return memblk_nodeid[i];
+		if (blk->end > start && blk->start < end)
+			return blk->nid;
+		if (blk->end == end && blk->start == start)
+			return blk->nid;
 	}
 	return -1;
 }
 
 int __init numa_add_memblk(int nid, u64 start, u64 end)
 {
+	struct numa_meminfo *mi = &numa_meminfo;
 	int i;
 
 	i = conflicting_memblks(start, end);
@@ -213,10 +223,10 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
 		return -EINVAL;
 	}
 
-	node_memblk_range[num_node_memblks].start = start;
-	node_memblk_range[num_node_memblks].end = end;
-	memblk_nodeid[num_node_memblks] = nid;
-	num_node_memblks++;
+	mi->blk[mi->nr_blks].start = start;
+	mi->blk[mi->nr_blks].end = end;
+	mi->blk[mi->nr_blks].nid = nid;
+	mi->nr_blks++;
 	return 0;
 }
 
@@ -315,66 +325,59 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 
 static int __init numa_register_memblks(void)
 {
+	struct numa_meminfo *mi = &numa_meminfo;
 	int i;
 
 	/*
 	 * Join together blocks on the same node, holes between
 	 * which don't overlap with memory on other nodes.
 	 */
-	for (i = 0; i < num_node_memblks; ++i) {
+	for (i = 0; i < mi->nr_blks; ++i) {
+		struct numa_memblk *bi = &mi->blk[i];
 		int j, k;
 
-		for (j = i + 1; j < num_node_memblks; ++j) {
+		for (j = i + 1; j < mi->nr_blks; ++j) {
+			struct numa_memblk *bj = &mi->blk[j];
 			unsigned long start, end;
 
-			if (memblk_nodeid[i] != memblk_nodeid[j])
+			if (bi->nid != bj->nid)
 				continue;
-			start = min(node_memblk_range[i].end,
-			            node_memblk_range[j].end);
-			end = max(node_memblk_range[i].start,
-			          node_memblk_range[j].start);
-			for (k = 0; k < num_node_memblks; ++k) {
-				if (memblk_nodeid[i] == memblk_nodeid[k])
+			start = min(bi->end, bj->end);
+			end = max(bi->start, bj->start);
+			for (k = 0; k < mi->nr_blks; ++k) {
+				struct numa_memblk *bk = &mi->blk[k];
+
+				if (bi->nid == bk->nid)
 					continue;
-				if (start < node_memblk_range[k].end &&
-				    end > node_memblk_range[k].start)
+				if (start < bk->end && end > bk->start)
 					break;
 			}
-			if (k < num_node_memblks)
+			if (k < mi->nr_blks)
 				continue;
-			start = min(node_memblk_range[i].start,
-			            node_memblk_range[j].start);
-			end = max(node_memblk_range[i].end,
-			          node_memblk_range[j].end);
+			start = min(bi->start, bj->start);
+			end = max(bi->end, bj->end);
 			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
-			       memblk_nodeid[i],
-			       node_memblk_range[i].start,
-			       node_memblk_range[i].end,
-			       node_memblk_range[j].start,
-			       node_memblk_range[j].end,
+			       bi->nid, bi->start, bi->end, bj->start, bj->end,
 			       start, end);
-			node_memblk_range[i].start = start;
-			node_memblk_range[i].end = end;
-			k = --num_node_memblks - j;
-			memmove(memblk_nodeid + j, memblk_nodeid + j+1,
-				k * sizeof(*memblk_nodeid));
-			memmove(node_memblk_range + j, node_memblk_range + j+1,
-				k * sizeof(*node_memblk_range));
+			bi->start = start;
+			bi->end = end;
+			k = --mi->nr_blks - j;
+			memmove(mi->blk + j, mi->blk + j + 1,
+				k * sizeof(mi->blk[0]));
 			--j;
 		}
 	}
 
-	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
-					   memblk_nodeid);
+	memnode_shift = compute_hash_shift(mi);
 	if (memnode_shift < 0) {
 		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
 		return -EINVAL;
 	}
 
-	for (i = 0; i < num_node_memblks; i++)
-		memblock_x86_register_active_regions(memblk_nodeid[i],
-				node_memblk_range[i].start >> PAGE_SHIFT,
-				node_memblk_range[i].end >> PAGE_SHIFT);
+	for (i = 0; i < mi->nr_blks; i++)
+		memblock_x86_register_active_regions(mi->blk[i].nid,
+					mi->blk[i].start >> PAGE_SHIFT,
+					mi->blk[i].end >> PAGE_SHIFT);
 
 	/* for out of order entries */
 	sort_node_map();
@@ -701,7 +704,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 static int __init numa_emulation(unsigned long start_pfn,
 			unsigned long last_pfn, int acpi, int amd)
 {
-	static int nodeid[NR_NODE_MEMBLKS] __initdata;
+	static struct numa_meminfo ei __initdata;
 	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
 	int num_nodes;
@@ -727,10 +730,14 @@ static int __init numa_emulation(unsigned long start_pfn,
 	if (num_nodes < 0)
 		return num_nodes;
 
-	for (i = 0; i < ARRAY_SIZE(nodeid); i++)
-		nodeid[i] = i;
+	ei.nr_blks = num_nodes;
+	for (i = 0; i < ei.nr_blks; i++) {
+		ei.blk[i].start = nodes[i].start;
+		ei.blk[i].end = nodes[i].end;
+		ei.blk[i].nid = i;
+	}
 
-	memnode_shift = compute_hash_shift(nodes, num_nodes, nodeid);
+	memnode_shift = compute_hash_shift(&ei);
 	if (memnode_shift < 0) {
 		memnode_shift = 0;
 		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
@@ -797,9 +804,7 @@ void __init initmem_init(void)
 		nodes_clear(mem_nodes_parsed);
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
-		num_node_memblks = 0;
-		memset(node_memblk_range, 0, sizeof(node_memblk_range));
-		memset(memblk_nodeid, 0, sizeof(memblk_nodeid));
+		memset(&numa_meminfo, 0, sizeof(numa_meminfo));
 		memset(numa_nodes, 0, sizeof(numa_nodes));
 		remove_all_active_ranges();
 
-- 
cgit v0.10.2


From f9c60251c3d08777db6758cafd959a55a838abd6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:09 +0100
Subject: x86-64, NUMA: Separate out numa_cleanup_meminfo()

Separate out numa_cleanup_meminfo() from numa_register_memblks().
node_possible_map initialization is moved to the top of the split
numa_register_memblks().

This patch doesn't cause behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c3496e2..f2721de 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -292,40 +292,8 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	node_set_online(nodeid);
 }
 
-/*
- * Sanity check to catch more bad NUMA configurations (they are amazingly
- * common).  Make sure the nodes cover all memory.
- */
-static int __init nodes_cover_memory(const struct bootnode *nodes)
+static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 {
-	unsigned long numaram, e820ram;
-	int i;
-
-	numaram = 0;
-	for_each_node_mask(i, mem_nodes_parsed) {
-		unsigned long s = nodes[i].start >> PAGE_SHIFT;
-		unsigned long e = nodes[i].end >> PAGE_SHIFT;
-		numaram += e - s;
-		numaram -= __absent_pages_in_range(i, s, e);
-		if ((long)numaram < 0)
-			numaram = 0;
-	}
-
-	e820ram = max_pfn -
-		(memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT) >> PAGE_SHIFT);
-	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
-	if ((long)(e820ram - numaram) >= (1<<(20 - PAGE_SHIFT))) {
-		printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
-			(numaram << PAGE_SHIFT) >> 20,
-			(e820ram << PAGE_SHIFT) >> 20);
-		return 0;
-	}
-	return 1;
-}
-
-static int __init numa_register_memblks(void)
-{
-	struct numa_meminfo *mi = &numa_meminfo;
 	int i;
 
 	/*
@@ -368,6 +336,49 @@ static int __init numa_register_memblks(void)
 		}
 	}
 
+	return 0;
+}
+
+/*
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common).  Make sure the nodes cover all memory.
+ */
+static int __init nodes_cover_memory(const struct bootnode *nodes)
+{
+	unsigned long numaram, e820ram;
+	int i;
+
+	numaram = 0;
+	for_each_node_mask(i, mem_nodes_parsed) {
+		unsigned long s = nodes[i].start >> PAGE_SHIFT;
+		unsigned long e = nodes[i].end >> PAGE_SHIFT;
+		numaram += e - s;
+		numaram -= __absent_pages_in_range(i, s, e);
+		if ((long)numaram < 0)
+			numaram = 0;
+	}
+
+	e820ram = max_pfn - (memblock_x86_hole_size(0,
+					max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
+	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+	if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
+		printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
+		       (numaram << PAGE_SHIFT) >> 20,
+		       (e820ram << PAGE_SHIFT) >> 20);
+		return 0;
+	}
+	return 1;
+}
+
+static int __init numa_register_memblks(struct numa_meminfo *mi)
+{
+	int i;
+
+	/* Account for nodes with cpus and no memory */
+	nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed);
+	if (WARN_ON(nodes_empty(node_possible_map)))
+		return -EINVAL;
+
 	memnode_shift = compute_hash_shift(mi);
 	if (memnode_shift < 0) {
 		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
@@ -823,12 +834,10 @@ void __init initmem_init(void)
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
 #endif
-		/* Account for nodes with cpus and no memory */
-		nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed);
-		if (WARN_ON(nodes_empty(node_possible_map)))
+		if (numa_cleanup_meminfo(&numa_meminfo) < 0)
 			continue;
 
-		if (numa_register_memblks() < 0)
+		if (numa_register_memblks(&numa_meminfo) < 0)
 			continue;
 
 		for (j = 0; j < nr_cpu_ids; j++) {
-- 
cgit v0.10.2


From 2e756be44714d0ec2f9827e4f4797c60876167a1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:09 +0100
Subject: x86-64, NUMA: make numa_cleanup_meminfo() prettier

* Factor out numa_remove_memblk_from().

* Hole detection doesn't need separate start/end.  Calculate start/end
  once.

* Relocate comment.

* Define iterators at the top and remove unnecessary prefix
  increments.

This prepares for further improvements to the function.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index f2721de..4fd3368 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -230,6 +230,13 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
 	return 0;
 }
 
+static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+	mi->nr_blks--;
+	memmove(&mi->blk[idx], &mi->blk[idx + 1],
+		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
 static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 {
 	struct bootnode *nd = &numa_nodes[i];
@@ -294,25 +301,25 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 
 static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 {
-	int i;
+	int i, j, k;
 
-	/*
-	 * Join together blocks on the same node, holes between
-	 * which don't overlap with memory on other nodes.
-	 */
-	for (i = 0; i < mi->nr_blks; ++i) {
+	for (i = 0; i < mi->nr_blks; i++) {
 		struct numa_memblk *bi = &mi->blk[i];
-		int j, k;
 
-		for (j = i + 1; j < mi->nr_blks; ++j) {
+		for (j = i + 1; j < mi->nr_blks; j++) {
 			struct numa_memblk *bj = &mi->blk[j];
 			unsigned long start, end;
 
+			/*
+			 * Join together blocks on the same node, holes
+			 * between which don't overlap with memory on other
+			 * nodes.
+			 */
 			if (bi->nid != bj->nid)
 				continue;
-			start = min(bi->end, bj->end);
-			end = max(bi->start, bj->start);
-			for (k = 0; k < mi->nr_blks; ++k) {
+			start = min(bi->start, bj->start);
+			end = max(bi->end, bj->end);
+			for (k = 0; k < mi->nr_blks; k++) {
 				struct numa_memblk *bk = &mi->blk[k];
 
 				if (bi->nid == bk->nid)
@@ -322,17 +329,12 @@ static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 			}
 			if (k < mi->nr_blks)
 				continue;
-			start = min(bi->start, bj->start);
-			end = max(bi->end, bj->end);
 			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
 			       bi->nid, bi->start, bi->end, bj->start, bj->end,
 			       start, end);
 			bi->start = start;
 			bi->end = end;
-			k = --mi->nr_blks - j;
-			memmove(mi->blk + j, mi->blk + j + 1,
-				k * sizeof(mi->blk[0]));
-			--j;
+			numa_remove_memblk_from(j--, mi);
 		}
 	}
 
-- 
cgit v0.10.2


From 56e827fbde9a3cb886a2fe138db0d99e98efbfb1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:09 +0100
Subject: x86-64, NUMA: consolidate and improve memblk sanity checks

memblk sanity check was scattered around and incomplete.  Consolidate
and improve.

* Confliction detection and cutoff_node() logic are moved to
  numa_cleanup_meminfo().

* numa_cleanup_meminfo() clears the unused memblks before returning.

* Check and warn about invalid input parameters in numa_add_memblk().

* Check the maximum number of memblk isn't exceeded in
  numa_add_memblk().

* numa_cleanup_meminfo() is now called before numa_emulation() so that
  the emulation code also uses the cleaned up version.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 4fd3368..20aa1d3 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -189,37 +189,23 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 	return NULL;
 }
 
-static __init int conflicting_memblks(unsigned long start, unsigned long end)
+int __init numa_add_memblk(int nid, u64 start, u64 end)
 {
 	struct numa_meminfo *mi = &numa_meminfo;
-	int i;
 
-	for (i = 0; i < mi->nr_blks; i++) {
-		struct numa_memblk *blk = &mi->blk[i];
+	/* ignore zero length blks */
+	if (start == end)
+		return 0;
 
-		if (blk->start == blk->end)
-			continue;
-		if (blk->end > start && blk->start < end)
-			return blk->nid;
-		if (blk->end == end && blk->start == start)
-			return blk->nid;
+	/* whine about and ignore invalid blks */
+	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
+			   nid, start, end);
+		return 0;
 	}
-	return -1;
-}
-
-int __init numa_add_memblk(int nid, u64 start, u64 end)
-{
-	struct numa_meminfo *mi = &numa_meminfo;
-	int i;
 
-	i = conflicting_memblks(start, end);
-	if (i == nid) {
-		printk(KERN_WARNING "NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
-		       nid, start, end, numa_nodes[i].start, numa_nodes[i].end);
-	} else if (i >= 0) {
-		printk(KERN_ERR "NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
-		       nid, start, end, i,
-		       numa_nodes[i].start, numa_nodes[i].end);
+	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("NUMA: too many memblk ranges\n");
 		return -EINVAL;
 	}
 
@@ -237,22 +223,6 @@ static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
 		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
 }
 
-static __init void cutoff_node(int i, unsigned long start, unsigned long end)
-{
-	struct bootnode *nd = &numa_nodes[i];
-
-	if (nd->start < start) {
-		nd->start = start;
-		if (nd->end < nd->start)
-			nd->start = nd->end;
-	}
-	if (nd->end > end) {
-		nd->end = end;
-		if (nd->start > nd->end)
-			nd->start = nd->end;
-	}
-}
-
 /* Initialize bootmem allocator for a node */
 void __init
 setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -301,24 +271,53 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 
 static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 {
+	const u64 low = 0;
+	const u64 high = (u64)max_pfn << PAGE_SHIFT;
 	int i, j, k;
 
 	for (i = 0; i < mi->nr_blks; i++) {
 		struct numa_memblk *bi = &mi->blk[i];
 
+		/* make sure all blocks are inside the limits */
+		bi->start = max(bi->start, low);
+		bi->end = min(bi->end, high);
+
+		/* and there's no empty block */
+		if (bi->start == bi->end) {
+			numa_remove_memblk_from(i--, mi);
+			continue;
+		}
+
 		for (j = i + 1; j < mi->nr_blks; j++) {
 			struct numa_memblk *bj = &mi->blk[j];
 			unsigned long start, end;
 
 			/*
+			 * See whether there are overlapping blocks.  Whine
+			 * about but allow overlaps of the same nid.  They
+			 * will be merged below.
+			 */
+			if (bi->end > bj->start && bi->start < bj->end) {
+				if (bi->nid != bj->nid) {
+					pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
+					       bi->nid, bi->start, bi->end,
+					       bj->nid, bj->start, bj->end);
+					return -EINVAL;
+				}
+				pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
+					   bi->nid, bi->start, bi->end,
+					   bj->start, bj->end);
+			}
+
+			/*
 			 * Join together blocks on the same node, holes
 			 * between which don't overlap with memory on other
 			 * nodes.
 			 */
 			if (bi->nid != bj->nid)
 				continue;
-			start = min(bi->start, bj->start);
-			end = max(bi->end, bj->end);
+			start = max(min(bi->start, bj->start), low);
+			end = min(max(bi->end, bj->end), high);
 			for (k = 0; k < mi->nr_blks; k++) {
 				struct numa_memblk *bk = &mi->blk[k];
 
@@ -338,6 +337,11 @@ static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 		}
 	}
 
+	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+		mi->blk[i].start = mi->blk[i].end = 0;
+		mi->blk[i].nid = NUMA_NO_NODE;
+	}
+
 	return 0;
 }
 
@@ -824,10 +828,8 @@ void __init initmem_init(void)
 		if (numa_init[i]() < 0)
 			continue;
 
-		/* clean up the node list */
-		for (j = 0; j < MAX_NUMNODES; j++)
-			cutoff_node(j, 0, max_pfn << PAGE_SHIFT);
-
+		if (numa_cleanup_meminfo(&numa_meminfo) < 0)
+			continue;
 #ifdef CONFIG_NUMA_EMU
 		setup_physnodes(0, max_pfn << PAGE_SHIFT);
 		if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1))
@@ -836,9 +838,6 @@ void __init initmem_init(void)
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
 #endif
-		if (numa_cleanup_meminfo(&numa_meminfo) < 0)
-			continue;
-
 		if (numa_register_memblks(&numa_meminfo) < 0)
 			continue;
 
-- 
cgit v0.10.2


From a844ef46fa3055165c28feede6114a711b8375ad Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:09 +0100
Subject: x86-64, NUMA: Add common find_node_by_addr()

srat_64.c and amdtopology_64.c had their own versions of
find_node_by_addr() which were basically the same.  Add common one in
numa_64.c and remove the duplicates.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index e925605..925ade9 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -35,6 +35,7 @@ extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
 #define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
 #define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
 void numa_emu_cmdline(char *);
+int __init find_node_by_addr(unsigned long addr);
 #endif /* CONFIG_NUMA_EMU */
 #else
 static inline int numa_cpu_node(int cpu)		{ return NUMA_NO_NODE; }
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 90cf297..8f7a5eb 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -205,19 +205,6 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
-static int __init find_node_by_addr(unsigned long addr)
-{
-	int ret = NUMA_NO_NODE;
-	int i;
-
-	for (i = 0; i < 8; i++)
-		if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) {
-			ret = i;
-			break;
-		}
-	return ret;
-}
-
 /*
  * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
  * setup to represent the physical topology but reflect the emulated
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 20aa1d3..681bc0d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -430,6 +430,25 @@ void __init numa_emu_cmdline(char *str)
 	cmdline = str;
 }
 
+int __init find_node_by_addr(unsigned long addr)
+{
+	int ret = NUMA_NO_NODE;
+	int i;
+
+	for_each_node_mask(i, mem_nodes_parsed) {
+		/*
+		 * Find the real node that this emulated node appears on.  For
+		 * the sake of simplicity, we only use a real node's starting
+		 * address to determine which emulated node it appears on.
+		 */
+		if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) {
+			ret = i;
+			break;
+		}
+	}
+	return ret;
+}
+
 static int __init setup_physnodes(unsigned long start, unsigned long end)
 {
 	int ret = 0;
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index d56eff8..51d0733 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -277,24 +277,6 @@ static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
 static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
-static int __init find_node_by_addr(unsigned long addr)
-{
-	int ret = NUMA_NO_NODE;
-	int i;
-
-	for_each_node_mask(i, mem_nodes_parsed) {
-		/*
-		 * Find the real node that this emulated node appears on.  For
-		 * the sake of simplicity, we only use a real node's starting
-		 * address to determine which emulated node it appears on.
-		 */
-		if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) {
-			ret = i;
-			break;
-		}
-	}
-	return ret;
-}
 
 /*
  * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
-- 
cgit v0.10.2


From 91556237ec872e1029e3036174bae3b1a8df65eb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:09 +0100
Subject: x86-64, NUMA: Kill numa_nodes[]

numa_nodes[] doesn't carry any information which isn't present in
numa_meminfo.  Each entry is simply min/max range of all the memblks
for the node.  This is not only redundant but also inaccurate when
memblks for different nodes interleave - for example,
find_node_by_addr() can return the wrong nodeid.

Kill numa_nodes[] and always use numa_meminfo instead.

* nodes_cover_memory() is renamed to numa_meminfo_cover_memory() and
  now operations on numa_meminfo and returns bool.

* setup_node_bootmem() needs min/max range.  Compute the range on the
  fly.  setup_node_bootmem() invocation is restructured to use outer
  loop instead of hardcoding the double invocations.

* find_node_by_addr() now operates on numa_meminfo.

* setup_physnodes() builds physnodes[] from memblks.  This will go
  away when emulation code is updated to use struct numa_meminfo.

This patch also makes the following misc changes.

* Clearing of nodes_add[] clearing is converted to memset().

* numa_add_memblk() in amd_numa_init() is moved down a bit for
  consistency.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 925ade9..20b69a9 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -26,7 +26,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 
 extern nodemask_t cpu_nodes_parsed __initdata;
 extern nodemask_t mem_nodes_parsed __initdata;
-extern struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
 
 extern int __cpuinit numa_cpu_node(int cpu);
 extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 8f7a5eb..0cb59e5 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -165,12 +165,8 @@ int __init amd_numa_init(void)
 		pr_info("Node %d MemBase %016lx Limit %016lx\n",
 			nodeid, base, limit);
 
-		numa_nodes[nodeid].start = base;
-		numa_nodes[nodeid].end = limit;
-		numa_add_memblk(nodeid, base, limit);
-
 		prevbase = base;
-
+		numa_add_memblk(nodeid, base, limit);
 		node_set(nodeid, mem_nodes_parsed);
 		node_set(nodeid, cpu_nodes_parsed);
 	}
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 681bc0d..c490448 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -46,8 +46,6 @@ static unsigned long __initdata nodemap_size;
 
 static struct numa_meminfo numa_meminfo __initdata;
 
-struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
-
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -349,17 +347,17 @@ static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
  * Sanity check to catch more bad NUMA configurations (they are amazingly
  * common).  Make sure the nodes cover all memory.
  */
-static int __init nodes_cover_memory(const struct bootnode *nodes)
+static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
 {
 	unsigned long numaram, e820ram;
 	int i;
 
 	numaram = 0;
-	for_each_node_mask(i, mem_nodes_parsed) {
-		unsigned long s = nodes[i].start >> PAGE_SHIFT;
-		unsigned long e = nodes[i].end >> PAGE_SHIFT;
+	for (i = 0; i < mi->nr_blks; i++) {
+		unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
+		unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
 		numaram += e - s;
-		numaram -= __absent_pages_in_range(i, s, e);
+		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
 		if ((long)numaram < 0)
 			numaram = 0;
 	}
@@ -371,14 +369,14 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 		printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
 		       (numaram << PAGE_SHIFT) >> 20,
 		       (e820ram << PAGE_SHIFT) >> 20);
-		return 0;
+		return false;
 	}
-	return 1;
+	return true;
 }
 
 static int __init numa_register_memblks(struct numa_meminfo *mi)
 {
-	int i;
+	int i, j, nid;
 
 	/* Account for nodes with cpus and no memory */
 	nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed);
@@ -398,23 +396,34 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 
 	/* for out of order entries */
 	sort_node_map();
-	if (!nodes_cover_memory(numa_nodes))
+	if (!numa_meminfo_cover_memory(mi))
 		return -EINVAL;
 
 	init_memory_mapping_high();
 
-	/* Finally register nodes. */
-	for_each_node_mask(i, node_possible_map)
-		setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end);
-
 	/*
-	 * Try again in case setup_node_bootmem missed one due to missing
-	 * bootmem.
+	 * Finally register nodes.  Do it twice in case setup_node_bootmem
+	 * missed one due to missing bootmem.
 	 */
-	for_each_node_mask(i, node_possible_map)
-		if (!node_online(i))
-			setup_node_bootmem(i, numa_nodes[i].start,
-					   numa_nodes[i].end);
+	for (i = 0; i < 2; i++) {
+		for_each_node_mask(nid, node_possible_map) {
+			u64 start = (u64)max_pfn << PAGE_SHIFT;
+			u64 end = 0;
+
+			if (node_online(nid))
+				continue;
+
+			for (j = 0; j < mi->nr_blks; j++) {
+				if (nid != mi->blk[j].nid)
+					continue;
+				start = min(mi->blk[j].start, start);
+				end = max(mi->blk[j].end, end);
+			}
+
+			if (start < end)
+				setup_node_bootmem(nid, start, end);
+		}
+	}
 
 	return 0;
 }
@@ -432,33 +441,41 @@ void __init numa_emu_cmdline(char *str)
 
 int __init find_node_by_addr(unsigned long addr)
 {
-	int ret = NUMA_NO_NODE;
+	const struct numa_meminfo *mi = &numa_meminfo;
 	int i;
 
-	for_each_node_mask(i, mem_nodes_parsed) {
+	for (i = 0; i < mi->nr_blks; i++) {
 		/*
 		 * Find the real node that this emulated node appears on.  For
 		 * the sake of simplicity, we only use a real node's starting
 		 * address to determine which emulated node it appears on.
 		 */
-		if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) {
-			ret = i;
-			break;
-		}
+		if (addr >= mi->blk[i].start && addr < mi->blk[i].end)
+			return mi->blk[i].nid;
 	}
-	return ret;
+	return NUMA_NO_NODE;
 }
 
 static int __init setup_physnodes(unsigned long start, unsigned long end)
 {
+	const struct numa_meminfo *mi = &numa_meminfo;
 	int ret = 0;
 	int i;
 
 	memset(physnodes, 0, sizeof(physnodes));
 
-	for_each_node_mask(i, mem_nodes_parsed) {
-		physnodes[i].start = numa_nodes[i].start;
-		physnodes[i].end = numa_nodes[i].end;
+	for (i = 0; i < mi->nr_blks; i++) {
+		int nid = mi->blk[i].nid;
+
+		if (physnodes[nid].start == physnodes[nid].end) {
+			physnodes[nid].start = mi->blk[i].start;
+			physnodes[nid].end = mi->blk[i].end;
+		} else {
+			physnodes[nid].start = min(physnodes[nid].start,
+						   mi->blk[i].start);
+			physnodes[nid].end = max(physnodes[nid].end,
+						 mi->blk[i].end);
+		}
 	}
 
 	/*
@@ -809,8 +826,6 @@ static int dummy_numa_init(void)
 	node_set(0, cpu_nodes_parsed);
 	node_set(0, mem_nodes_parsed);
 	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
-	numa_nodes[0].start = 0;
-	numa_nodes[0].end = (u64)max_pfn << PAGE_SHIFT;
 
 	return 0;
 }
@@ -841,7 +856,6 @@ void __init initmem_init(void)
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
 		memset(&numa_meminfo, 0, sizeof(numa_meminfo));
-		memset(numa_nodes, 0, sizeof(numa_nodes));
 		remove_all_active_ranges();
 
 		if (numa_init[i]() < 0)
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 51d0733..e8b3b3c 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -37,13 +37,9 @@ static __init int setup_node(int pxm)
 
 static __init void bad_srat(void)
 {
-	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		numa_nodes[i].start = numa_nodes[i].end = 0;
-		nodes_add[i].start = nodes_add[i].end = 0;
-	}
+	memset(nodes_add, 0, sizeof(nodes_add));
 }
 
 static __init inline int srat_disabled(void)
@@ -210,7 +206,6 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
 void __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
-	struct bootnode *nd;
 	unsigned long start, end;
 	int node, pxm;
 
@@ -243,18 +238,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
 	       start, end);
 
-	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
-		nd = &numa_nodes[node];
-		if (!node_test_and_set(node, mem_nodes_parsed)) {
-			nd->start = start;
-			nd->end = end;
-		} else {
-			if (start < nd->start)
-				nd->start = start;
-			if (nd->end < end)
-				nd->end = end;
-		}
-	} else
+	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE))
+		node_set(node, mem_nodes_parsed);
+	else
 		update_nodes_add(node, start, end);
 }
 
-- 
cgit v0.10.2


From 92d4a4371eeb89e1e12b9ebbed0956f499b6c2c0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:09 +0100
Subject: x86-64, NUMA: Rename cpu_nodes_parsed to numa_nodes_parsed

It's no longer necessary to keep both cpu_nodes_parsed and
mem_nodes_parsed.  In preparation for merge, rename cpu_nodes_parsed
to numa_nodes_parsed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 20b69a9..6e94469 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -24,7 +24,7 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
  */
 #define NODE_MIN_SIZE (4*1024*1024)
 
-extern nodemask_t cpu_nodes_parsed __initdata;
+extern nodemask_t numa_nodes_parsed __initdata;
 extern nodemask_t mem_nodes_parsed __initdata;
 
 extern int __cpuinit numa_cpu_node(int cpu);
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index 0cb59e5..e76bffa 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -168,7 +168,7 @@ int __init amd_numa_init(void)
 		prevbase = base;
 		numa_add_memblk(nodeid, base, limit);
 		node_set(nodeid, mem_nodes_parsed);
-		node_set(nodeid, cpu_nodes_parsed);
+		node_set(nodeid, numa_nodes_parsed);
 	}
 
 	if (!nodes_weight(mem_nodes_parsed))
@@ -189,7 +189,7 @@ int __init amd_numa_init(void)
 		apicid_base = boot_cpu_physical_apicid;
 	}
 
-	for_each_node_mask(i, cpu_nodes_parsed)
+	for_each_node_mask(i, numa_nodes_parsed)
 		for (j = apicid_base; j < cores + apicid_base; j++)
 			set_apicid_to_node((i << bits) + j, i);
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c490448..6e4fbd7 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -36,7 +36,7 @@ struct numa_meminfo {
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-nodemask_t cpu_nodes_parsed __initdata;
+nodemask_t numa_nodes_parsed __initdata;
 nodemask_t mem_nodes_parsed __initdata;
 
 struct memnode memnode;
@@ -379,7 +379,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 	int i, j, nid;
 
 	/* Account for nodes with cpus and no memory */
-	nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed);
+	nodes_or(node_possible_map, mem_nodes_parsed, numa_nodes_parsed);
 	if (WARN_ON(nodes_empty(node_possible_map)))
 		return -EINVAL;
 
@@ -823,7 +823,7 @@ static int dummy_numa_init(void)
 	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
 	       0LU, max_pfn << PAGE_SHIFT);
 
-	node_set(0, cpu_nodes_parsed);
+	node_set(0, numa_nodes_parsed);
 	node_set(0, mem_nodes_parsed);
 	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
 
@@ -851,7 +851,7 @@ void __init initmem_init(void)
 		for (j = 0; j < MAX_LOCAL_APIC; j++)
 			set_apicid_to_node(j, NUMA_NO_NODE);
 
-		nodes_clear(cpu_nodes_parsed);
+		nodes_clear(numa_nodes_parsed);
 		nodes_clear(mem_nodes_parsed);
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index e8b3b3c..8185189 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -94,7 +94,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 		return;
 	}
 	set_apicid_to_node(apic_id, node);
-	node_set(node, cpu_nodes_parsed);
+	node_set(node, numa_nodes_parsed);
 	acpi_numa = 1;
 	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
 	       pxm, apic_id, node);
@@ -134,7 +134,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	}
 
 	set_apicid_to_node(apic_id, node);
-	node_set(node, cpu_nodes_parsed);
+	node_set(node, numa_nodes_parsed);
 	acpi_numa = 1;
 	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
 	       pxm, apic_id, node);
@@ -196,7 +196,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
 	}
 
 	if (changed) {
-		node_set(node, cpu_nodes_parsed);
+		node_set(node, numa_nodes_parsed);
 		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
 				 nd->start, nd->end);
 	}
-- 
cgit v0.10.2


From 4697bdcc945c094d2c8a4876a24faeaf31a283e0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:09 +0100
Subject: x86-64, NUMA: Kill mem_nodes_parsed

With all memory configuration information now carried in numa_meminfo,
there's no need to keep mem_nodes_parsed separate.  Drop it and use
numa_nodes_parsed for CPU / memory-less nodes.

A new helper numa_nodemask_from_meminfo() is added to calculate
memnode mask on the fly which is currently used to set
node_possible_map.

This simplifies NUMA init methods a bit and removes a source of
possible inconsistencies.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 6e94469..f42710b 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -25,7 +25,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 #define NODE_MIN_SIZE (4*1024*1024)
 
 extern nodemask_t numa_nodes_parsed __initdata;
-extern nodemask_t mem_nodes_parsed __initdata;
 
 extern int __cpuinit numa_cpu_node(int cpu);
 extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index e76bffa..fd7b609 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -122,7 +122,7 @@ int __init amd_numa_init(void)
 			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
 			return -EINVAL;
 		}
-		if (node_isset(nodeid, mem_nodes_parsed)) {
+		if (node_isset(nodeid, numa_nodes_parsed)) {
 			pr_info("Node %d already present, skipping\n",
 				nodeid);
 			continue;
@@ -167,11 +167,10 @@ int __init amd_numa_init(void)
 
 		prevbase = base;
 		numa_add_memblk(nodeid, base, limit);
-		node_set(nodeid, mem_nodes_parsed);
 		node_set(nodeid, numa_nodes_parsed);
 	}
 
-	if (!nodes_weight(mem_nodes_parsed))
+	if (!nodes_weight(numa_nodes_parsed))
 		return -ENOENT;
 
 	/*
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 6e4fbd7..8b1f178 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -37,7 +37,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
 nodemask_t numa_nodes_parsed __initdata;
-nodemask_t mem_nodes_parsed __initdata;
 
 struct memnode memnode;
 
@@ -344,6 +343,20 @@ static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 }
 
 /*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+					      const struct numa_meminfo *mi)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+		if (mi->blk[i].start != mi->blk[i].end &&
+		    mi->blk[i].nid != NUMA_NO_NODE)
+			node_set(mi->blk[i].nid, *nodemask);
+}
+
+/*
  * Sanity check to catch more bad NUMA configurations (they are amazingly
  * common).  Make sure the nodes cover all memory.
  */
@@ -379,7 +392,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 	int i, j, nid;
 
 	/* Account for nodes with cpus and no memory */
-	nodes_or(node_possible_map, mem_nodes_parsed, numa_nodes_parsed);
+	node_possible_map = numa_nodes_parsed;
+	numa_nodemask_from_meminfo(&node_possible_map, mi);
 	if (WARN_ON(nodes_empty(node_possible_map)))
 		return -EINVAL;
 
@@ -824,7 +838,6 @@ static int dummy_numa_init(void)
 	       0LU, max_pfn << PAGE_SHIFT);
 
 	node_set(0, numa_nodes_parsed);
-	node_set(0, mem_nodes_parsed);
 	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
 
 	return 0;
@@ -852,7 +865,6 @@ void __init initmem_init(void)
 			set_apicid_to_node(j, NUMA_NO_NODE);
 
 		nodes_clear(numa_nodes_parsed);
-		nodes_clear(mem_nodes_parsed);
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
 		memset(&numa_meminfo, 0, sizeof(numa_meminfo));
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 8185189..4f8e6cd 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -238,9 +238,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
 	       start, end);
 
-	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE))
-		node_set(node, mem_nodes_parsed);
-	else
+	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
 		update_nodes_add(node, start, end);
 }
 
@@ -310,10 +308,9 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
 	memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node));
 
-	nodes_clear(mem_nodes_parsed);
 	for (i = 0; i < num_nodes; i++)
 		if (fake_nodes[i].start != fake_nodes[i].end)
-			node_set(i, mem_nodes_parsed);
+			node_set(i, numa_nodes_parsed);
 }
 
 static int null_slit_node_compare(int a, int b)
-- 
cgit v0.10.2


From ac7136b611ee8f8bd6231ce2e1dbdd31ae3d39bc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:09 +0100
Subject: x86-64, NUMA: Implement generic node distance handling

Node distance either used direct node comparison, ACPI PXM comparison
or ACPI SLIT table lookup.  This patch implements generic node
distance handling.  NUMA init methods can call numa_set_distance() to
set distance between nodes and the common __node_distance()
implementation will report the set distance.

Due to the way NUMA emulation is implemented, the generic node
distance handling is used only when emulation is not used.  Later
patches will update NUMA emulation to use the generic distance
mechanism.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index cfa3d5c..9c9fe1b 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -190,6 +190,7 @@ extern int x86_acpi_numa_init(void);
 #ifdef CONFIG_NUMA_EMU
 extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
 				   int num_nodes);
+extern int acpi_emu_node_distance(int a, int b);
 #endif
 #endif /* CONFIG_ACPI_NUMA */
 
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index f42710b..5361c59 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -28,6 +28,7 @@ extern nodemask_t numa_nodes_parsed __initdata;
 
 extern int __cpuinit numa_cpu_node(int cpu);
 extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern void __init numa_set_distance(int from, int to, int distance);
 
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index b101c17..910a708 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -138,7 +138,7 @@ extern unsigned long node_remap_size[];
 	.balance_interval	= 1,					\
 }
 
-#ifdef CONFIG_X86_64_ACPI_NUMA
+#ifdef CONFIG_X86_64
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 #endif
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 8b1f178..a3621f2 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -45,6 +45,13 @@ static unsigned long __initdata nodemap_size;
 
 static struct numa_meminfo numa_meminfo __initdata;
 
+static int numa_distance_cnt;
+static u8 *numa_distance;
+
+#ifdef CONFIG_NUMA_EMU
+static bool numa_emu_dist;
+#endif
+
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -357,6 +364,92 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
 }
 
 /*
+ * Reset distance table.  The current table is freed.  The next
+ * numa_set_distance() call will create a new one.
+ */
+static void __init numa_reset_distance(void)
+{
+	size_t size;
+
+	size = numa_distance_cnt * sizeof(numa_distance[0]);
+	memblock_x86_free_range(__pa(numa_distance),
+				__pa(numa_distance) + size);
+	numa_distance = NULL;
+	numa_distance_cnt = 0;
+}
+
+/*
+ * Set the distance between node @from to @to to @distance.  If distance
+ * table doesn't exist, one which is large enough to accomodate all the
+ * currently known nodes will be created.
+ */
+void __init numa_set_distance(int from, int to, int distance)
+{
+	if (!numa_distance) {
+		nodemask_t nodes_parsed;
+		size_t size;
+		int i, j, cnt = 0;
+		u64 phys;
+
+		/* size the new table and allocate it */
+		nodes_parsed = numa_nodes_parsed;
+		numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
+
+		for_each_node_mask(i, nodes_parsed)
+			cnt = i;
+		size = ++cnt * sizeof(numa_distance[0]);
+
+		phys = memblock_find_in_range(0,
+					      (u64)max_pfn_mapped << PAGE_SHIFT,
+					      size, PAGE_SIZE);
+		if (phys == MEMBLOCK_ERROR) {
+			pr_warning("NUMA: Warning: can't allocate distance table!\n");
+			/* don't retry until explicitly reset */
+			numa_distance = (void *)1LU;
+			return;
+		}
+		memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
+
+		numa_distance = __va(phys);
+		numa_distance_cnt = cnt;
+
+		/* fill with the default distances */
+		for (i = 0; i < cnt; i++)
+			for (j = 0; j < cnt; j++)
+				numa_distance[i * cnt + j] = i == j ?
+					LOCAL_DISTANCE : REMOTE_DISTANCE;
+		printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+	}
+
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
+		printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
+			    from, to, distance);
+		return;
+	}
+
+	if ((u8)distance != distance ||
+	    (from == to && distance != LOCAL_DISTANCE)) {
+		pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+			     from, to, distance);
+		return;
+	}
+
+	numa_distance[from * numa_distance_cnt + to] = distance;
+}
+
+int __node_distance(int from, int to)
+{
+#if defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA_EMU)
+	if (numa_emu_dist)
+		return acpi_emu_node_distance(from, to);
+#endif
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+	return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+/*
  * Sanity check to catch more bad NUMA configurations (they are amazingly
  * common).  Make sure the nodes cover all memory.
  */
@@ -826,6 +919,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 	setup_physnodes(addr, max_addr);
 	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
+	numa_emu_dist = true;
 	return 0;
 }
 #endif /* CONFIG_NUMA_EMU */
@@ -869,6 +963,7 @@ void __init initmem_init(void)
 		nodes_clear(node_online_map);
 		memset(&numa_meminfo, 0, sizeof(numa_meminfo));
 		remove_all_active_ranges();
+		numa_reset_distance();
 
 		if (numa_init[i]() < 0)
 			continue;
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 4f8e6cd..d2f53f3 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -50,9 +50,16 @@ static __init inline int srat_disabled(void)
 /* Callback for SLIT parsing */
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
+	int i, j;
 	unsigned length;
 	unsigned long phys;
 
+	for (i = 0; i < slit->locality_count; i++)
+		for (j = 0; j < slit->locality_count; j++)
+			numa_set_distance(pxm_to_node(i), pxm_to_node(j),
+				slit->entry[slit->locality_count * i + j]);
+
+	/* acpi_slit is used only by emulation */
 	length = slit->header.length;
 	phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
 		 PAGE_SIZE);
@@ -313,29 +320,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 			node_set(i, numa_nodes_parsed);
 }
 
-static int null_slit_node_compare(int a, int b)
-{
-	return node_to_pxm(a) == node_to_pxm(b);
-}
-#else
-static int null_slit_node_compare(int a, int b)
-{
-	return a == b;
-}
-#endif /* CONFIG_NUMA_EMU */
-
-int __node_distance(int a, int b)
+int acpi_emu_node_distance(int a, int b)
 {
 	int index;
 
 	if (!acpi_slit)
-		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
-						      REMOTE_DISTANCE;
+		return node_to_pxm(a) == node_to_pxm(b) ?
+			LOCAL_DISTANCE : REMOTE_DISTANCE;
 	index = acpi_slit->locality_count * node_to_pxm(a);
 	return acpi_slit->entry[index + node_to_pxm(b)];
 }
-
-EXPORT_SYMBOL(__node_distance);
+#endif /* CONFIG_NUMA_EMU */
 
 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
 int memory_add_physaddr_to_nid(u64 start)
-- 
cgit v0.10.2


From d9c515eacb3bde73f7a5ecb7e35ea6e660ad421d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:10 +0100
Subject: x86-64, NUMA: Trivial changes to prepare for emulation updates

* Separate out numa_add_memblk_to() from numa_add_memblk() so that
  different numa_meminfo can be used.

* Rename cmdline to emu_cmdline.

* Drop @start/last_pfn from numa_emulation() and use max_pfn directly.

This patch doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a3621f2..20e2cfe 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -193,10 +193,9 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 	return NULL;
 }
 
-int __init numa_add_memblk(int nid, u64 start, u64 end)
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+				     struct numa_meminfo *mi)
 {
-	struct numa_meminfo *mi = &numa_meminfo;
-
 	/* ignore zero length blks */
 	if (start == end)
 		return 0;
@@ -227,6 +226,11 @@ static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
 		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
 }
 
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
 /* Initialize bootmem allocator for a node */
 void __init
 setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -539,11 +543,11 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 /* Numa emulation */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
-static char *cmdline __initdata;
+static char *emu_cmdline __initdata;
 
 void __init numa_emu_cmdline(char *str)
 {
-	cmdline = str;
+	emu_cmdline = str;
 }
 
 int __init find_node_by_addr(unsigned long addr)
@@ -861,12 +865,10 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static int __init numa_emulation(unsigned long start_pfn,
-			unsigned long last_pfn, int acpi, int amd)
+static int __init numa_emulation(int acpi, int amd)
 {
 	static struct numa_meminfo ei __initdata;
-	u64 addr = start_pfn << PAGE_SHIFT;
-	u64 max_addr = last_pfn << PAGE_SHIFT;
+	const u64 max_addr = max_pfn << PAGE_SHIFT;
 	int num_nodes;
 	int i;
 
@@ -875,16 +877,16 @@ static int __init numa_emulation(unsigned long start_pfn,
 	 * the fixed node size.  Otherwise, if it is just a single number N,
 	 * split the system RAM into N fake nodes.
 	 */
-	if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
+	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
 		u64 size;
 
-		size = memparse(cmdline, &cmdline);
-		num_nodes = split_nodes_size_interleave(addr, max_addr, size);
+		size = memparse(emu_cmdline, &emu_cmdline);
+		num_nodes = split_nodes_size_interleave(0, max_addr, size);
 	} else {
 		unsigned long n;
 
-		n = simple_strtoul(cmdline, NULL, 0);
-		num_nodes = split_nodes_interleave(addr, max_addr, n);
+		n = simple_strtoul(emu_cmdline, NULL, 0);
+		num_nodes = split_nodes_interleave(0, max_addr, n);
 	}
 
 	if (num_nodes < 0)
@@ -916,7 +918,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 	init_memory_mapping_high();
 	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	setup_physnodes(addr, max_addr);
+	setup_physnodes(0, max_addr);
 	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
 	numa_emu_dist = true;
@@ -972,7 +974,7 @@ void __init initmem_init(void)
 			continue;
 #ifdef CONFIG_NUMA_EMU
 		setup_physnodes(0, max_pfn << PAGE_SHIFT);
-		if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1))
+		if (emu_cmdline && !numa_emulation(i == 0, i == 1))
 			return;
 		setup_physnodes(0, max_pfn << PAGE_SHIFT);
 		nodes_clear(node_possible_map);
-- 
cgit v0.10.2


From 9d073caeb372940af02a768d2b7e845ac732bda0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:10 +0100
Subject: x86-64, NUMA: Build and use direct emulated nid -> phys nid mapping

NUMA emulation copied physical NUMA configuration into physnodes[] and
used it to reverse-map emulated nodes to physical nodes, which is
unnecessarily convoluted.  Build emu_nid_to_phys[] array to map
emulated nids directly to the matching physical nids and use it in
numa_add_cpu().

physnodes[] will be removed with further patches.

- v2: Build failure when CONFIG_DEBUG_PER_CPU_MAPS due to missing
  local variable definition fixed.  Reported by Ingo.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 20e2cfe..e9919c4 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -542,7 +542,9 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
+
+static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
 static char *emu_cmdline __initdata;
 
 void __init numa_emu_cmdline(char *str)
@@ -649,7 +651,8 @@ static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
  * allocation past addr and -1 otherwise.  addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, int physnid,
+				   u64 *addr, u64 size, u64 max_addr)
 {
 	int ret = 0;
 	nodes[nid].start = *addr;
@@ -660,6 +663,10 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 	}
 	nodes[nid].end = *addr;
 	node_set(nid, node_possible_map);
+
+	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
+		emu_nid_to_phys[nid] = physnid;
+
 	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
 	       nodes[nid].start, nodes[nid].end,
 	       (nodes[nid].end - nodes[nid].start) >> 20);
@@ -756,7 +763,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
 			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
 				end = physnodes[i].end;
 
-			if (setup_node_range(ret++, &physnodes[i].start,
+			if (setup_node_range(ret++, i, &physnodes[i].start,
 						end - physnodes[i].start,
 						physnodes[i].end) < 0)
 				node_clear(i, physnode_mask);
@@ -852,7 +859,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 			 * later.  If setup_node_range() returns non-zero, there
 			 * is no more memory available on this physical node.
 			 */
-			if (setup_node_range(ret++, &physnodes[i].start,
+			if (setup_node_range(ret++, i, &physnodes[i].start,
 						end - physnodes[i].start,
 						physnodes[i].end) < 0)
 				node_clear(i, physnode_mask);
@@ -872,6 +879,9 @@ static int __init numa_emulation(int acpi, int amd)
 	int num_nodes;
 	int i;
 
+	for (i = 0; i < MAX_NUMNODES; i++)
+		emu_nid_to_phys[i] = NUMA_NO_NODE;
+
 	/*
 	 * If the numa=fake command-line contains a 'M' or 'G', it represents
 	 * the fixed node size.  Otherwise, if it is just a single number N,
@@ -892,6 +902,11 @@ static int __init numa_emulation(int acpi, int amd)
 	if (num_nodes < 0)
 		return num_nodes;
 
+	/* make sure all emulated nodes are mapped to a physical node */
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
+			emu_nid_to_phys[i] = 0;
+
 	ei.nr_blks = num_nodes;
 	for (i = 0; i < ei.nr_blks; i++) {
 		ei.blk[i].start = nodes[i].start;
@@ -918,7 +933,6 @@ static int __init numa_emulation(int acpi, int amd)
 	init_memory_mapping_high();
 	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	setup_physnodes(0, max_addr);
 	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
 	numa_emu_dist = true;
@@ -976,7 +990,11 @@ void __init initmem_init(void)
 		setup_physnodes(0, max_pfn << PAGE_SHIFT);
 		if (emu_cmdline && !numa_emulation(i == 0, i == 1))
 			return;
-		setup_physnodes(0, max_pfn << PAGE_SHIFT);
+
+		/* not emulating, build identity mapping for numa_add_cpu() */
+		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+			emu_nid_to_phys[j] = j;
+
 		nodes_clear(node_possible_map);
 		nodes_clear(node_online_map);
 #endif
@@ -1033,7 +1051,6 @@ int __cpuinit numa_cpu_node(int cpu)
 # ifndef CONFIG_DEBUG_PER_CPU_MAPS
 void __cpuinit numa_add_cpu(int cpu)
 {
-	unsigned long addr;
 	int physnid, nid;
 
 	nid = numa_cpu_node(cpu);
@@ -1041,26 +1058,15 @@ void __cpuinit numa_add_cpu(int cpu)
 		nid = early_cpu_to_node(cpu);
 	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
 
-	/*
-	 * Use the starting address of the emulated node to find which physical
-	 * node it is allocated on.
-	 */
-	addr = node_start_pfn(nid) << PAGE_SHIFT;
-	for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
-		if (addr >= physnodes[physnid].start &&
-		    addr < physnodes[physnid].end)
-			break;
+	physnid = emu_nid_to_phys[nid];
 
 	/*
 	 * Map the cpu to each emulated node that is allocated on the physical
 	 * node of the cpu's apic id.
 	 */
-	for_each_online_node(nid) {
-		addr = node_start_pfn(nid) << PAGE_SHIFT;
-		if (addr >= physnodes[physnid].start &&
-		    addr < physnodes[physnid].end)
+	for_each_online_node(nid)
+		if (emu_nid_to_phys[nid] == physnid)
 			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
-	}
 }
 
 void __cpuinit numa_remove_cpu(int cpu)
@@ -1073,21 +1079,21 @@ void __cpuinit numa_remove_cpu(int cpu)
 # else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
 static void __cpuinit numa_set_cpumask(int cpu, int enable)
 {
-	int node = early_cpu_to_node(cpu);
 	struct cpumask *mask;
-	int i;
+	int nid, physnid, i;
 
-	if (node == NUMA_NO_NODE) {
+	nid = early_cpu_to_node(cpu);
+	if (nid == NUMA_NO_NODE) {
 		/* early_cpu_to_node() already emits a warning and trace */
 		return;
 	}
-	for_each_online_node(i) {
-		unsigned long addr;
 
-		addr = node_start_pfn(i) << PAGE_SHIFT;
-		if (addr < physnodes[node].start ||
-					addr >= physnodes[node].end)
+	physnid = emu_nid_to_phys[nid];
+
+	for_each_online_node(i) {
+		if (emu_nid_to_phys[nid] != physnid)
 			continue;
+
 		mask = debug_cpumask_set_cpu(cpu, enable);
 		if (!mask)
 			return;
-- 
cgit v0.10.2


From c88aea7a70b0f014f98c695069ba91abc9e9b9a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:10 +0100
Subject: x86-64, NUMA: Make emulation code build numa_meminfo and share the
 registration path

NUMA emulation code built nodes[] array and had its own registration
path to set up the emulated nodes.  Update it such that it generates
emulated numa_meminfo and returns control to initmem_init() and shares
the same registration path with non-emulated cases.

Because {acpi|amd}_fake_nodes() expect nodes[] parameter,
fake_physnodes() now generates nodes[] from numa_meminfo.  This will
go away with further updates.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index e9919c4..9736204 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -541,7 +541,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 
 static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
@@ -626,9 +625,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end)
 	return ret;
 }
 
-static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+static void __init fake_physnodes(int acpi, int amd,
+				  const struct numa_meminfo *ei)
 {
-	int i;
+	static struct bootnode nodes[MAX_NUMNODES] __initdata;
+	int i, nr_nodes = 0;
+
+	for (i = 0; i < ei->nr_blks; i++) {
+		int nid = ei->blk[i].nid;
+
+		if (nodes[nid].start == nodes[nid].end) {
+			nodes[nid].start = ei->blk[i].start;
+			nodes[nid].end = ei->blk[i].end;
+			nr_nodes++;
+		} else {
+			nodes[nid].start = min(ei->blk[i].start, nodes[nid].start);
+			nodes[nid].end = max(ei->blk[i].end, nodes[nid].end);
+		}
+	}
 
 	BUG_ON(acpi && amd);
 #ifdef CONFIG_ACPI_NUMA
@@ -645,45 +659,44 @@ static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
 }
 
 /*
- * Setups up nid to range from addr to addr + size.  If the end
- * boundary is greater than max_addr, then max_addr is used instead.
- * The return value is 0 if there is additional memory left for
- * allocation past addr and -1 otherwise.  addr is adjusted to be at
- * the end of the node.
+ * Sets up nid to range from @start to @end.  The return value is -errno if
+ * something went wrong, 0 otherwise.
  */
-static int __init setup_node_range(int nid, int physnid,
-				   u64 *addr, u64 size, u64 max_addr)
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+				   int nid, int physnid, u64 start, u64 end)
 {
-	int ret = 0;
-	nodes[nid].start = *addr;
-	*addr += size;
-	if (*addr >= max_addr) {
-		*addr = max_addr;
-		ret = -1;
+	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+
+	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+		return -EINVAL;
 	}
-	nodes[nid].end = *addr;
-	node_set(nid, node_possible_map);
+
+	ei->nr_blks++;
+	eb->start = start;
+	eb->end = end;
+	eb->nid = nid;
 
 	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
 		emu_nid_to_phys[nid] = physnid;
 
 	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-	       nodes[nid].start, nodes[nid].end,
-	       (nodes[nid].end - nodes[nid].start) >> 20);
-	return ret;
+	       eb->start, eb->end, (eb->end - eb->start) >> 20);
+	return 0;
 }
 
 /*
  * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
  * to max_addr.  The return value is the number of nodes allocated.
  */
-static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+					 u64 addr, u64 max_addr, int nr_nodes)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
 	u64 size;
 	int big;
-	int ret = 0;
-	int i;
+	int nid = 0;
+	int i, ret;
 
 	if (nr_nodes <= 0)
 		return -1;
@@ -721,7 +734,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
 			u64 end = physnodes[i].start + size;
 			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
 
-			if (ret < big)
+			if (nid < big)
 				end += FAKE_NODE_MIN_SIZE;
 
 			/*
@@ -760,16 +773,21 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
 			 * happen as a result of rounding down each node's size
 			 * to FAKE_NODE_MIN_SIZE.
 			 */
-			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+			if (nodes_weight(physnode_mask) + nid >= nr_nodes)
 				end = physnodes[i].end;
 
-			if (setup_node_range(ret++, i, &physnodes[i].start,
-						end - physnodes[i].start,
-						physnodes[i].end) < 0)
+			ret = emu_setup_memblk(ei, nid++, i,
+					       physnodes[i].start,
+					       min(end, physnodes[i].end));
+			if (ret < 0)
+				return ret;
+
+			physnodes[i].start = min(end, physnodes[i].end);
+			if (physnodes[i].start == physnodes[i].end)
 				node_clear(i, physnode_mask);
 		}
 	}
-	return ret;
+	return 0;
 }
 
 /*
@@ -794,12 +812,13 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
  * Sets up fake nodes of `size' interleaved over physical nodes ranging from
  * `addr' to `max_addr'.  The return value is the number of nodes allocated.
  */
-static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+					      u64 addr, u64 max_addr, u64 size)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
 	u64 min_size;
-	int ret = 0;
-	int i;
+	int nid = 0;
+	int i, ret;
 
 	if (!size)
 		return -1;
@@ -854,30 +873,31 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 			    memblock_x86_hole_size(end, physnodes[i].end) < size)
 				end = physnodes[i].end;
 
-			/*
-			 * Setup the fake node that will be allocated as bootmem
-			 * later.  If setup_node_range() returns non-zero, there
-			 * is no more memory available on this physical node.
-			 */
-			if (setup_node_range(ret++, i, &physnodes[i].start,
-						end - physnodes[i].start,
-						physnodes[i].end) < 0)
+			ret = emu_setup_memblk(ei, nid++, i,
+					       physnodes[i].start,
+					       min(end, physnodes[i].end));
+			if (ret < 0)
+				return ret;
+
+			physnodes[i].start = min(end, physnodes[i].end);
+			if (physnodes[i].start == physnodes[i].end)
 				node_clear(i, physnode_mask);
 		}
 	}
-	return ret;
+	return 0;
 }
 
 /*
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static int __init numa_emulation(int acpi, int amd)
+static bool __init numa_emulation(int acpi, int amd)
 {
 	static struct numa_meminfo ei __initdata;
 	const u64 max_addr = max_pfn << PAGE_SHIFT;
-	int num_nodes;
-	int i;
+	int i, ret;
+
+	memset(&ei, 0, sizeof(ei));
 
 	for (i = 0; i < MAX_NUMNODES; i++)
 		emu_nid_to_phys[i] = NUMA_NO_NODE;
@@ -891,52 +911,33 @@ static int __init numa_emulation(int acpi, int amd)
 		u64 size;
 
 		size = memparse(emu_cmdline, &emu_cmdline);
-		num_nodes = split_nodes_size_interleave(0, max_addr, size);
+		ret = split_nodes_size_interleave(&ei, 0, max_addr, size);
 	} else {
 		unsigned long n;
 
 		n = simple_strtoul(emu_cmdline, NULL, 0);
-		num_nodes = split_nodes_interleave(0, max_addr, n);
+		ret = split_nodes_interleave(&ei, 0, max_addr, n);
+	}
+
+	if (ret < 0)
+		return false;
+
+	if (numa_cleanup_meminfo(&ei) < 0) {
+		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+		return false;
 	}
 
-	if (num_nodes < 0)
-		return num_nodes;
+	/* commit */
+	numa_meminfo = ei;
 
 	/* make sure all emulated nodes are mapped to a physical node */
 	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
 		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
 			emu_nid_to_phys[i] = 0;
 
-	ei.nr_blks = num_nodes;
-	for (i = 0; i < ei.nr_blks; i++) {
-		ei.blk[i].start = nodes[i].start;
-		ei.blk[i].end = nodes[i].end;
-		ei.blk[i].nid = i;
-	}
-
-	memnode_shift = compute_hash_shift(&ei);
-	if (memnode_shift < 0) {
-		memnode_shift = 0;
-		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
-		       "disabled.\n");
-		return -1;
-	}
-
-	/*
-	 * We need to vacate all active ranges that may have been registered for
-	 * the e820 memory map.
-	 */
-	remove_all_active_ranges();
-	for_each_node_mask(i, node_possible_map)
-		memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
-						nodes[i].end >> PAGE_SHIFT);
-	init_memory_mapping_high();
-	for_each_node_mask(i, node_possible_map)
-		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	fake_physnodes(acpi, amd, num_nodes);
-	numa_init_array();
+	fake_physnodes(acpi, amd, &ei);
 	numa_emu_dist = true;
-	return 0;
+	return true;
 }
 #endif /* CONFIG_NUMA_EMU */
 
@@ -988,15 +989,13 @@ void __init initmem_init(void)
 			continue;
 #ifdef CONFIG_NUMA_EMU
 		setup_physnodes(0, max_pfn << PAGE_SHIFT);
-		if (emu_cmdline && !numa_emulation(i == 0, i == 1))
-			return;
-
-		/* not emulating, build identity mapping for numa_add_cpu() */
-		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
-			emu_nid_to_phys[j] = j;
-
-		nodes_clear(node_possible_map);
-		nodes_clear(node_online_map);
+		/*
+		 * If requested, try emulation.  If emulation is not used,
+		 * build identity emu_nid_to_phys[] for numa_add_cpu()
+		 */
+		if (!emu_cmdline || !numa_emulation(i == 0, i == 1))
+			for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+				emu_nid_to_phys[j] = j;
 #endif
 		if (numa_register_memblks(&numa_meminfo) < 0)
 			continue;
-- 
cgit v0.10.2


From 775ee85d7bff8ce7c7eccde90eda400658b650a3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:10 +0100
Subject: x86-64, NUMA: Wrap node ID during emulation

Both emulation layout functions - split_nodes[_size]_interleave() -
didn't wrap emulated nid while laying out the fake nodes and tried to
avoid interating over the specified number of nodes, which is fragile.

Now that the emulation code generates numa_meminfo, the node memblks
don't need to be consecutive and emulated node IDs can simply wrap.
This makes the code more robust and is necessary for updates to better
handle the cases where the physical nodes are interleaved.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 9736204..dc95165 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -768,15 +768,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
 			    memblock_x86_hole_size(end, physnodes[i].end) < size)
 				end = physnodes[i].end;
 
-			/*
-			 * Avoid allocating more nodes than requested, which can
-			 * happen as a result of rounding down each node's size
-			 * to FAKE_NODE_MIN_SIZE.
-			 */
-			if (nodes_weight(physnode_mask) + nid >= nr_nodes)
-				end = physnodes[i].end;
-
-			ret = emu_setup_memblk(ei, nid++, i,
+			ret = emu_setup_memblk(ei, nid++ % nr_nodes, i,
 					       physnodes[i].start,
 					       min(end, physnodes[i].end));
 			if (ret < 0)
@@ -873,7 +865,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 			    memblock_x86_hole_size(end, physnodes[i].end) < size)
 				end = physnodes[i].end;
 
-			ret = emu_setup_memblk(ei, nid++, i,
+			ret = emu_setup_memblk(ei, nid++ % MAX_NUMNODES, i,
 					       physnodes[i].start,
 					       min(end, physnodes[i].end));
 			if (ret < 0)
-- 
cgit v0.10.2


From 1cca53407336fb6a86092e36dbc5c1e4d45d912b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:10 +0100
Subject: x86-64, NUMA: Emulate directly from numa_meminfo

NUMA emulation built physnodes[] array which could only represent
configurations from the physical meminfo and emulated nodes using the
information.  There's no reason to take this extra level of
indirection.  Update emulation functions so that they operate directly
on numa_meminfo.  This simplifies the code and makes emulation layout
behave better with interleaved physical nodes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index dc95165..bd086eb 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -541,8 +541,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
-
 static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
 static char *emu_cmdline __initdata;
 
@@ -551,6 +549,16 @@ void __init numa_emu_cmdline(char *str)
 	emu_cmdline = str;
 }
 
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
+{
+	int i;
+
+	for (i = 0; i < mi->nr_blks; i++)
+		if (mi->blk[i].nid == nid)
+			return i;
+	return -ENOENT;
+}
+
 int __init find_node_by_addr(unsigned long addr)
 {
 	const struct numa_meminfo *mi = &numa_meminfo;
@@ -568,63 +576,6 @@ int __init find_node_by_addr(unsigned long addr)
 	return NUMA_NO_NODE;
 }
 
-static int __init setup_physnodes(unsigned long start, unsigned long end)
-{
-	const struct numa_meminfo *mi = &numa_meminfo;
-	int ret = 0;
-	int i;
-
-	memset(physnodes, 0, sizeof(physnodes));
-
-	for (i = 0; i < mi->nr_blks; i++) {
-		int nid = mi->blk[i].nid;
-
-		if (physnodes[nid].start == physnodes[nid].end) {
-			physnodes[nid].start = mi->blk[i].start;
-			physnodes[nid].end = mi->blk[i].end;
-		} else {
-			physnodes[nid].start = min(physnodes[nid].start,
-						   mi->blk[i].start);
-			physnodes[nid].end = max(physnodes[nid].end,
-						 mi->blk[i].end);
-		}
-	}
-
-	/*
-	 * Basic sanity checking on the physical node map: there may be errors
-	 * if the SRAT or AMD code incorrectly reported the topology or the mem=
-	 * kernel parameter is used.
-	 */
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (physnodes[i].start == physnodes[i].end)
-			continue;
-		if (physnodes[i].start > end) {
-			physnodes[i].end = physnodes[i].start;
-			continue;
-		}
-		if (physnodes[i].end < start) {
-			physnodes[i].start = physnodes[i].end;
-			continue;
-		}
-		if (physnodes[i].start < start)
-			physnodes[i].start = start;
-		if (physnodes[i].end > end)
-			physnodes[i].end = end;
-		ret++;
-	}
-
-	/*
-	 * If no physical topology was detected, a single node is faked to cover
-	 * the entire address space.
-	 */
-	if (!ret) {
-		physnodes[ret].start = start;
-		physnodes[ret].end = end;
-		ret = 1;
-	}
-	return ret;
-}
-
 static void __init fake_physnodes(int acpi, int amd,
 				  const struct numa_meminfo *ei)
 {
@@ -663,9 +614,11 @@ static void __init fake_physnodes(int acpi, int amd,
  * something went wrong, 0 otherwise.
  */
 static int __init emu_setup_memblk(struct numa_meminfo *ei,
-				   int nid, int physnid, u64 start, u64 end)
+				   struct numa_meminfo *pi,
+				   int nid, int phys_blk, u64 size)
 {
 	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+	struct numa_memblk *pb = &pi->blk[phys_blk];
 
 	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
 		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
@@ -673,12 +626,18 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
 	}
 
 	ei->nr_blks++;
-	eb->start = start;
-	eb->end = end;
+	eb->start = pb->start;
+	eb->end = pb->start + size;
 	eb->nid = nid;
 
 	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
-		emu_nid_to_phys[nid] = physnid;
+		emu_nid_to_phys[nid] = pb->nid;
+
+	pb->start += size;
+	if (pb->start >= pb->end) {
+		WARN_ON_ONCE(pb->start > pb->end);
+		numa_remove_memblk_from(phys_blk, pi);
+	}
 
 	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
 	       eb->start, eb->end, (eb->end - eb->start) >> 20);
@@ -690,6 +649,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
  * to max_addr.  The return value is the number of nodes allocated.
  */
 static int __init split_nodes_interleave(struct numa_meminfo *ei,
+					 struct numa_meminfo *pi,
 					 u64 addr, u64 max_addr, int nr_nodes)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
@@ -721,9 +681,8 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
 		return -1;
 	}
 
-	for (i = 0; i < MAX_NUMNODES; i++)
-		if (physnodes[i].start != physnodes[i].end)
-			node_set(i, physnode_mask);
+	for (i = 0; i < pi->nr_blks; i++)
+		node_set(pi->blk[i].nid, physnode_mask);
 
 	/*
 	 * Continue to fill physical nodes with fake nodes until there is no
@@ -731,8 +690,18 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
 	 */
 	while (nodes_weight(physnode_mask)) {
 		for_each_node_mask(i, physnode_mask) {
-			u64 end = physnodes[i].start + size;
 			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+			u64 start, limit, end;
+			int phys_blk;
+
+			phys_blk = emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+			start = pi->blk[phys_blk].start;
+			limit = pi->blk[phys_blk].end;
+			end = start + size;
 
 			if (nid < big)
 				end += FAKE_NODE_MIN_SIZE;
@@ -741,11 +710,11 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
 			 * Continue to add memory to this fake node if its
 			 * non-reserved memory is less than the per-node size.
 			 */
-			while (end - physnodes[i].start -
-				memblock_x86_hole_size(physnodes[i].start, end) < size) {
+			while (end - start -
+			       memblock_x86_hole_size(start, end) < size) {
 				end += FAKE_NODE_MIN_SIZE;
-				if (end > physnodes[i].end) {
-					end = physnodes[i].end;
+				if (end > limit) {
+					end = limit;
 					break;
 				}
 			}
@@ -764,19 +733,15 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
 			 * next node, this one must extend to the end of the
 			 * physical node.
 			 */
-			if (physnodes[i].end - end -
-			    memblock_x86_hole_size(end, physnodes[i].end) < size)
-				end = physnodes[i].end;
+			if (limit - end -
+			    memblock_x86_hole_size(end, limit) < size)
+				end = limit;
 
-			ret = emu_setup_memblk(ei, nid++ % nr_nodes, i,
-					       physnodes[i].start,
-					       min(end, physnodes[i].end));
+			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+					       phys_blk,
+					       min(end, limit) - start);
 			if (ret < 0)
 				return ret;
-
-			physnodes[i].start = min(end, physnodes[i].end);
-			if (physnodes[i].start == physnodes[i].end)
-				node_clear(i, physnode_mask);
 		}
 	}
 	return 0;
@@ -805,6 +770,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
  * `addr' to `max_addr'.  The return value is the number of nodes allocated.
  */
 static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+					      struct numa_meminfo *pi,
 					      u64 addr, u64 max_addr, u64 size)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
@@ -833,9 +799,9 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 	}
 	size &= FAKE_NODE_MIN_HASH_MASK;
 
-	for (i = 0; i < MAX_NUMNODES; i++)
-		if (physnodes[i].start != physnodes[i].end)
-			node_set(i, physnode_mask);
+	for (i = 0; i < pi->nr_blks; i++)
+		node_set(pi->blk[i].nid, physnode_mask);
+
 	/*
 	 * Fill physical nodes with fake nodes of size until there is no memory
 	 * left on any of them.
@@ -843,10 +809,18 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 	while (nodes_weight(physnode_mask)) {
 		for_each_node_mask(i, physnode_mask) {
 			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
-			u64 end;
+			u64 start, limit, end;
+			int phys_blk;
 
-			end = find_end_of_node(physnodes[i].start,
-						physnodes[i].end, size);
+			phys_blk = emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+			start = pi->blk[phys_blk].start;
+			limit = pi->blk[phys_blk].end;
+
+			end = find_end_of_node(start, limit, size);
 			/*
 			 * If there won't be at least FAKE_NODE_MIN_SIZE of
 			 * non-reserved memory in ZONE_DMA32 for the next node,
@@ -861,19 +835,15 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 			 * next node, this one must extend to the end of the
 			 * physical node.
 			 */
-			if (physnodes[i].end - end -
-			    memblock_x86_hole_size(end, physnodes[i].end) < size)
-				end = physnodes[i].end;
+			if (limit - end -
+			    memblock_x86_hole_size(end, limit) < size)
+				end = limit;
 
-			ret = emu_setup_memblk(ei, nid++ % MAX_NUMNODES, i,
-					       physnodes[i].start,
-					       min(end, physnodes[i].end));
+			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+					       phys_blk,
+					       min(end, limit) - start);
 			if (ret < 0)
 				return ret;
-
-			physnodes[i].start = min(end, physnodes[i].end);
-			if (physnodes[i].start == physnodes[i].end)
-				node_clear(i, physnode_mask);
 		}
 	}
 	return 0;
@@ -886,10 +856,12 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 static bool __init numa_emulation(int acpi, int amd)
 {
 	static struct numa_meminfo ei __initdata;
+	static struct numa_meminfo pi __initdata;
 	const u64 max_addr = max_pfn << PAGE_SHIFT;
 	int i, ret;
 
 	memset(&ei, 0, sizeof(ei));
+	pi = numa_meminfo;
 
 	for (i = 0; i < MAX_NUMNODES; i++)
 		emu_nid_to_phys[i] = NUMA_NO_NODE;
@@ -903,12 +875,12 @@ static bool __init numa_emulation(int acpi, int amd)
 		u64 size;
 
 		size = memparse(emu_cmdline, &emu_cmdline);
-		ret = split_nodes_size_interleave(&ei, 0, max_addr, size);
+		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
 	} else {
 		unsigned long n;
 
 		n = simple_strtoul(emu_cmdline, NULL, 0);
-		ret = split_nodes_interleave(&ei, 0, max_addr, n);
+		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
 	}
 
 	if (ret < 0)
@@ -980,7 +952,6 @@ void __init initmem_init(void)
 		if (numa_cleanup_meminfo(&numa_meminfo) < 0)
 			continue;
 #ifdef CONFIG_NUMA_EMU
-		setup_physnodes(0, max_pfn << PAGE_SHIFT);
 		/*
 		 * If requested, try emulation.  If emulation is not used,
 		 * build identity emu_nid_to_phys[] for numa_add_cpu()
-- 
cgit v0.10.2


From 6b78cb549b4105cbf7c6f7461f27a21f00c44997 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:10 +0100
Subject: x86-64, NUMA: Unify emulated apicid -> node mapping transformation

NUMA emulation changes node mappings and thus apicid -> node mapping
needs to be updated accordingly.  srat_64 and amdtopology_64 did this
separately; however, all the necessary information is the mapping from
emulated nodes to physical nodes which is available in
emu_nid_to_phys[].

Implement common __apicid_to_node[] transformation in numa_emulation()
and drop duplicate implementations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index fd7b609..f37ea2f 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -196,10 +196,6 @@ int __init amd_numa_init(void)
 }
 
 #ifdef CONFIG_NUMA_EMU
-static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
-	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-
 /*
  * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
  * setup to represent the physical topology but reflect the emulated
@@ -224,20 +220,15 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
 	for (i = 0; i < nr_nodes; i++) {
 		int index;
 		int nid;
-		int j;
 
 		nid = find_node_by_addr(nodes[i].start);
 		if (nid == NUMA_NO_NODE)
 			continue;
 
 		index = nodeids[nid] << bits;
-		if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
-			for (j = apicid_base; j < cores + apicid_base; j++)
-				fake_apicid_to_node[index + j] = i;
 #ifdef CONFIG_ACPI_NUMA
 		__acpi_map_pxm_to_node(nid, i);
 #endif
 	}
-	memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node));
 }
 #endif /* CONFIG_NUMA_EMU */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index bd086eb..722039e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -858,7 +858,7 @@ static bool __init numa_emulation(int acpi, int amd)
 	static struct numa_meminfo ei __initdata;
 	static struct numa_meminfo pi __initdata;
 	const u64 max_addr = max_pfn << PAGE_SHIFT;
-	int i, ret;
+	int i, j, ret;
 
 	memset(&ei, 0, sizeof(ei));
 	pi = numa_meminfo;
@@ -894,6 +894,20 @@ static bool __init numa_emulation(int acpi, int amd)
 	/* commit */
 	numa_meminfo = ei;
 
+	/*
+	 * Transform __apicid_to_node table to use emulated nids by
+	 * reverse-mapping phys_nid.  The maps should always exist but fall
+	 * back to zero just in case.
+	 */
+	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+		if (__apicid_to_node[i] == NUMA_NO_NODE)
+			continue;
+		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+			if (__apicid_to_node[i] == emu_nid_to_phys[j])
+				break;
+		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+	}
+
 	/* make sure all emulated nodes are mapped to a physical node */
 	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
 		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index d2f53f3..d4fbfea 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -265,9 +265,6 @@ int __init x86_acpi_numa_init(void)
 static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
 	[0 ... MAX_NUMNODES-1] = PXM_INVAL
 };
-static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
-	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
 
 /*
  * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
@@ -279,7 +276,7 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
  */
 void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 {
-	int i, j;
+	int i;
 
 	for (i = 0; i < num_nodes; i++) {
 		int nid, pxm;
@@ -291,29 +288,10 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 		if (pxm == PXM_INVAL)
 			continue;
 		fake_node_to_pxm_map[i] = pxm;
-		/*
-		 * For each apicid_to_node mapping that exists for this real
-		 * node, it must now point to the fake node ID.
-		 */
-		for (j = 0; j < MAX_LOCAL_APIC; j++)
-			if (__apicid_to_node[j] == nid &&
-			    fake_apicid_to_node[j] == NUMA_NO_NODE)
-				fake_apicid_to_node[j] = i;
 	}
 
-	/*
-	 * If there are apicid-to-node mappings for physical nodes that do not
-	 * have a corresponding emulated node, it should default to a guaranteed
-	 * value.
-	 */
-	for (i = 0; i < MAX_LOCAL_APIC; i++)
-		if (__apicid_to_node[i] != NUMA_NO_NODE &&
-		    fake_apicid_to_node[i] == NUMA_NO_NODE)
-			fake_apicid_to_node[i] = 0;
-
 	for (i = 0; i < num_nodes; i++)
 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
-	memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node));
 
 	for (i = 0; i < num_nodes; i++)
 		if (fake_nodes[i].start != fake_nodes[i].end)
-- 
cgit v0.10.2


From e23bba604433a202cd301a976454a90ea6b783ef Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 17:11:10 +0100
Subject: x86-64, NUMA: Unify emulated distance mapping

NUMA emulation needs to update node distance information.  It did it
by remapping apicid to PXM mapping, even when amdtopology is being
used.  There is no reason to go through such convolution.  The generic
code has all the information necessary to transform the distance table
to the emulated nid space.

Implement generic distance table transformation in numa_emulation()
and drop private implementations in srat_64 and amdtopology_64.  This
makes find_node_by_addr() and fake_physnodes() and related functions
unnecessary, drop them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@linux.intel.com>

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 9c9fe1b..a37da6d 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -186,12 +186,6 @@ struct bootnode;
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
 extern int x86_acpi_numa_init(void);
-
-#ifdef CONFIG_NUMA_EMU
-extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
-				   int num_nodes);
-extern int acpi_emu_node_distance(int a, int b);
-#endif
 #endif /* CONFIG_ACPI_NUMA */
 
 #define acpi_unlazy_tlb(x)	leave_mm(x)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 384d118..e264ae5 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -20,10 +20,6 @@ extern int amd_numa_init(void);
 extern int amd_get_subcaches(int);
 extern int amd_set_subcaches(int, int);
 
-#ifdef CONFIG_NUMA_EMU
-extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
-#endif
-
 struct amd_northbridge {
 	struct pci_dev *misc;
 	struct pci_dev *link;
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 5361c59..344eb17 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -34,7 +34,6 @@ extern void __init numa_set_distance(int from, int to, int distance);
 #define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
 #define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
 void numa_emu_cmdline(char *);
-int __init find_node_by_addr(unsigned long addr);
 #endif /* CONFIG_NUMA_EMU */
 #else
 static inline int numa_cpu_node(int cpu)		{ return NUMA_NO_NODE; }
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f37ea2f..0919c26 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -194,41 +194,3 @@ int __init amd_numa_init(void)
 
 	return 0;
 }
-
-#ifdef CONFIG_NUMA_EMU
-/*
- * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
- * setup to represent the physical topology but reflect the emulated
- * environment.  For each emulated node, the real node which it appears on is
- * found and a fake pxm to nid mapping is created which mirrors the actual
- * locality.  node_distance() then represents the correct distances between
- * emulated nodes by using the fake acpi mappings to pxms.
- */
-void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
-{
-	unsigned int bits;
-	unsigned int cores;
-	unsigned int apicid_base = 0;
-	int i;
-
-	bits = boot_cpu_data.x86_coreid_bits;
-	cores = 1 << bits;
-	early_get_boot_cpu_id();
-	if (boot_cpu_physical_apicid > 0)
-		apicid_base = boot_cpu_physical_apicid;
-
-	for (i = 0; i < nr_nodes; i++) {
-		int index;
-		int nid;
-
-		nid = find_node_by_addr(nodes[i].start);
-		if (nid == NUMA_NO_NODE)
-			continue;
-
-		index = nodeids[nid] << bits;
-#ifdef CONFIG_ACPI_NUMA
-		__acpi_map_pxm_to_node(nid, i);
-#endif
-	}
-}
-#endif /* CONFIG_NUMA_EMU */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 722039e..8ce6177 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -48,10 +48,6 @@ static struct numa_meminfo numa_meminfo __initdata;
 static int numa_distance_cnt;
 static u8 *numa_distance;
 
-#ifdef CONFIG_NUMA_EMU
-static bool numa_emu_dist;
-#endif
-
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -443,10 +439,6 @@ void __init numa_set_distance(int from, int to, int distance)
 
 int __node_distance(int from, int to)
 {
-#if defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA_EMU)
-	if (numa_emu_dist)
-		return acpi_emu_node_distance(from, to);
-#endif
 	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
 		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
 	return numa_distance[from * numa_distance_cnt + to];
@@ -559,56 +551,6 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
 	return -ENOENT;
 }
 
-int __init find_node_by_addr(unsigned long addr)
-{
-	const struct numa_meminfo *mi = &numa_meminfo;
-	int i;
-
-	for (i = 0; i < mi->nr_blks; i++) {
-		/*
-		 * Find the real node that this emulated node appears on.  For
-		 * the sake of simplicity, we only use a real node's starting
-		 * address to determine which emulated node it appears on.
-		 */
-		if (addr >= mi->blk[i].start && addr < mi->blk[i].end)
-			return mi->blk[i].nid;
-	}
-	return NUMA_NO_NODE;
-}
-
-static void __init fake_physnodes(int acpi, int amd,
-				  const struct numa_meminfo *ei)
-{
-	static struct bootnode nodes[MAX_NUMNODES] __initdata;
-	int i, nr_nodes = 0;
-
-	for (i = 0; i < ei->nr_blks; i++) {
-		int nid = ei->blk[i].nid;
-
-		if (nodes[nid].start == nodes[nid].end) {
-			nodes[nid].start = ei->blk[i].start;
-			nodes[nid].end = ei->blk[i].end;
-			nr_nodes++;
-		} else {
-			nodes[nid].start = min(ei->blk[i].start, nodes[nid].start);
-			nodes[nid].end = max(ei->blk[i].end, nodes[nid].end);
-		}
-	}
-
-	BUG_ON(acpi && amd);
-#ifdef CONFIG_ACPI_NUMA
-	if (acpi)
-		acpi_fake_nodes(nodes, nr_nodes);
-#endif
-#ifdef CONFIG_AMD_NUMA
-	if (amd)
-		amd_fake_nodes(nodes, nr_nodes);
-#endif
-	if (!acpi && !amd)
-		for (i = 0; i < nr_cpu_ids; i++)
-			numa_set_node(i, 0);
-}
-
 /*
  * Sets up nid to range from @start to @end.  The return value is -errno if
  * something went wrong, 0 otherwise.
@@ -853,11 +795,13 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static bool __init numa_emulation(int acpi, int amd)
+static bool __init numa_emulation(void)
 {
 	static struct numa_meminfo ei __initdata;
 	static struct numa_meminfo pi __initdata;
 	const u64 max_addr = max_pfn << PAGE_SHIFT;
+	int phys_dist_cnt = numa_distance_cnt;
+	u8 *phys_dist = NULL;
 	int i, j, ret;
 
 	memset(&ei, 0, sizeof(ei));
@@ -891,6 +835,25 @@ static bool __init numa_emulation(int acpi, int amd)
 		return false;
 	}
 
+	/*
+	 * Copy the original distance table.  It's temporary so no need to
+	 * reserve it.
+	 */
+	if (phys_dist_cnt) {
+		size_t size = phys_dist_cnt * sizeof(numa_distance[0]);
+		u64 phys;
+
+		phys = memblock_find_in_range(0,
+					      (u64)max_pfn_mapped << PAGE_SHIFT,
+					      size, PAGE_SIZE);
+		if (phys == MEMBLOCK_ERROR) {
+			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+			return false;
+		}
+		phys_dist = __va(phys);
+		memcpy(phys_dist, numa_distance, size);
+	}
+
 	/* commit */
 	numa_meminfo = ei;
 
@@ -913,8 +876,23 @@ static bool __init numa_emulation(int acpi, int amd)
 		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
 			emu_nid_to_phys[i] = 0;
 
-	fake_physnodes(acpi, amd, &ei);
-	numa_emu_dist = true;
+	/* transform distance table */
+	numa_reset_distance();
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			int physi = emu_nid_to_phys[i];
+			int physj = emu_nid_to_phys[j];
+			int dist;
+
+			if (physi >= phys_dist_cnt || physj >= phys_dist_cnt)
+				dist = physi == physj ?
+					LOCAL_DISTANCE : REMOTE_DISTANCE;
+			else
+				dist = phys_dist[physi * phys_dist_cnt + physj];
+
+			numa_set_distance(i, j, dist);
+		}
+	}
 	return true;
 }
 #endif /* CONFIG_NUMA_EMU */
@@ -970,7 +948,7 @@ void __init initmem_init(void)
 		 * If requested, try emulation.  If emulation is not used,
 		 * build identity emu_nid_to_phys[] for numa_add_cpu()
 		 */
-		if (!emu_cmdline || !numa_emulation(i == 0, i == 1))
+		if (!emu_cmdline || !numa_emulation())
 			for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
 				emu_nid_to_phys[j] = j;
 #endif
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index d4fbfea..8e9d339 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -26,8 +26,6 @@
 
 int acpi_numa __initdata;
 
-static struct acpi_table_slit *acpi_slit;
-
 static struct bootnode nodes_add[MAX_NUMNODES];
 
 static __init int setup_node(int pxm)
@@ -51,25 +49,11 @@ static __init inline int srat_disabled(void)
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
 	int i, j;
-	unsigned length;
-	unsigned long phys;
 
 	for (i = 0; i < slit->locality_count; i++)
 		for (j = 0; j < slit->locality_count; j++)
 			numa_set_distance(pxm_to_node(i), pxm_to_node(j),
 				slit->entry[slit->locality_count * i + j]);
-
-	/* acpi_slit is used only by emulation */
-	length = slit->header.length;
-	phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
-		 PAGE_SIZE);
-
-	if (phys == MEMBLOCK_ERROR)
-		panic(" Can not save slit!\n");
-
-	acpi_slit = __va(phys);
-	memcpy(acpi_slit, slit, length);
-	memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT");
 }
 
 /* Callback for Proximity Domain -> x2APIC mapping */
@@ -261,55 +245,6 @@ int __init x86_acpi_numa_init(void)
 	return srat_disabled() ? -EINVAL : 0;
 }
 
-#ifdef CONFIG_NUMA_EMU
-static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
-	[0 ... MAX_NUMNODES-1] = PXM_INVAL
-};
-
-/*
- * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
- * mappings that respect the real ACPI topology but reflect our emulated
- * environment.  For each emulated node, we find which real node it appears on
- * and create PXM to NID mappings for those fake nodes which mirror that
- * locality.  SLIT will now represent the correct distances between emulated
- * nodes as a result of the real topology.
- */
-void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
-{
-	int i;
-
-	for (i = 0; i < num_nodes; i++) {
-		int nid, pxm;
-
-		nid = find_node_by_addr(fake_nodes[i].start);
-		if (nid == NUMA_NO_NODE)
-			continue;
-		pxm = node_to_pxm(nid);
-		if (pxm == PXM_INVAL)
-			continue;
-		fake_node_to_pxm_map[i] = pxm;
-	}
-
-	for (i = 0; i < num_nodes; i++)
-		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
-
-	for (i = 0; i < num_nodes; i++)
-		if (fake_nodes[i].start != fake_nodes[i].end)
-			node_set(i, numa_nodes_parsed);
-}
-
-int acpi_emu_node_distance(int a, int b)
-{
-	int index;
-
-	if (!acpi_slit)
-		return node_to_pxm(a) == node_to_pxm(b) ?
-			LOCAL_DISTANCE : REMOTE_DISTANCE;
-	index = acpi_slit->locality_count * node_to_pxm(a);
-	return acpi_slit->entry[index + node_to_pxm(b)];
-}
-#endif /* CONFIG_NUMA_EMU */
-
 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
 int memory_add_physaddr_to_nid(u64 start)
 {
-- 
cgit v0.10.2


From 5c35d69fb60b1dc49595f5b9a2c7158283e9eaf3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 9 Feb 2011 11:38:43 -0200
Subject: perf ui: Serialize screen updates

The ui operations so far were used by just one thread, but 'perf top
--tui' now has two threads updating the screen, so we need to use a
mutex to avoid garbling the screen.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index bc4d9bf..ffd1047 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -637,6 +637,7 @@ else
 		LIB_H += util/ui/libslang.h
 		LIB_H += util/ui/progress.h
 		LIB_H += util/ui/util.h
+		LIB_H += util/ui/ui.h
 	endif
 endif
 
diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c
index 8bc010e..60d6c81 100644
--- a/tools/perf/util/ui/browser.c
+++ b/tools/perf/util/ui/browser.c
@@ -1,4 +1,5 @@
 #include "libslang.h"
+#include "ui.h"
 #include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/rbtree.h>
@@ -178,6 +179,7 @@ int ui_browser__show(struct ui_browser *self, const char *title,
 	if (self->sb == NULL)
 		return -1;
 
+	pthread_mutex_lock(&ui__lock);
 	SLsmg_gotorc(0, 0);
 	ui_browser__set_color(self, NEWT_COLORSET_ROOT);
 	slsmg_write_nstring(title, self->width);
@@ -188,25 +190,30 @@ int ui_browser__show(struct ui_browser *self, const char *title,
 	va_start(ap, helpline);
 	ui_helpline__vpush(helpline, ap);
 	va_end(ap);
+	pthread_mutex_unlock(&ui__lock);
 	return 0;
 }
 
 void ui_browser__hide(struct ui_browser *self)
 {
+	pthread_mutex_lock(&ui__lock);
 	newtFormDestroy(self->form);
 	self->form = NULL;
 	ui_helpline__pop();
+	pthread_mutex_unlock(&ui__lock);
 }
 
 int ui_browser__refresh(struct ui_browser *self)
 {
 	int row;
 
+	pthread_mutex_lock(&ui__lock);
 	newtScrollbarSet(self->sb, self->index, self->nr_entries - 1);
 	row = self->refresh(self);
 	ui_browser__set_color(self, HE_COLORSET_NORMAL);
 	SLsmg_fill_region(self->y + row, self->x,
 			  self->height - row, self->width, ' ');
+	pthread_mutex_unlock(&ui__lock);
 
 	return 0;
 }
diff --git a/tools/perf/util/ui/helpline.c b/tools/perf/util/ui/helpline.c
index 8d79daa..f36d2ff5 100644
--- a/tools/perf/util/ui/helpline.c
+++ b/tools/perf/util/ui/helpline.c
@@ -5,6 +5,7 @@
 
 #include "../debug.h"
 #include "helpline.h"
+#include "ui.h"
 
 void ui_helpline__pop(void)
 {
@@ -55,7 +56,8 @@ int ui_helpline__show_help(const char *format, va_list ap)
 	int ret;
 	static int backlog;
 
-        ret = vsnprintf(ui_helpline__last_msg + backlog,
+	pthread_mutex_lock(&ui__lock);
+	ret = vsnprintf(ui_helpline__last_msg + backlog,
 			sizeof(ui_helpline__last_msg) - backlog, format, ap);
 	backlog += ret;
 
@@ -64,6 +66,7 @@ int ui_helpline__show_help(const char *format, va_list ap)
 		newtRefresh();
 		backlog = 0;
 	}
+	pthread_mutex_unlock(&ui__lock);
 
 	return ret;
 }
diff --git a/tools/perf/util/ui/setup.c b/tools/perf/util/ui/setup.c
index fbf1a14..ee46d67 100644
--- a/tools/perf/util/ui/setup.c
+++ b/tools/perf/util/ui/setup.c
@@ -6,6 +6,9 @@
 #include "../debug.h"
 #include "browser.h"
 #include "helpline.h"
+#include "ui.h"
+
+pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER;
 
 static void newt_suspend(void *d __used)
 {
diff --git a/tools/perf/util/ui/ui.h b/tools/perf/util/ui/ui.h
new file mode 100644
index 0000000..d264e05
--- /dev/null
+++ b/tools/perf/util/ui/ui.h
@@ -0,0 +1,8 @@
+#ifndef _PERF_UI_H_
+#define _PERF_UI_H_ 1
+
+#include <pthread.h>
+
+extern pthread_mutex_t ui__lock;
+
+#endif /* _PERF_UI_H_ */
-- 
cgit v0.10.2


From 289c082044643e55f65c6a16bb3621cf3f35a454 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 9 Feb 2011 13:56:28 -0200
Subject: perf annotate: Check if offset is less than symbol size

Just like done on symbol__inc_addr_samples to catch misparsed offsets
from objdump.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 02976b8..70ec422 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -541,11 +541,12 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
 	struct annotation *notes = symbol__annotation(sym);
 	struct sym_hist *h = annotation__histogram(notes, evidx);
 	struct objdump_line *pos;
+	int len = sym->end - sym->start;
 
 	h->sum = 0;
 
 	list_for_each_entry(pos, &notes->src->source, node) {
-		if (pos->offset != -1) {
+		if (pos->offset != -1 && pos->offset < len) {
 			h->addr[pos->offset] = h->addr[pos->offset] * 7 / 8;
 			h->sum += h->addr[pos->offset];
 		}
-- 
cgit v0.10.2


From b99976e2d277c963138e090ae17bf835f8a07680 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 9 Feb 2011 13:59:14 -0200
Subject: perf annotate browser: Use the percent color for the whole line

Not just for the percentage number, to see the hot lines more easily.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c
index 1aa3965..cfb5a27 100644
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -44,8 +44,6 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro
 		struct objdump_line_rb_node *olrb = objdump_line__rb(ol);
 		ui_browser__set_percent_color(self, olrb->percent, current_entry);
 		slsmg_printf(" %7.2f ", olrb->percent);
-		if (!current_entry)
-			ui_browser__set_color(self, HE_COLORSET_CODE);
 	} else {
 		ui_browser__set_percent_color(self, 0, current_entry);
 		slsmg_write_nstring(" ", 9);
@@ -57,6 +55,9 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro
 		slsmg_write_nstring(" ", width - 18);
 	else
 		slsmg_write_nstring(ol->line, width - 18);
+
+	if (!current_entry)
+		ui_browser__set_color(self, HE_COLORSET_CODE);
 }
 
 static double objdump_line__calc_percent(struct objdump_line *self,
-- 
cgit v0.10.2


From 4187e262bc90369ba581ee28ec74ed416618889e Mon Sep 17 00:00:00 2001
From: Jesse Brandeburg <jesse.brandeburg@intel.com>
Date: Wed, 9 Feb 2011 17:11:00 -0800
Subject: perf tools: Update Makefile with some help

The perf makefile is nicely complete except for

a) an uninstall option
b) a 'make help' description

This patch implements b)
it also comments out other non-working makefile targets

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
index bd498d4..4626a39 100644
--- a/tools/perf/Documentation/Makefile
+++ b/tools/perf/Documentation/Makefile
@@ -178,8 +178,8 @@ install-pdf: pdf
 	$(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
 	$(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
 
-install-html: html
-	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
+#install-html: html
+#	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
 
 ../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
 	$(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
@@ -288,15 +288,16 @@ $(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
 	sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
 	mv $@+ $@
 
-install-webdoc : html
-	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
+# UNIMPLEMENTED
+#install-webdoc : html
+#	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
 
-quick-install: quick-install-man
+# quick-install: quick-install-man
 
-quick-install-man:
-	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
+# quick-install-man:
+#	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
 
-quick-install-html:
-	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
+#quick-install-html:
+#	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
 
 .PHONY: .FORCE-PERF-VERSION-FILE
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index ffd1047..7c75f1d 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -1102,6 +1102,36 @@ $(sort $(dir $(DIRECTORY_DEPS))):
 $(LIB_FILE): $(LIB_OBJS)
 	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
 
+help:
+	@echo 'Perf make targets:'
+	@echo '  doc		- make *all* documentation (see below)'
+	@echo '  man		- make manpage documentation (access with man <foo>)'
+	@echo '  html		- make html documentation'
+	@echo '  info		- make GNU info documentation (access with info <foo>)'
+	@echo '  pdf		- make pdf documentation'
+	@echo '  TAGS		- use etags to make tag information for source browsing'
+	@echo '  tags		- use ctags to make tag information for source browsing'
+	@echo '  cscope	- use cscope to make interactive browsing database'
+	@echo ''
+	@echo 'Perf install targets:'
+	@echo '  NOTE: documentation build requires asciidoc, xmlto packages to be installed'
+	@echo '  HINT: use "make prefix=<path> <install target>" to install to a particular'
+	@echo '        path like make prefix=/usr/local install install-doc'
+	@echo '  install	- install compiled binaries'
+	@echo '  install-doc	- install *all* documentation'
+	@echo '  install-man	- install manpage documentation'
+	@echo '  install-html	- install html documentation'
+	@echo '  install-info	- install GNU info documentation'
+	@echo '  install-pdf	- install pdf documentation'
+	@echo ''
+	@echo '  quick-install-doc	- alias for quick-install-man'
+	@echo '  quick-install-man	- install the documentation quickly'
+	@echo '  quick-install-html	- install the html documentation quickly'
+	@echo ''
+	@echo 'Perf maintainer targets:'
+	@echo '  distclean		- alias to clean'
+	@echo '  clean			- clean all binary objects and build output'
+
 doc:
 	$(MAKE) -C Documentation all
 
-- 
cgit v0.10.2


From e116dfa1c357da49f55e1555767ec991225a8321 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 10 Feb 2011 18:08:10 +0900
Subject: perf probe: Support function@filename syntax for --line

Since "perf probe --add" supports function@filename syntax, --line
option should also support it.

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linux-kernel@vger.kernel.org
LKML-Reference: <20110210090810.1809.26913.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 81c3220..02bafce 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -16,7 +16,7 @@ or
 or
 'perf probe' --list
 or
-'perf probe' [options] --line='FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]'
+'perf probe' [options] --line='LINE'
 or
 'perf probe' [options] --vars='PROBEPOINT'
 
@@ -128,13 +128,14 @@ LINE SYNTAX
 -----------
 Line range is described by following syntax.
 
- "FUNC[:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]"
+ "FUNC[@SRC][:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]"
 
 FUNC specifies the function name of showing lines. 'RLN' is the start line
 number from function entry line, and 'RLN2' is the end line number. As same as
 probe syntax, 'SRC' means the source file path, 'ALN' is start line number,
 and 'ALN2' is end line number in the file. It is also possible to specify how
-many lines to show by using 'NUM'.
+many lines to show by using 'NUM'. Moreover, 'FUNC@SRC' combination is good
+for searching a specific function when several functions share same name.
 So, "source.c:100-120" shows lines between 100th to l20th in source.c file. And "func:10+20" shows 20 lines from 10th line of func function.
 
 LAZY MATCHING
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 9d237e3..cbd7650 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -595,11 +595,11 @@ static int parse_line_num(char **ptr, int *val, const char *what)
  * The line range syntax is described by:
  *
  *         SRC[:SLN[+NUM|-ELN]]
- *         FNC[:SLN[+NUM|-ELN]]
+ *         FNC[@SRC][:SLN[+NUM|-ELN]]
  */
 int parse_line_range_desc(const char *arg, struct line_range *lr)
 {
-	char *range, *name = strdup(arg);
+	char *range, *file, *name = strdup(arg);
 	int err;
 
 	if (!name)
@@ -649,7 +649,16 @@ int parse_line_range_desc(const char *arg, struct line_range *lr)
 		}
 	}
 
-	if (strchr(name, '.'))
+	file = strchr(name, '@');
+	if (file) {
+		*file = '\0';
+		lr->file = strdup(++file);
+		if (lr->file == NULL) {
+			err = -ENOMEM;
+			goto err;
+		}
+		lr->function = name;
+	} else if (strchr(name, '.'))
 		lr->file = name;
 	else
 		lr->function = name;
-- 
cgit v0.10.2


From 8737ebdea02315eaffaebb3b73d55f2f726a4fe0 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 10 Feb 2011 18:08:16 +0900
Subject: perf probe: Show filename which contains target function

Show filename which contains a target function with the function name on
"--lines" mode, because perf-probe just shows the first function even if
there are many same-name functions.

Originally adopted by Franck Bui-Huu's patch which shows file name
instead of function name. I've just modified it to show both of function
name and file name, because of completeness of output.

 E.g.)
 $ perf probe -L t_show
 <t_show@/home/mhiramat/ksrc/linux-2.6-tip/kernel/trace/ftrace.c:0>
      0  static int t_show(struct seq_file *m, void *v)
      1  {
      2         struct ftrace_iterator *iter = m->private;
 ...

 $ perf probe -L t_show@trace/trace.c
 <t_show@/home/mhiramat/ksrc/linux-2.6-tip/kernel/trace/trace.c:0>
      0  static int t_show(struct seq_file *m, void *v)
      1  {
                struct tracer *t = v;
 ...

Original-patch-by: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110210090816.1809.43426.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index cbd7650..0e3ea13 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -409,7 +409,7 @@ int show_line_range(struct line_range *lr, const char *module)
 	setup_pager();
 
 	if (lr->function)
-		fprintf(stdout, "<%s:%d>\n", lr->function,
+		fprintf(stdout, "<%s@%s:%d>\n", lr->function, lr->path,
 			lr->start - lr->offset);
 	else
 		fprintf(stdout, "<%s:%d>\n", lr->path, lr->start);
-- 
cgit v0.10.2


From 4498062e72fd55b2a9a4ac1b44fab8cb44ad5367 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 17 Feb 2011 10:07:42 -0200
Subject: perf python: Add cgroup.c to setup.py to get it building again

The 023695d cset added a new file, util/cgroup.c, that is referenced from
util/evsel.c, so it needs to be present in util/setup.py so that the python
shared object binding works, fixing this:

[root@emilia linux]# export PYTHONPATH=~acme/git/build/perf/python/
[root@emilia linux]# ./tools/perf/python/twatch.py
Traceback (most recent call last):
  File "./tools/perf/python/twatch.py", line 16, in <module>
    import perf
ImportError: /home/acme/git/build/perf/python/perf.so: undefined symbol: close_cgroup

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
index 1947b04..e24ffad 100644
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -5,7 +5,7 @@ from distutils.core import setup, Extension
 perf = Extension('perf',
 		  sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c',
 			     'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c',
-			     'util/util.c', 'util/xyarray.c'],
+			     'util/util.c', 'util/xyarray.c', 'util/cgroup.c'],
 		  include_dirs = ['util/include'],
 		  extra_compile_args = ['-fno-strict-aliasing', '-Wno-write-strings'])
 
-- 
cgit v0.10.2


From f0c55bcf4aa41b4b1dbee826513b1acb01bf65e1 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 16 Feb 2011 15:10:01 +0200
Subject: perf: make perf stat print user provided full event names

This patch changes the way perf stat prints event names at the end of a
run. Until now, it was trying to reconstruct the event name from its
encoding. The problem is that it would only print generic events without
their modifiers (u, k, pp).

This patch saves the event name as passed by the user in the evsel
struct and uses it to print the final event name.

This would also work in case perf is linked with a library (such as
libpfm4) which provides full PMU event tables.

$ perf stat -e cycles:u,cycles:k date
Wed Feb 16 14:58:52 CET 2011

 Performance counter stats for 'date':

            568600 cycles:u
           2779715 cycles:k

        0.001908182  seconds time elapsed

Cc: Arun Sharma <arun@sharma-home.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@gmail.com>
LPU-Reference: <4d5bdc64.98a1df0a.7aa3.06c2@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
[ committer note: Fixed a merge problem with 023695d "Add cgroup support" ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index c974e08..63cadaf 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -86,6 +86,7 @@ void perf_evsel__delete(struct perf_evsel *evsel)
 {
 	perf_evsel__exit(evsel);
 	close_cgroup(evsel->cgrp);
+	free(evsel->name);
 	free(evsel);
 }
 
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 1d3d5a3..f6fc8f6 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -37,6 +37,12 @@ struct perf_sample_id {
 	struct perf_evsel	*evsel;
 };
 
+/** struct perf_evsel - event selector
+ *
+ * @name - Can be set to retain the original event name passed by the user,
+ *         so that when showing results in tools such as 'perf stat', we
+ *         show the name used, not some alias.
+ */
 struct perf_evsel {
 	struct list_head	node;
 	struct perf_event_attr	attr;
@@ -45,6 +51,7 @@ struct perf_evsel {
 	struct xyarray		*id;
 	struct perf_counts	*counts;
 	int			idx;
+	char			*name;
 	void			*priv;
 	struct cgroup_sel	*cgrp;
 };
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index cf082da..80a3dd5 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -268,6 +268,9 @@ const char *event_name(struct perf_evsel *evsel)
 	u64 config = evsel->attr.config;
 	int type = evsel->attr.type;
 
+	if (evsel->name)
+		return evsel->name;
+
 	return __event_name(type, config);
 }
 
@@ -782,8 +785,10 @@ int parse_events(const struct option *opt, const char *str, int unset __used)
 	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
 	struct perf_event_attr attr;
 	enum event_result ret;
+	const char *ostr;
 
 	for (;;) {
+		ostr = str;
 		memset(&attr, 0, sizeof(attr));
 		ret = parse_event_symbols(opt, &str, &attr);
 		if (ret == EVT_FAILED)
@@ -798,6 +803,11 @@ int parse_events(const struct option *opt, const char *str, int unset __used)
 			if (evsel == NULL)
 				return -1;
 			perf_evlist__add(evlist, evsel);
+
+			evsel->name = calloc(str - ostr + 1, 1);
+			if (!evsel->name)
+				return -1;
+			strncpy(evsel->name, ostr, str - ostr);
 		}
 
 		if (*str == 0)
-- 
cgit v0.10.2


From da1016df85ed67b6f7dbb765532c54bc35ba08d7 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 16 Feb 2011 23:48:35 +0900
Subject: x86: Use bitmap library functions

Use bitmap_set()/bitmap_clear() to fill/zero a region of a
bitmap instead of doing set_bit()/clear_bit() each bit.

This change has been tested with ioperm() and there's no
change in behavior.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
LKML-Reference: <1297867715-20394-1-git-send-email-akinobu.mita@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec..8c96897 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,22 +14,9 @@
 #include <linux/slab.h>
 #include <linux/thread_info.h>
 #include <linux/syscalls.h>
+#include <linux/bitmap.h>
 #include <asm/syscalls.h>
 
-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
-static void set_bitmap(unsigned long *bitmap, unsigned int base,
-		       unsigned int extent, int new_value)
-{
-	unsigned int i;
-
-	for (i = base; i < base + extent; i++) {
-		if (new_value)
-			__set_bit(i, bitmap);
-		else
-			__clear_bit(i, bitmap);
-	}
-}
-
 /*
  * this changes the io permissions bitmap in the current task.
  */
@@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 	 */
 	tss = &per_cpu(init_tss, get_cpu());
 
-	set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+	if (turn_on)
+		bitmap_clear(t->io_bitmap_ptr, from, num);
+	else
+		bitmap_set(t->io_bitmap_ptr, from, num);
 
 	/*
 	 * Search for a (possibly new) maximum. This is simple and stupid,
-- 
cgit v0.10.2


From 2ca230baeb7c61864cab9b53e37a3da28a2ca7e5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 17 Feb 2011 14:46:37 +0100
Subject: x86-64, NUMA: Don't call __pa() with invalid address in
 numa_reset_distance()

Do not call __pa(numa_distance) if it was not allocated before.
Calling with invalid address triggers VIRTUAL_BUG_ON() in
__phys_addr() if CONFIG_DEBUG_VIRTUAL.

Also reported by Ingo.

 http://thread.gmane.org/gmane.linux.kernel/1101306/focus=1101785

- v2: Change to check existing path as tj requested.
- tj: Description update.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 8ce6177..1bd6de4 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -371,11 +371,13 @@ static void __init numa_reset_distance(void)
 {
 	size_t size;
 
-	size = numa_distance_cnt * sizeof(numa_distance[0]);
-	memblock_x86_free_range(__pa(numa_distance),
-				__pa(numa_distance) + size);
+	if (numa_distance_cnt) {
+		size = numa_distance_cnt * sizeof(numa_distance[0]);
+		memblock_x86_free_range(__pa(numa_distance),
+					__pa(numa_distance) + size);
+		numa_distance_cnt = 0;
+	}
 	numa_distance = NULL;
-	numa_distance_cnt = 0;
 }
 
 /*
-- 
cgit v0.10.2


From 6d496f9f232790d44144f3784856290e0b27b8f3 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 17 Feb 2011 14:53:20 +0100
Subject: x86-64, NUMA: Put dummy_numa_init() in the init section

dummy_numa_init() is used only during system boot.  Put it in .init
like other NUMA init functions.

- tj: Description update.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1bd6de4..f6d85e3 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -899,7 +899,7 @@ static bool __init numa_emulation(void)
 }
 #endif /* CONFIG_NUMA_EMU */
 
-static int dummy_numa_init(void)
+static int __init dummy_numa_init(void)
 {
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
-- 
cgit v0.10.2


From fec9cbd15b9e99bab9bc50f1ed7e20a1087d7c6d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 17 Feb 2011 10:37:23 -0200
Subject: perf hists: Print number of samples, not the period sum

So that we match the header where we state the number of events with the
"Samples" column when using 'perf report -n/--show-nr-samples':

 [root@emilia ~]# perf record -a sleep 1
 [ perf record: Woken up 1 times to write data ]
 [ perf record: Captured and wrote 0.111 MB perf.data (~4860 samples) ]
 [root@emilia ~]# perf report --stdio --show-nr-samples
 # Events: 11  cycles
 #
 # Overhead  Samples        Command       Shared Object                        Symbol
 # ........ ..........  ...........  ..................  ............................
 #
     16.65%          1        sleep  [kernel.kallsyms]   [k] unmap_vmas
     16.10%          1         perf  libpthread-2.12.so  [.] __pthread_cleanup_push_defer
     15.79%          2         perf  [kernel.kallsyms]   [k] format_decode
     12.88%          1  kworker/1:2  [kernel.kallsyms]   [k] cache_reap
     10.69%          1      swapper  [kernel.kallsyms]   [k] _raw_spin_lock
      7.55%          1        sleep  [kernel.kallsyms]   [k] prepare_exec_creds
      6.00%          1         perf  [jbd2]              [k] start_this_handle
      5.29%          1         perf  [kernel.kallsyms]   [k] seq_read
      4.75%          1         perf  [kernel.kallsyms]   [k] get_pid_task
      4.30%          1         perf  [kernel.kallsyms]   [k] _raw_spin_unlock_irqrestore

 #
 # (For a higher level overview, try: perf report --sort comm,dso)
 #
 [root@emilia ~]#

Reported-by: Stephane Eranian <eranian@google.com>
Acked-by: Stephane Eranian <eranian@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 3f43723..da2899e8 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -591,6 +591,7 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
 {
 	struct sort_entry *se;
 	u64 period, total, period_sys, period_us, period_guest_sys, period_guest_us;
+	u64 nr_events;
 	const char *sep = symbol_conf.field_sep;
 	int ret;
 
@@ -599,6 +600,7 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
 
 	if (pair_hists) {
 		period = self->pair ? self->pair->period : 0;
+		nr_events = self->pair ? self->pair->nr_events : 0;
 		total = pair_hists->stats.total_period;
 		period_sys = self->pair ? self->pair->period_sys : 0;
 		period_us = self->pair ? self->pair->period_us : 0;
@@ -606,6 +608,7 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
 		period_guest_us = self->pair ? self->pair->period_guest_us : 0;
 	} else {
 		period = self->period;
+		nr_events = self->nr_events;
 		total = session_total;
 		period_sys = self->period_sys;
 		period_us = self->period_us;
@@ -646,9 +649,9 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
 
 	if (symbol_conf.show_nr_samples) {
 		if (sep)
-			ret += snprintf(s + ret, size - ret, "%c%" PRIu64, *sep, period);
+			ret += snprintf(s + ret, size - ret, "%c%" PRIu64, *sep, nr_events);
 		else
-			ret += snprintf(s + ret, size - ret, "%11" PRIu64, period);
+			ret += snprintf(s + ret, size - ret, "%11" PRIu64, nr_events);
 	}
 
 	if (pair_hists) {
-- 
cgit v0.10.2


From 712a4b6049724278121d56aba683151d86c8c35a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 17 Feb 2011 12:18:42 -0200
Subject: perf record: Delay setting the header writing atexit call

While testing the --filter option I noticed that we were writing lots of
unneeded stuff to the perf.data header when the filter ioctl fails, so
move the atexit(atexit_header) call to after we create the counters
successfully.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a4aaadc..db4cd1e 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -538,11 +538,6 @@ static int __cmd_record(int argc, const char **argv)
 	if (have_tracepoints(&evsel_list->entries))
 		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
 
-	/*
- 	 * perf_session__delete(session) will be called at atexit_header()
-	 */
-	atexit(atexit_header);
-
 	if (forks) {
 		child_pid = fork();
 		if (child_pid < 0) {
@@ -601,6 +596,11 @@ static int __cmd_record(int argc, const char **argv)
 
 	perf_session__set_sample_type(session, sample_type);
 
+	/*
+	 * perf_session__delete(session) will be called at atexit_header()
+	 */
+	atexit(atexit_header);
+
 	if (pipe_output) {
 		err = perf_header__write_pipe(output);
 		if (err < 0)
-- 
cgit v0.10.2


From 74cfc17dc1a69c37ce6c8a76c1ce84bcb796eb0e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 17 Feb 2011 14:40:46 -0200
Subject: perf report: Tell the user when a perf.data file has no samples

[root@emilia ~]# perf report --stdio
The perf.data file has no samples!
[root@emilia ~]#

The TUI shows a popup warning message with the same message.

Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f9a99a1..dddcc7e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -350,6 +350,12 @@ static int __cmd_report(void)
 		perf_session__fprintf_dsos(session, stdout);
 
 	next = rb_first(&session->hists_tree);
+
+	if (next == NULL) {
+		ui__warning("The %s file has no samples!\n", input_name);
+		goto out_delete;
+	}
+
 	while (next) {
 		struct hists *hists;
 
-- 
cgit v0.10.2


From 668b8788f497b2386402daeca583d6300240d41d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 17 Feb 2011 15:38:58 -0200
Subject: perf list: Allow filtering list of events

The man page has the details, here are some examples:

[root@emilia ~]# perf list *fault*  *:*wait*

List of pre-defined events (to be used in -e):
  page-faults OR faults                      [Software event]
  minor-faults                               [Software event]
  major-faults                               [Software event]
  alignment-faults                           [Software event]
  emulation-faults                           [Software event]

  radeon:radeon_fence_wait_begin             [Tracepoint event]
  radeon:radeon_fence_wait_end               [Tracepoint event]
  writeback:wbc_writeback_wait               [Tracepoint event]
  writeback:wbc_balance_dirty_wait           [Tracepoint event]
  writeback:writeback_congestion_wait        [Tracepoint event]
  writeback:writeback_wait_iff_congested     [Tracepoint event]
  sched:sched_wait_task                      [Tracepoint event]
  sched:sched_process_wait                   [Tracepoint event]
  sched:sched_stat_wait                      [Tracepoint event]
  sched:sched_stat_iowait                    [Tracepoint event]
  syscalls:sys_enter_epoll_wait              [Tracepoint event]
  syscalls:sys_exit_epoll_wait               [Tracepoint event]
  syscalls:sys_enter_epoll_pwait             [Tracepoint event]
  syscalls:sys_exit_epoll_pwait              [Tracepoint event]
  syscalls:sys_enter_rt_sigtimedwait         [Tracepoint event]
  syscalls:sys_exit_rt_sigtimedwait          [Tracepoint event]
  syscalls:sys_enter_waitid                  [Tracepoint event]
  syscalls:sys_exit_waitid                   [Tracepoint event]
  syscalls:sys_enter_wait4                   [Tracepoint event]
  syscalls:sys_exit_wait4                    [Tracepoint event]
  syscalls:sys_enter_waitpid                 [Tracepoint event]
  syscalls:sys_exit_waitpid                  [Tracepoint event]
[root@emilia ~]#

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 399751b..7a527f7 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -8,7 +8,7 @@ perf-list - List all symbolic event types
 SYNOPSIS
 --------
 [verse]
-'perf list'
+'perf list' [hw|sw|cache|tracepoint|event_glob]
 
 DESCRIPTION
 -----------
@@ -63,7 +63,26 @@ details. Some of them are referenced in the SEE ALSO section below.
 
 OPTIONS
 -------
-None
+
+Without options all known events will be listed.
+
+To limit the list use:
+
+. 'hw' or 'hardware' to list hardware events such as cache-misses, etc.
+
+. 'sw' or 'software' to list software events such as context switches, etc.
+
+. 'cache' or 'hwcache' to list hardware cache events such as L1-dcache-loads, etc.
+
+. 'tracepoint' to list all tracepoint events, alternatively use
+  'subsys_glob:event_glob' to filter by tracepoint subsystems such as sched,
+  block, etc.
+
+. If none of the above is matched, it will apply the supplied glob to all
+  events, printing the ones that match.
+
+One or more types can be used at the same time, listing the events for the
+types specified.
 
 SEE ALSO
 --------
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index d88c696..6313b6e 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
  * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  */
 #include "builtin.h"
 
@@ -13,9 +14,47 @@
 #include "util/parse-events.h"
 #include "util/cache.h"
 
-int cmd_list(int argc __used, const char **argv __used, const char *prefix __used)
+int cmd_list(int argc, const char **argv, const char *prefix __used)
 {
 	setup_pager();
-	print_events();
+
+	if (argc == 1)
+		print_events(NULL);
+	else {
+		int i;
+
+		for (i = 1; i < argc; ++i) {
+			if (i > 1)
+				putchar('\n');
+			if (strncmp(argv[i], "tracepoint", 10) == 0)
+				print_tracepoint_events(NULL, NULL);
+			else if (strcmp(argv[i], "hw") == 0 ||
+				 strcmp(argv[i], "hardware") == 0)
+				print_events_type(PERF_TYPE_HARDWARE);
+			else if (strcmp(argv[i], "sw") == 0 ||
+				 strcmp(argv[i], "software") == 0)
+				print_events_type(PERF_TYPE_SOFTWARE);
+			else if (strcmp(argv[i], "cache") == 0 ||
+				 strcmp(argv[i], "hwcache") == 0)
+				print_hwcache_events(NULL);
+			else {
+				char *sep = strchr(argv[i], ':'), *s;
+				int sep_idx;
+
+				if (sep == NULL) {
+					print_events(argv[i]);
+					continue;
+				}
+				sep_idx = sep - argv[i];
+				s = strdup(argv[i]);
+				if (s == NULL)
+					return -1;
+
+				s[sep_idx] = '\0';
+				print_tracepoint_events(s, s + sep_idx + 1);
+				free(s);
+			}
+		}
+	}
 	return 0;
 }
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 80a3dd5..54a7e26 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -858,7 +858,7 @@ static const char * const event_type_descriptors[] = {
  * Print the events from <debugfs_mount_point>/tracing/events
  */
 
-static void print_tracepoint_events(void)
+void print_tracepoint_events(const char *subsys_glob, const char *event_glob)
 {
 	DIR *sys_dir, *evt_dir;
 	struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
@@ -873,6 +873,9 @@ static void print_tracepoint_events(void)
 		return;
 
 	for_each_subsystem(sys_dir, sys_dirent, sys_next) {
+		if (subsys_glob != NULL && 
+		    !strglobmatch(sys_dirent.d_name, subsys_glob))
+			continue;
 
 		snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
 			 sys_dirent.d_name);
@@ -881,6 +884,10 @@ static void print_tracepoint_events(void)
 			continue;
 
 		for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
+			if (event_glob != NULL && 
+			    !strglobmatch(evt_dirent.d_name, event_glob))
+				continue;
+
 			snprintf(evt_path, MAXPATHLEN, "%s:%s",
 				 sys_dirent.d_name, evt_dirent.d_name);
 			printf("  %-42s [%s]\n", evt_path,
@@ -932,13 +939,61 @@ int is_valid_tracepoint(const char *event_string)
 	return 0;
 }
 
+void print_events_type(u8 type)
+{
+	struct event_symbol *syms = event_symbols;
+	unsigned int i;
+	char name[64];
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
+		if (type != syms->type)
+			continue;
+
+		if (strlen(syms->alias))
+			snprintf(name, sizeof(name),  "%s OR %s",
+				 syms->symbol, syms->alias);
+		else
+			snprintf(name, sizeof(name), "%s", syms->symbol);
+
+		printf("  %-42s [%s]\n", name,
+			event_type_descriptors[type]);
+	}
+}
+
+int print_hwcache_events(const char *event_glob)
+{
+	unsigned int type, op, i, printed = 0;
+
+	for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
+		for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
+			/* skip invalid cache type */
+			if (!is_cache_op_valid(type, op))
+				continue;
+
+			for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
+				char *name = event_cache_name(type, op, i);
+
+				if (event_glob != NULL && 
+				    !strglobmatch(name, event_glob))
+					continue;
+
+				printf("  %-42s [%s]\n", name,
+					event_type_descriptors[PERF_TYPE_HW_CACHE]);
+				++printed;
+			}
+		}
+	}
+
+	return printed;
+}
+
 /*
  * Print the help text for the event symbols:
  */
-void print_events(void)
+void print_events(const char *event_glob)
 {
 	struct event_symbol *syms = event_symbols;
-	unsigned int i, type, op, prev_type = -1;
+	unsigned int i, type, prev_type = -1, printed = 0, ntypes_printed = 0;
 	char name[40];
 
 	printf("\n");
@@ -947,8 +1002,16 @@ void print_events(void)
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
 		type = syms->type;
 
-		if (type != prev_type)
+		if (type != prev_type && printed) {
 			printf("\n");
+			printed = 0;
+			ntypes_printed++;
+		}
+
+		if (event_glob != NULL && 
+		    !(strglobmatch(syms->symbol, event_glob) ||
+		      (syms->alias && strglobmatch(syms->alias, event_glob))))
+			continue;
 
 		if (strlen(syms->alias))
 			sprintf(name, "%s OR %s", syms->symbol, syms->alias);
@@ -958,22 +1021,17 @@ void print_events(void)
 			event_type_descriptors[type]);
 
 		prev_type = type;
+		++printed;
 	}
 
-	printf("\n");
-	for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
-		for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
-			/* skip invalid cache type */
-			if (!is_cache_op_valid(type, op))
-				continue;
-
-			for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
-				printf("  %-42s [%s]\n",
-					event_cache_name(type, op, i),
-					event_type_descriptors[PERF_TYPE_HW_CACHE]);
-			}
-		}
+	if (ntypes_printed) {
+		printed = 0;
+		printf("\n");
 	}
+	print_hwcache_events(event_glob);
+
+	if (event_glob != NULL)
+		return;
 
 	printf("\n");
 	printf("  %-42s [%s]\n",
@@ -986,7 +1044,7 @@ void print_events(void)
 			event_type_descriptors[PERF_TYPE_BREAKPOINT]);
 	printf("\n");
 
-	print_tracepoint_events();
+	print_tracepoint_events(NULL, NULL);
 
 	exit(129);
 }
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index cf7e94a..212f88e 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -28,7 +28,10 @@ extern int parse_filter(const struct option *opt, const char *str, int unset);
 
 #define EVENTS_HELP_MAX (128*1024)
 
-extern void print_events(void);
+void print_events(const char *event_glob);
+void print_events_type(u8 type);
+void print_tracepoint_events(const char *subsys_glob, const char *event_glob);
+int print_hwcache_events(const char *event_glob);
 extern int is_valid_tracepoint(const char *event_string);
 
 extern char debugfs_path[];
-- 
cgit v0.10.2


From db2e2e6ee9ee9ce93b04c6975fdfef304771d6ad Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 24 Jan 2011 15:43:03 +0100
Subject: xen-pcifront: don't use flush_scheduled_work()

flush_scheduled_work() is scheduled for deprecation.  Cancel ->op_work
directly instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ryan Wilson <hap9@epoch.ncsc.mil>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 3a5a6fc..030ce37 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -733,8 +733,7 @@ static void free_pdev(struct pcifront_device *pdev)
 
 	pcifront_free_roots(pdev);
 
-	/*For PCIE_AER error handling job*/
-	flush_scheduled_work();
+	cancel_work_sync(&pdev->op_work);
 
 	if (pdev->irq >= 0)
 		unbind_from_irqhandler(pdev->irq, pdev);
-- 
cgit v0.10.2


From e9345aab675382176740bc8a2c6d3caf1510e46d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 18 Feb 2011 08:09:49 +0100
Subject: Revert "tracing: Add unstable sched clock note to the warning"

This reverts commit 5e38ca8f3ea423442eaafe1b7e206084aa38120a.

Breaks the build of several !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
architectures.

Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Message-ID: <20110217171823.GB17058@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7739893..bd1c35a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2163,14 +2163,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 		delta = diff;
 		if (unlikely(test_time_stamp(delta))) {
 			WARN_ONCE(delta > (1ULL << 59),
-				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
 				  (unsigned long long)delta,
 				  (unsigned long long)ts,
-				  (unsigned long long)cpu_buffer->write_stamp,
-				  sched_clock_stable ? "" :
-				  "If you just came from a suspend/resume,\n"
-				  "please switch to the trace global clock:\n"
-				  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
+				  (unsigned long long)cpu_buffer->write_stamp);
 			add_timestamp = 1;
 		}
 	}
-- 
cgit v0.10.2


From bb3e6251a69e67d7620373ee18e35b404964273e Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Thu, 17 Feb 2011 15:47:37 +0000
Subject: x86: Don't call dump_stack() from
 arch_trigger_all_cpu_backtrace_handler()

show_regs() already prints two(!) stack traces, no need for a third one.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4D5D512902000078000326EE@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 79fd43c..c4e557a 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -83,7 +83,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
 		arch_spin_lock(&lock);
 		printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
 		show_regs(regs);
-		dump_stack();
 		arch_spin_unlock(&lock);
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 		return NOTIFY_STOP;
-- 
cgit v0.10.2


From fd8fa4d3ddc4cc04ec8097e632b995d535c52beb Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Thu, 17 Feb 2011 15:56:58 +0000
Subject: x86: Combine printk()s in show_regs_common()

Printing a single character alone when there's an immediately
following printk() is pretty pointless (and wasteful).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4D5D535A0200007800032730@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff45541..99fa3ad 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -110,12 +110,9 @@ void show_regs_common(void)
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
-	printk(KERN_CONT " ");
-	printk(KERN_CONT "%s %s", vendor, product);
-	if (board) {
-		printk(KERN_CONT "/");
-		printk(KERN_CONT "%s", board);
-	}
+	printk(KERN_CONT " %s %s", vendor, product);
+	if (board)
+		printk(KERN_CONT "/%s", board);
 	printk(KERN_CONT "\n");
 }
 
-- 
cgit v0.10.2


From 02ca752e4181e219e243cd61a60dd1da47251f11 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Thu, 17 Feb 2011 15:51:40 +0000
Subject: x86: Remove die_nmi()

With no caller left, the function and the DIE_NMIWATCHDOG
enumerator can both go away.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Don Zickus <dzickus@redhat.com>
LKML-Reference: <4D5D521C0200007800032702@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index ca242d3..518bbbb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -13,7 +13,6 @@ enum die_val {
 	DIE_PANIC,
 	DIE_NMI,
 	DIE_DIE,
-	DIE_NMIWATCHDOG,
 	DIE_KERNELDEBUG,
 	DIE_TRAP,
 	DIE_GPF,
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c76f5b9..07f4601 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -7,7 +7,6 @@
 
 #ifdef CONFIG_X86_LOCAL_APIC
 
-extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int reserve_perfctr_nmi(unsigned int);
 extern void release_perfctr_nmi(unsigned int);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index df20723..220a1c1 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -320,31 +320,6 @@ void die(const char *str, struct pt_regs *regs, long err)
 	oops_end(flags, regs, sig);
 }
 
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-	unsigned long flags;
-
-	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-		return;
-
-	/*
-	 * We are in trouble anyway, lets at least try
-	 * to get a message out.
-	 */
-	flags = oops_begin();
-	printk(KERN_EMERG "%s", str);
-	printk(" on CPU%d, ip %08lx, registers:\n",
-		smp_processor_id(), regs->ip);
-	show_registers(regs);
-	oops_end(flags, regs, 0);
-	if (do_panic || panic_on_oops)
-		panic("Non maskable interrupt");
-	nmi_exit();
-	local_irq_enable();
-	do_exit(SIGBUS);
-}
-
 static int __init oops_setup(char *s)
 {
 	if (!s)
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index a413000..7c64c42 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -533,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 		}
 		return NOTIFY_DONE;
 
-	case DIE_NMIWATCHDOG:
-		if (atomic_read(&kgdb_active) != -1) {
-			/* KGDB CPU roundup: */
-			kgdb_nmicallback(raw_smp_processor_id(), regs);
-			return NOTIFY_STOP;
-		}
-		/* Enter debugger: */
-		break;
-
 	case DIE_DEBUG:
 		if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
 			if (user_mode(regs))
-- 
cgit v0.10.2


From 58bff947e2d164c7e5cbf7f485e4b3d4884befeb Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Thu, 17 Feb 2011 15:54:26 +0000
Subject: x86: Eliminate pointless adjustment attempts in fixup_irqs()

Not only when an IRQ's affinity equals cpu_online_mask is there
no need to actually try to adjust the affinity, but also when
it's a subset thereof. This particularly avoids adjustment
attempts during system shutdown to any IRQs bound to CPU#0.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Gary Hade <garyhade@us.ibm.com>
LKML-Reference: <4D5D52C2020000780003272C@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 387b6a0..78793ef 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -310,7 +310,7 @@ void fixup_irqs(void)
 		data = &desc->irq_data;
 		affinity = data->affinity;
 		if (!irq_has_action(irq) ||
-		    cpumask_equal(affinity, cpu_online_mask)) {
+		    cpumask_subset(affinity, cpu_online_mask)) {
 			raw_spin_unlock(&desc->lock);
 			continue;
 		}
-- 
cgit v0.10.2


From 006cdc32618e09ffe228a7a86af044f3cc0dd714 Mon Sep 17 00:00:00 2001
From: Michael Witten <mfwitten@gmail.com>
Date: Wed, 2 Feb 2011 13:01:41 -0600
Subject: perf tools: Makefile: Remove vestigial git-specific cruft

This commit squashes several commits that remove:

 NO_SYMLINK_HEAD
 NO_SVN_TESTS
 NO_FAST_WORKING_DIRECTORY
 USE_STDEV
 SHA1/SSL cruft
 makefile rules

Signed-off-by: Michael Witten <mfwitten@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 7c75f1d..544367c 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -21,9 +21,6 @@ endif
 # Define FREAD_READS_DIRECTORIES if your are on a system which succeeds
 # when attempting to read from an fopen'ed directory.
 #
-# Define NO_OPENSSL environment variable if you do not have OpenSSL.
-# This also implies MOZILLA_SHA1.
-#
 # Define CURLDIR=/foo/bar if your curl header and library files are in
 # /foo/bar/include and /foo/bar/lib directories.
 #
@@ -56,13 +53,6 @@ endif
 #
 # Define NO_SYS_SELECT_H if you don't have sys/select.h.
 #
-# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
-# Enable it on Windows.  By default, symrefs are still used.
-#
-# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
-# tests.  These tests take up a significant amount of the total test time
-# but are not needed unless you plan to talk to SVN repos.
-#
 # Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
 # installed in /sw, but don't want PERF to link against any libraries
 # installed there.  If defined you may specify your own (or Fink's)
@@ -75,19 +65,6 @@ endif
 # specify your own (or DarwinPort's) include directories and
 # library directories by defining CFLAGS and LDFLAGS appropriately.
 #
-# Define PPC_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine optimized for PowerPC.
-#
-# Define ARM_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine optimized for ARM.
-#
-# Define MOZILLA_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
-# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
-# choice) has very fast version optimized for i586.
-#
-# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
-#
 # Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
 #
 # Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
@@ -100,9 +77,6 @@ endif
 # Define NO_PREAD if you have a problem with pread() system call (e.g.
 # cygwin.dll before v1.5.22).
 #
-# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
-# generally faster on your platform than accessing the working directory.
-#
 # Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
 # the executable mode bit, but doesn't really do so.
 #
@@ -134,9 +108,6 @@ endif
 # Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
 # available.  This automatically turns USE_NSEC off.
 #
-# Define USE_STDEV below if you want perf to care about the underlying device
-# change being considered an inode change from the update-index perspective.
-#
 # Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
 # field that counts the on-disk footprint in 512-byte blocks.
 #
@@ -771,9 +742,6 @@ ifdef FREAD_READS_DIRECTORIES
 	COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
 	COMPAT_OBJS += $(OUTPUT)compat/fopen.o
 endif
-ifdef NO_SYMLINK_HEAD
-	BASIC_CFLAGS += -DNO_SYMLINK_HEAD
-endif
 ifdef NO_STRCASESTR
 	COMPAT_CFLAGS += -DNO_STRCASESTR
 	COMPAT_OBJS += $(OUTPUT)compat/strcasestr.o
@@ -813,9 +781,6 @@ ifdef NO_PREAD
 	COMPAT_CFLAGS += -DNO_PREAD
 	COMPAT_OBJS += $(OUTPUT)compat/pread.o
 endif
-ifdef NO_FAST_WORKING_DIRECTORY
-	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
-endif
 ifdef NO_TRUSTABLE_FILEMODE
 	BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
 endif
@@ -851,23 +816,6 @@ ifdef NO_DEFLATE_BOUND
 	BASIC_CFLAGS += -DNO_DEFLATE_BOUND
 endif
 
-ifdef PPC_SHA1
-	SHA1_HEADER = "ppc/sha1.h"
-	LIB_OBJS += $(OUTPUT)ppc/sha1.o ppc/sha1ppc.o
-else
-ifdef ARM_SHA1
-	SHA1_HEADER = "arm/sha1.h"
-	LIB_OBJS += $(OUTPUT)arm/sha1.o $(OUTPUT)arm/sha1_arm.o
-else
-ifdef MOZILLA_SHA1
-	SHA1_HEADER = "mozilla-sha1/sha1.h"
-	LIB_OBJS += $(OUTPUT)mozilla-sha1/sha1.o
-else
-	SHA1_HEADER = <openssl/sha.h>
-	EXTLIBS += $(LIB_4_CRYPTO)
-endif
-endif
-endif
 ifdef NO_PERL_MAKEMAKER
 	export NO_PERL_MAKEMAKER
 endif
@@ -930,7 +878,6 @@ endif
 
 # Shell quote (do not use $(call) to accommodate ancient setups);
 
-SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
 ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
 
 DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
@@ -948,8 +895,7 @@ PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
 
 LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS)
 
-BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
-	$(COMPAT_CFLAGS)
+BASIC_CFLAGS += $(COMPAT_CFLAGS)
 LIB_OBJS += $(COMPAT_OBJS)
 
 ALL_CFLAGS += $(BASIC_CFLAGS)
@@ -1048,9 +994,6 @@ $(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS
 		'-DPREFIX="$(prefix_SQ)"' \
 		$<
 
-$(OUTPUT)builtin-init-db.o: builtin-init-db.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
-
 $(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
@@ -1089,7 +1032,6 @@ $(OUTPUT)perf-%$X: %.o $(PERFLIBS)
 
 $(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
 $(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
-builtin-revert.o wt-status.o: wt-status.h
 
 # we compile into subdirectories. if the target directory is not the source directory, they might not exists. So
 # we depend the various files onto their directories.
@@ -1192,8 +1134,6 @@ all:: $(TEST_PROGRAMS)
 # However, the environment gets quite big, and some programs have problems
 # with that.
 
-export NO_SVN_TESTS
-
 check: $(OUTPUT)common-cmds.h
 	if sparse; \
 	then \
-- 
cgit v0.10.2


From 8796cb9d7dc028945af4b2ea858ae8f8f2ecbe8c Mon Sep 17 00:00:00 2001
From: Michael Witten <mfwitten@gmail.com>
Date: Wed, 2 Feb 2011 11:57:41 -0600
Subject: perf tools: Makefile: Remove platform-specific cruft

While it makes sense that this tool could be used on
other platforms at least to parse data, there doesn't
appear to be any real support for such usage.

This commit squashes several commits that remove:

 SNPRINTF_RETURNS_BOGUS
 FREAD_READS_DIRECTORIES
 NO_D_{INO,TYPE}_IN_DIRENT
 NO_STRCASESTR
 NO_MEMMEM
 NO_STRTOUMAX and NO_STRTOULL
 NO_SETENV
 NO_UNSETENV
 NO_MKDTEMP
 NEEDS_LIBICONV
 NEEDS_SOCKET
 NO_MMAP
 NO_PTHREADS
 NO_PREAD
 NO_TRUSTABLE_FILEMODE
 NO_IPV6 and NO_SOCKADDR_STORAGE
 NO_ICONV and OLD_ICONV
 NO_NSEC, USE_NSEC, and USE_ST_TIMESPEC
 NO_ST_BLOCKS_IN_STRUCT_STAT
 NO_FINK and NO_DARWIN_PORTS
 NO_SYS_SELECT_H
 NO_HSTRERROR
 DIR_HAS_BSD_GROUP_SEMANTICS and FORCE_DIR_SET_GID
 NEEDS_NSL, NO_UINTMAX_T, NO_INET_{N,P}TON
 COMPAT_{CFLAGS,OBJS}
 Executable extension `X'

Signed-off-by: Michael Witten <mfwitten@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 544367c..53c1e93 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -14,103 +14,23 @@ endif
 # Define V=1 to have a more verbose compile.
 # Define V=2 to have an even more verbose compile.
 #
-# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf()
-# or vsnprintf() return -1 instead of number of characters which would
-# have been written to the final string if enough space had been available.
-#
-# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds
-# when attempting to read from an fopen'ed directory.
-#
 # Define CURLDIR=/foo/bar if your curl header and library files are in
 # /foo/bar/include and /foo/bar/lib directories.
 #
 # Define EXPATDIR=/foo/bar if your expat header and library files are in
 # /foo/bar/include and /foo/bar/lib directories.
 #
-# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
-#
-# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
-# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
-#
 # Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
 # do not support the 'size specifiers' introduced by C99, namely ll, hh,
 # j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
 # some C compilers supported these specifiers prior to C99 as an extension.
 #
-# Define NO_STRCASESTR if you don't have strcasestr.
-#
-# Define NO_MEMMEM if you don't have memmem.
-#
-# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
-# If your compiler also does not support long long or does not have
-# strtoull, define NO_STRTOULL.
-#
-# Define NO_SETENV if you don't have setenv in the C library.
-#
-# Define NO_UNSETENV if you don't have unsetenv in the C library.
-#
-# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
-#
-# Define NO_SYS_SELECT_H if you don't have sys/select.h.
-#
-# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
-# installed in /sw, but don't want PERF to link against any libraries
-# installed there.  If defined you may specify your own (or Fink's)
-# include directories and library directories by defining CFLAGS
-# and LDFLAGS appropriately.
-#
-# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
-# have DarwinPorts installed in /opt/local, but don't want PERF to
-# link against any libraries installed there.  If defined you may
-# specify your own (or DarwinPort's) include directories and
-# library directories by defining CFLAGS and LDFLAGS appropriately.
-#
-# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
-#
-# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
-# Patrick Mauritz).
-#
-# Define NO_MMAP if you want to avoid mmap.
-#
-# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
-#
-# Define NO_PREAD if you have a problem with pread() system call (e.g.
-# cygwin.dll before v1.5.22).
-#
-# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
-# the executable mode bit, but doesn't really do so.
-#
-# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
-#
-# Define NO_SOCKADDR_STORAGE if your platform does not have struct
-# sockaddr_storage.
-#
-# Define NO_ICONV if your libc does not properly support iconv.
-#
-# Define OLD_ICONV if your library has an old iconv(), where the second
-# (input buffer pointer) parameter is declared with type (const char **).
-#
 # Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
 #
 # Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
 # that tells runtime paths to dynamic libraries;
 # "-Wl,-rpath=/path/lib" is used instead.
 #
-# Define USE_NSEC below if you want perf to care about sub-second file mtimes
-# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
-# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
-# randomly break unless your underlying filesystem supports those sub-second
-# times (my ext3 doesn't).
-#
-# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
-# "st_ctim"
-#
-# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
-# available.  This automatically turns USE_NSEC off.
-#
-# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
-# field that counts the on-disk footprint in 512-byte blocks.
-#
 # Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
 #
 # Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
@@ -282,8 +202,6 @@ BASIC_LDFLAGS =
 # Guard against environment variables
 BUILTIN_OBJS =
 BUILT_INS =
-COMPAT_CFLAGS =
-COMPAT_OBJS =
 LIB_H =
 LIB_OBJS =
 PYRF_OBJS =
@@ -329,7 +247,7 @@ LANG_BINDINGS =
 ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
 
 # what 'all' will build but not install in perfexecdir
-OTHER_PROGRAMS = $(OUTPUT)perf$X
+OTHER_PROGRAMS = $(OUTPUT)perf
 
 # Set paths to tools early so that they can be used for version tests.
 ifndef SHELL_PATH
@@ -538,22 +456,6 @@ endif # NO_DWARF
 
 -include arch/$(ARCH)/Makefile
 
-ifeq ($(uname_S),Darwin)
-	ifndef NO_FINK
-		ifeq ($(shell test -d /sw/lib && echo y),y)
-			BASIC_CFLAGS += -I/sw/include
-			BASIC_LDFLAGS += -L/sw/lib
-		endif
-	endif
-	ifndef NO_DARWIN_PORTS
-		ifeq ($(shell test -d /opt/local/lib && echo y),y)
-			BASIC_CFLAGS += -I/opt/local/include
-			BASIC_LDFLAGS += -L/opt/local/lib
-		endif
-	endif
-	PTHREAD_LIBS =
-endif
-
 ifneq ($(OUTPUT),)
 	BASIC_CFLAGS += -I$(OUTPUT)
 endif
@@ -707,110 +609,9 @@ ifndef CC_LD_DYNPATH
 	endif
 endif
 
-ifdef NEEDS_SOCKET
-	EXTLIBS += -lsocket
-endif
-ifdef NEEDS_NSL
-	EXTLIBS += -lnsl
-endif
-ifdef NO_D_TYPE_IN_DIRENT
-	BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
-endif
-ifdef NO_D_INO_IN_DIRENT
-	BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
-endif
-ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
-	BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
-endif
-ifdef USE_NSEC
-	BASIC_CFLAGS += -DUSE_NSEC
-endif
-ifdef USE_ST_TIMESPEC
-	BASIC_CFLAGS += -DUSE_ST_TIMESPEC
-endif
-ifdef NO_NSEC
-	BASIC_CFLAGS += -DNO_NSEC
-endif
 ifdef NO_C99_FORMAT
 	BASIC_CFLAGS += -DNO_C99_FORMAT
 endif
-ifdef SNPRINTF_RETURNS_BOGUS
-	COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
-	COMPAT_OBJS += $(OUTPUT)compat/snprintf.o
-endif
-ifdef FREAD_READS_DIRECTORIES
-	COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
-	COMPAT_OBJS += $(OUTPUT)compat/fopen.o
-endif
-ifdef NO_STRCASESTR
-	COMPAT_CFLAGS += -DNO_STRCASESTR
-	COMPAT_OBJS += $(OUTPUT)compat/strcasestr.o
-endif
-ifdef NO_STRTOUMAX
-	COMPAT_CFLAGS += -DNO_STRTOUMAX
-	COMPAT_OBJS += $(OUTPUT)compat/strtoumax.o
-endif
-ifdef NO_STRTOULL
-	COMPAT_CFLAGS += -DNO_STRTOULL
-endif
-ifdef NO_SETENV
-	COMPAT_CFLAGS += -DNO_SETENV
-	COMPAT_OBJS += $(OUTPUT)compat/setenv.o
-endif
-ifdef NO_MKDTEMP
-	COMPAT_CFLAGS += -DNO_MKDTEMP
-	COMPAT_OBJS += $(OUTPUT)compat/mkdtemp.o
-endif
-ifdef NO_UNSETENV
-	COMPAT_CFLAGS += -DNO_UNSETENV
-	COMPAT_OBJS += $(OUTPUT)compat/unsetenv.o
-endif
-ifdef NO_SYS_SELECT_H
-	BASIC_CFLAGS += -DNO_SYS_SELECT_H
-endif
-ifdef NO_MMAP
-	COMPAT_CFLAGS += -DNO_MMAP
-	COMPAT_OBJS += $(OUTPUT)compat/mmap.o
-else
-	ifdef USE_WIN32_MMAP
-		COMPAT_CFLAGS += -DUSE_WIN32_MMAP
-		COMPAT_OBJS += $(OUTPUT)compat/win32mmap.o
-	endif
-endif
-ifdef NO_PREAD
-	COMPAT_CFLAGS += -DNO_PREAD
-	COMPAT_OBJS += $(OUTPUT)compat/pread.o
-endif
-ifdef NO_TRUSTABLE_FILEMODE
-	BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
-endif
-ifdef NO_IPV6
-	BASIC_CFLAGS += -DNO_IPV6
-endif
-ifdef NO_UINTMAX_T
-	BASIC_CFLAGS += -Duintmax_t=uint32_t
-endif
-ifdef NO_SOCKADDR_STORAGE
-ifdef NO_IPV6
-	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
-else
-	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
-endif
-endif
-ifdef NO_INET_NTOP
-	LIB_OBJS += $(OUTPUT)compat/inet_ntop.o
-endif
-ifdef NO_INET_PTON
-	LIB_OBJS += $(OUTPUT)compat/inet_pton.o
-endif
-
-ifdef NO_ICONV
-	BASIC_CFLAGS += -DNO_ICONV
-endif
-
-ifdef OLD_ICONV
-	BASIC_CFLAGS += -DOLD_ICONV
-endif
 
 ifdef NO_DEFLATE_BOUND
 	BASIC_CFLAGS += -DNO_DEFLATE_BOUND
@@ -819,14 +620,6 @@ endif
 ifdef NO_PERL_MAKEMAKER
 	export NO_PERL_MAKEMAKER
 endif
-ifdef NO_HSTRERROR
-	COMPAT_CFLAGS += -DNO_HSTRERROR
-	COMPAT_OBJS += $(OUTPUT)compat/hstrerror.o
-endif
-ifdef NO_MEMMEM
-	COMPAT_CFLAGS += -DNO_MEMMEM
-	COMPAT_OBJS += $(OUTPUT)compat/memmem.o
-endif
 ifdef INTERNAL_QSORT
 	COMPAT_CFLAGS += -DINTERNAL_QSORT
 	COMPAT_OBJS += $(OUTPUT)compat/qsort.o
@@ -835,9 +628,6 @@ ifdef RUNTIME_PREFIX
 	COMPAT_CFLAGS += -DRUNTIME_PREFIX
 endif
 
-ifdef DIR_HAS_BSD_GROUP_SEMANTICS
-	COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
-endif
 ifdef NO_EXTERNAL_GREP
 	BASIC_CFLAGS += -DNO_EXTERNAL_GREP
 endif
@@ -895,9 +685,6 @@ PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
 
 LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS)
 
-BASIC_CFLAGS += $(COMPAT_CFLAGS)
-LIB_OBJS += $(COMPAT_OBJS)
-
 ALL_CFLAGS += $(BASIC_CFLAGS)
 ALL_CFLAGS += $(ARCH_CFLAGS)
 ALL_LDFLAGS += $(BASIC_LDFLAGS)
@@ -910,9 +697,6 @@ export TAR INSTALL DESTDIR SHELL_PATH
 SHELL = $(SHELL_PATH)
 
 all:: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS
-ifneq (,$X)
-	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
-endif
 
 all::
 
@@ -921,15 +705,15 @@ please_set_SHELL_PATH_to_a_more_modern_shell:
 
 shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
 
-strip: $(PROGRAMS) $(OUTPUT)perf$X
-	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf$X
+strip: $(PROGRAMS) $(OUTPUT)perf
+	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf
 
 $(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
 		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
 		$(ALL_CFLAGS) -c $(filter %.c,$^) -o $@
 
-$(OUTPUT)perf$X: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
+$(OUTPUT)perf: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) $(ALL_LDFLAGS) $(OUTPUT)perf.o \
                $(BUILTIN_OBJS) $(LIBS) -o $@
 
@@ -1027,11 +811,11 @@ $(OUTPUT)util/scripting-engines/trace-event-python.o: util/scripting-engines/tra
 $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
 
-$(OUTPUT)perf-%$X: %.o $(PERFLIBS)
+$(OUTPUT)perf-%: %.o $(PERFLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
 
 $(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
-$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+$(patsubst perf-%,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
 
 # we compile into subdirectories. if the target directory is not the source directory, they might not exists. So
 # we depend the various files onto their directories.
@@ -1168,7 +952,7 @@ export perfexec_instdir
 
 install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
-	$(INSTALL) $(OUTPUT)perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
 	$(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
@@ -1267,7 +1051,7 @@ distclean: clean
 
 clean:
 	$(RM) $(OUTPUT){*.o,*/*.o,*/*/*.o,*/*/*/*.o,$(LIB_FILE),perf-archive}
-	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
+	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf
 	$(RM) $(TEST_PROGRAMS)
 	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope*
 	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index e833f26..fc78428 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -70,9 +70,7 @@
 #include <sys/poll.h>
 #include <sys/socket.h>
 #include <sys/ioctl.h>
-#ifndef NO_SYS_SELECT_H
 #include <sys/select.h>
-#endif
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <arpa/inet.h>
@@ -83,10 +81,6 @@
 #include "types.h"
 #include <sys/ttydefaults.h>
 
-#ifndef NO_ICONV
-#include <iconv.h>
-#endif
-
 extern const char *graph_line;
 extern const char *graph_dotted_line;
 extern char buildid_dir[];
@@ -236,26 +230,6 @@ static inline int sane_case(int x, int high)
 	return x;
 }
 
-#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
-# define FORCE_DIR_SET_GID S_ISGID
-#else
-# define FORCE_DIR_SET_GID 0
-#endif
-
-#ifdef NO_NSEC
-#undef USE_NSEC
-#define ST_CTIME_NSEC(st) 0
-#define ST_MTIME_NSEC(st) 0
-#else
-#ifdef USE_ST_TIMESPEC
-#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
-#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
-#else
-#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
-#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
-#endif
-#endif
-
 int mkdir_p(char *path, mode_t mode);
 int copyfile(const char *from, const char *to);
 
-- 
cgit v0.10.2


From 0a54fb63600b745e060d24879ed5194382a466c5 Mon Sep 17 00:00:00 2001
From: Michael Witten <mfwitten@gmail.com>
Date: Wed, 2 Feb 2011 12:04:27 -0600
Subject: perf tools: Makefile: Remove tool-specific cruft

This commit squashes several commits that remove:

 NO_C99_FORMAT
 CURLDIR and EXPATDIR
 NO_DEFLATE_BOUND
 CC_LD_DYNPATH and NO_R_TO_GCC_LINKER
 NO_PERL_MAKEMAKER
 INTERNAL_QSORT
 NO_EXTERNAL_GREP
 NO_PERL
 SCRIPT_PERL
 PERL_PATH_SQ

Signed-off-by: Michael Witten <mfwitten@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 53c1e93..fde196f 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -14,40 +14,10 @@ endif
 # Define V=1 to have a more verbose compile.
 # Define V=2 to have an even more verbose compile.
 #
-# Define CURLDIR=/foo/bar if your curl header and library files are in
-# /foo/bar/include and /foo/bar/lib directories.
-#
-# Define EXPATDIR=/foo/bar if your expat header and library files are in
-# /foo/bar/include and /foo/bar/lib directories.
-#
-# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
-# do not support the 'size specifiers' introduced by C99, namely ll, hh,
-# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
-# some C compilers supported these specifiers prior to C99 as an extension.
-#
-# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
-#
-# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
-# that tells runtime paths to dynamic libraries;
-# "-Wl,-rpath=/path/lib" is used instead.
-#
 # Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
 #
 # Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
 #
-# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
-# MakeMaker (e.g. using ActiveState under Cygwin).
-#
-# Define NO_PERL if you do not want Perl scripts or libraries at all.
-#
-# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
-# is a simplified version of the merge sort used in glibc. This is
-# recommended if Git triggers O(n^2) behavior in your platform's qsort().
-#
-# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
-# your external grep (e.g., if your system lacks grep, if its grep is
-# broken, or spawning external process is slower than built-in grep perf has).
-#
 # Define LDFLAGS=-static to build a static binary.
 #
 # Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
@@ -205,7 +175,6 @@ BUILT_INS =
 LIB_H =
 LIB_OBJS =
 PYRF_OBJS =
-SCRIPT_PERL =
 SCRIPT_SH =
 TEST_PROGRAMS =
 
@@ -221,10 +190,7 @@ $(OUTPUT)python/perf.so: $(PYRF_OBJS)
 # No Perl scripts right now:
 #
 
-# SCRIPT_PERL += perf-add--interactive.perl
-
-SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
-	  $(patsubst %.perl,%,$(SCRIPT_PERL))
+SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH))
 
 # Empty...
 EXTRA_PROGRAMS =
@@ -599,43 +565,10 @@ else
 	endif
 endif
 
-ifndef CC_LD_DYNPATH
-	ifdef NO_R_TO_GCC_LINKER
-		# Some gcc does not accept and pass -R to the linker to specify
-		# the runtime dynamic library path.
-		CC_LD_DYNPATH = -Wl,-rpath,
-	else
-		CC_LD_DYNPATH = -R
-	endif
-endif
-
-ifdef NO_C99_FORMAT
-	BASIC_CFLAGS += -DNO_C99_FORMAT
-endif
-
-ifdef NO_DEFLATE_BOUND
-	BASIC_CFLAGS += -DNO_DEFLATE_BOUND
-endif
-
-ifdef NO_PERL_MAKEMAKER
-	export NO_PERL_MAKEMAKER
-endif
-ifdef INTERNAL_QSORT
-	COMPAT_CFLAGS += -DINTERNAL_QSORT
-	COMPAT_OBJS += $(OUTPUT)compat/qsort.o
-endif
 ifdef RUNTIME_PREFIX
 	COMPAT_CFLAGS += -DRUNTIME_PREFIX
 endif
 
-ifdef NO_EXTERNAL_GREP
-	BASIC_CFLAGS += -DNO_EXTERNAL_GREP
-endif
-
-ifeq ($(PERL_PATH),)
-NO_PERL=NoThanks
-endif
-
 QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
 QUIET_SUBDIR1  =
 
@@ -681,7 +614,6 @@ htmldir_SQ = $(subst ','\'',$(htmldir))
 prefix_SQ = $(subst ','\'',$(prefix))
 
 SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
-PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
 
 LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS)
 
@@ -744,7 +676,6 @@ $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
 	$(QUIET_GEN)$(RM) $(OUTPUT)$@ $(OUTPUT)$@+ && \
 	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
 	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
-	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
 	    -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
 	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
 	    $@.sh > $(OUTPUT)$@+ && \
@@ -761,7 +692,6 @@ configure: configure.ac
 # These can record PERF_VERSION
 $(OUTPUT)perf.o perf.spec \
 	$(patsubst %.sh,%,$(SCRIPT_SH)) \
-	$(patsubst %.perl,%,$(SCRIPT_PERL)) \
 	: $(OUTPUT)PERF-VERSION-FILE
 
 $(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS
@@ -903,7 +833,6 @@ $(OUTPUT)PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
 	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
 	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
 	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
-	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
 
 ### Testing rules
 
-- 
cgit v0.10.2


From a3d1ee10d1bf4520af3d44c1aa6cd46956ec4fd7 Mon Sep 17 00:00:00 2001
From: Michael Witten <mfwitten@gmail.com>
Date: Wed, 2 Feb 2011 14:22:08 -0600
Subject: perf tools: Makefile: Remove various and sundry cruft

This commit squashes several commits that remove:

 unnecessary uname calls
 `sh -c'
 BUILT_INS and QUIET_BUILT_IN

    They have no effect, and the `fixup-builtins' and `check-builtins.sh'
    scripts don't even exist.

 RUNTIME_PREFIX

    It's currently never anything but unset, and it's apparently
    only meaningful when Microsoft Windows is the operating system
    (according to the source for git).

 TEST_PROGRAMS
 EXTRA_PROGRAMS
 unused SHELL_PATH_SQ portions
 unused test for V=2
 useless exports

    Only when `V' is undefined (that is, only when the value of `V'
    is empty) is `export V' performed, which just has the effect of
    placing the empty-valued variable `V' in the environment.

    The only other script to make use of `V' is `Documentation/Makefile',
    which only checks whether `V' is undefined (that is, whether the value
    of `V' is empty); hence, the `export V' has no effect whatsoever.

    Similarly, `export QUIET_GEN' is useless because it will only have
    a non-empty value when `V' has an empty-value, and when `V' has
    an empty-value, `QUIET_GEN' is always explicitly set in every
    script in which it is used.

    `DESTDIR' is only ever defined by the user via the environment
    or the command line, both of which are automatically exported
    to sub-make processes. Furthermore, no non-make sub-scripts
    make use of `DESTDIR' as an environment variable.

    No other scripts use `perfexec_instdir'.

 unused QUIET_SUBDIR{0,1}
 TAR and RPMBUILD
 PTHREAD_LIBS
 Maintainer's dist rules and commands
 distclean target
 Test suite coverage testing
 PRINT_DIR and NO_SUBDIR
 `configure' target
 NO_CURL
 @@PERF_VERSION@@ substitution

    Without the sed command, all of the rule's commands can be reduced
    to a single line that copies a file and sets the permissions properly
    in the process.

 `make test' echo line
 template_instdir
 PERF-BUILD-OPTIONS
 double-colon rules

    The use of double-colon rules seems misguided or vestigial git.

 Essentially hard-coded $(SCRIPTS) expansion

Signed-off-by: Michael Witten <mfwitten@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index cb43289..416684b 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -1,4 +1,3 @@
-PERF-BUILD-OPTIONS
 PERF-CFLAGS
 PERF-GUI-VARS
 PERF-VERSION-FILE
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index fde196f..9b84218 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -3,7 +3,7 @@ ifeq ("$(origin O)", "command line")
 endif
 
 # The default target of this Makefile is...
-all::
+all:
 
 ifneq ($(OUTPUT),)
 # check that the output directory actually exists
@@ -11,8 +11,7 @@ OUTDIR := $(shell cd $(OUTPUT) && /bin/pwd)
 $(if $(OUTDIR),, $(error output directory "$(OUTPUT)" does not exist))
 endif
 
-# Define V=1 to have a more verbose compile.
-# Define V=2 to have an even more verbose compile.
+# Define V to have a more verbose compile.
 #
 # Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
 #
@@ -28,12 +27,7 @@ $(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
 	@$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
 -include $(OUTPUT)PERF-VERSION-FILE
 
-uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
-uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
-uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
-uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
-uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
-uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
+uname_M := $(shell uname -m 2>/dev/null || echo not)
 
 ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
 				  -e s/arm.*/arm/ -e s/sa110/arm/ \
@@ -52,8 +46,6 @@ ifeq ($(ARCH),x86_64)
 	ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
 endif
 
-# CFLAGS and LDFLAGS are for the users to override from the command line.
-
 #
 # Include saner warnings here, which can catch bugs:
 #
@@ -131,22 +123,13 @@ CC = $(CROSS_COMPILE)gcc
 AR = $(CROSS_COMPILE)ar
 RM = rm -f
 MKDIR = mkdir
-TAR = tar
 FIND = find
 INSTALL = install
-RPMBUILD = rpmbuild
-PTHREAD_LIBS = -lpthread
 
 # sparse is architecture-neutral, which means that we need to tell it
 # explicitly what architecture to check for. Fix this up for yours..
 SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
 
-ifeq ($(V), 2)
-	QUIET_STDERR = ">/dev/null"
-else
-	QUIET_STDERR = ">/dev/null 2>&1"
-endif
-
 -include feature-tests.mak
 
 ifeq ($(call try-cc,$(SOURCE_HELLO),-Werror -fstack-protector-all),y)
@@ -171,12 +154,10 @@ BASIC_LDFLAGS =
 
 # Guard against environment variables
 BUILTIN_OBJS =
-BUILT_INS =
 LIB_H =
 LIB_OBJS =
 PYRF_OBJS =
 SCRIPT_SH =
-TEST_PROGRAMS =
 
 SCRIPT_SH += perf-archive.sh
 
@@ -192,12 +173,6 @@ $(OUTPUT)python/perf.so: $(PYRF_OBJS)
 
 SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH))
 
-# Empty...
-EXTRA_PROGRAMS =
-
-# ... and all the rest that could be moved out of bindir to perfexecdir
-PROGRAMS += $(EXTRA_PROGRAMS)
-
 #
 # Single 'perf' binary right now:
 #
@@ -205,10 +180,6 @@ PROGRAMS += $(OUTPUT)perf
 
 LANG_BINDINGS =
 
-# List built-in command $C whose implementation cmd_$C() is not in
-# builtin-$C.o but is linked in as part of some other command.
-#
-
 # what 'all' will build and 'install' will install, in perfexecdir
 ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
 
@@ -565,33 +536,13 @@ else
 	endif
 endif
 
-ifdef RUNTIME_PREFIX
-	COMPAT_CFLAGS += -DRUNTIME_PREFIX
-endif
-
-QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
-QUIET_SUBDIR1  =
-
-ifneq ($(findstring $(MAKEFLAGS),w),w)
-PRINT_DIR = --no-print-directory
-else # "make -w"
-NO_SUBDIR = :
-endif
-
 ifneq ($(findstring $(MAKEFLAGS),s),s)
 ifndef V
 	QUIET_CC       = @echo '   ' CC $@;
 	QUIET_AR       = @echo '   ' AR $@;
 	QUIET_LINK     = @echo '   ' LINK $@;
 	QUIET_MKDIR    = @echo '   ' MKDIR $@;
-	QUIET_BUILT_IN = @echo '   ' BUILTIN $@;
 	QUIET_GEN      = @echo '   ' GEN $@;
-	QUIET_SUBDIR0  = +@subdir=
-	QUIET_SUBDIR1  = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
-			 $(MAKE) $(PRINT_DIR) -C $$subdir
-	export V
-	export QUIET_GEN
-	export QUIET_BUILT_IN
 endif
 endif
 
@@ -621,16 +572,14 @@ ALL_CFLAGS += $(BASIC_CFLAGS)
 ALL_CFLAGS += $(ARCH_CFLAGS)
 ALL_LDFLAGS += $(BASIC_LDFLAGS)
 
-export TAR INSTALL DESTDIR SHELL_PATH
+export INSTALL SHELL_PATH
 
 
 ### Build rules
 
 SHELL = $(SHELL_PATH)
 
-all:: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS
-
-all::
+all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS)
 
 please_set_SHELL_PATH_to_a_more_modern_shell:
 	@$$(:)
@@ -661,37 +610,17 @@ $(OUTPUT)builtin-timechart.o: builtin-timechart.c $(OUTPUT)common-cmds.h $(OUTPU
 		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
 		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
 
-$(BUILT_INS): $(OUTPUT)perf$X
-	$(QUIET_BUILT_IN)$(RM) $@ && \
-	ln perf$X $@ 2>/dev/null || \
-	ln -s perf$X $@ 2>/dev/null || \
-	cp perf$X $@
-
 $(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt
 
 $(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt)
 	$(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@
 
-$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
-	$(QUIET_GEN)$(RM) $(OUTPUT)$@ $(OUTPUT)$@+ && \
-	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
-	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
-	    -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
-	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
-	    $@.sh > $(OUTPUT)$@+ && \
-	chmod +x $(OUTPUT)$@+ && \
-	mv $(OUTPUT)$@+ $(OUTPUT)$@
-
-configure: configure.ac
-	$(QUIET_GEN)$(RM) $@ $<+ && \
-	sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
-	    $< > $<+ && \
-	autoconf -o $@ $<+ && \
-	$(RM) $<+
+$(SCRIPTS) : % : %.sh
+	$(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@'
 
 # These can record PERF_VERSION
 $(OUTPUT)perf.o perf.spec \
-	$(patsubst %.sh,%,$(SCRIPT_SH)) \
+	$(SCRIPTS) \
 	: $(OUTPUT)PERF-VERSION-FILE
 
 $(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS
@@ -826,23 +755,8 @@ $(OUTPUT)PERF-CFLAGS: .FORCE-PERF-CFLAGS
 		echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \
             fi
 
-# We need to apply sq twice, once to protect from the shell
-# that runs $(OUTPUT)PERF-BUILD-OPTIONS, and then again to protect it
-# and the first level quoting from the shell that runs "echo".
-$(OUTPUT)PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
-	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
-	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
-	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
-
 ### Testing rules
 
-#
-# None right now:
-#
-# TEST_PROGRAMS += test-something$X
-
-all:: $(TEST_PROGRAMS)
-
 # GNU make supports exporting all variables by "export" without parameters.
 # However, the environment gets quite big, and some programs have problems
 # with that.
@@ -855,29 +769,17 @@ check: $(OUTPUT)common-cmds.h
 			sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
 		done; \
 	else \
-		echo 2>&1 "Did you mean 'make test'?"; \
 		exit 1; \
 	fi
 
-remove-dashes:
-	./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
-
 ### Installation rules
 
-ifneq ($(filter /%,$(firstword $(template_dir))),)
-template_instdir = $(template_dir)
-else
-template_instdir = $(prefix)/$(template_dir)
-endif
-export template_instdir
-
 ifneq ($(filter /%,$(firstword $(perfexecdir))),)
 perfexec_instdir = $(perfexecdir)
 else
 perfexec_instdir = $(prefix)/$(perfexecdir)
 endif
 perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
-export perfexec_instdir
 
 install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
@@ -894,14 +796,6 @@ install: all
 	$(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'
 	$(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'
 
-ifdef BUILT_INS
-	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-	$(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-ifneq (,$X)
-	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) $(OUTPUT)perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
-endif
-endif
-
 install-doc:
 	$(MAKE) -C Documentation install
 
@@ -926,104 +820,17 @@ quick-install-man:
 quick-install-html:
 	$(MAKE) -C Documentation quick-install-html
 
-
-### Maintainer's dist rules
-#
-# None right now
-#
-#
-# perf.spec: perf.spec.in
-#	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
-#	mv $@+ $@
-#
-# PERF_TARNAME=perf-$(PERF_VERSION)
-# dist: perf.spec perf-archive$(X) configure
-#	./perf-archive --format=tar \
-#		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
-#	@mkdir -p $(PERF_TARNAME)
-#	@cp perf.spec configure $(PERF_TARNAME)
-#	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
-#	$(TAR) rf $(PERF_TARNAME).tar \
-#		$(PERF_TARNAME)/perf.spec \
-#		$(PERF_TARNAME)/configure \
-#		$(PERF_TARNAME)/version
-#	@$(RM) -r $(PERF_TARNAME)
-#	gzip -f -9 $(PERF_TARNAME).tar
-#
-# htmldocs = perf-htmldocs-$(PERF_VERSION)
-# manpages = perf-manpages-$(PERF_VERSION)
-# dist-doc:
-#	$(RM) -r .doc-tmp-dir
-#	mkdir .doc-tmp-dir
-#	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
-#	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
-#	gzip -n -9 -f $(htmldocs).tar
-#	:
-#	$(RM) -r .doc-tmp-dir
-#	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
-#	$(MAKE) -C Documentation DESTDIR=./ \
-#		man1dir=../.doc-tmp-dir/man1 \
-#		man5dir=../.doc-tmp-dir/man5 \
-#		man7dir=../.doc-tmp-dir/man7 \
-#		install
-#	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
-#	gzip -n -9 -f $(manpages).tar
-#	$(RM) -r .doc-tmp-dir
-#
-# rpm: dist
-#	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
-
 ### Cleaning rules
 
-distclean: clean
-#	$(RM) configure
-
 clean:
 	$(RM) $(OUTPUT){*.o,*/*.o,*/*/*.o,*/*/*/*.o,$(LIB_FILE),perf-archive}
-	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf
-	$(RM) $(TEST_PROGRAMS)
+	$(RM) $(ALL_PROGRAMS) perf
 	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope*
-	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
-	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
-	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(MAKE) -C Documentation/ clean
-	$(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)PERF-BUILD-OPTIONS
+	$(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS
 	@python util/setup.py clean --build-lib='$(OUTPUT)python' \
 				   --build-temp='$(OUTPUT)python/temp'
 
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
 .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
-.PHONY: .FORCE-PERF-BUILD-OPTIONS
-
-### Make sure built-ins do not have dups and listed in perf.c
-#
-check-builtins::
-	./check-builtins.sh
-
-### Test suite coverage testing
-#
-# None right now
-#
-# .PHONY: coverage coverage-clean coverage-build coverage-report
-#
-# coverage:
-#	$(MAKE) coverage-build
-#	$(MAKE) coverage-report
-#
-# coverage-clean:
-#	rm -f *.gcda *.gcno
-#
-# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
-# COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
-#
-# coverage-build: coverage-clean
-#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
-#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
-#		-j1 test
-#
-# coverage-report:
-#	gcov -b *.c */*.c
-#	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
-#		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
-#		| tee coverage-untested-functions
diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c
index 67eeff5..7adf4ad 100644
--- a/tools/perf/util/exec_cmd.c
+++ b/tools/perf/util/exec_cmd.c
@@ -11,31 +11,12 @@ static const char *argv0_path;
 
 const char *system_path(const char *path)
 {
-#ifdef RUNTIME_PREFIX
-	static const char *prefix;
-#else
 	static const char *prefix = PREFIX;
-#endif
 	struct strbuf d = STRBUF_INIT;
 
 	if (is_absolute_path(path))
 		return path;
 
-#ifdef RUNTIME_PREFIX
-	assert(argv0_path);
-	assert(is_absolute_path(argv0_path));
-
-	if (!prefix &&
-	    !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
-	    !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
-	    !(prefix = strip_path_suffix(argv0_path, "perf"))) {
-		prefix = PREFIX;
-		fprintf(stderr, "RUNTIME_PREFIX requested, "
-				"but prefix computation failed.  "
-				"Using static fallback '%s'.\n", prefix);
-	}
-#endif
-
 	strbuf_addf(&d, "%s/%s", prefix, path);
 	path = strbuf_detach(&d, NULL);
 	return path;
-- 
cgit v0.10.2


From 1d4610527bc71d3f9eea520fc51a02d54f79dcd0 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 16 Feb 2011 13:43:22 -0500
Subject: xen-pcifront: Sanity check the MSI/MSI-X values

Check the returned vector values for any values that are
odd or plain incorrect (say vector value zero), and if so
print a warning. Also fixup the return values.

This patch was precipiated by the Xen PCIBack returning the
incorrect values due to how it was retrieving PIRQ values.
This has been fixed in the xen-pciback by
"xen/pciback: Utilize 'xen_pirq_from_irq' to get PIRQ value"
patch.

Reviewed-by: Ian Campbell <Ian.Campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 030ce37..5c7b6ad 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -277,18 +277,24 @@ static int pci_frontend_enable_msix(struct pci_dev *dev,
 	if (likely(!err)) {
 		if (likely(!op.value)) {
 			/* we get the result */
-			for (i = 0; i < nvec; i++)
+			for (i = 0; i < nvec; i++) {
+				if (op.msix_entries[i].vector <= 0) {
+					dev_warn(&dev->dev, "MSI-X entry %d is invalid: %d!\n",
+						i, op.msix_entries[i].vector);
+					err = -EINVAL;
+					*(*vector+i) = -1;
+					continue;
+				}
 				*(*vector+i) = op.msix_entries[i].vector;
-			return 0;
+			}
 		} else {
 			printk(KERN_DEBUG "enable msix get value %x\n",
 				op.value);
-			return op.value;
 		}
 	} else {
 		dev_err(&dev->dev, "enable msix get err %x\n", err);
-		return err;
 	}
+	return err;
 }
 
 static void pci_frontend_disable_msix(struct pci_dev *dev)
@@ -325,6 +331,12 @@ static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
 	err = do_pci_op(pdev, &op);
 	if (likely(!err)) {
 		*(*vector) = op.value;
+		if (op.value <= 0) {
+			dev_warn(&dev->dev, "MSI entry is invalid: %d!\n",
+				op.value);
+			err = -EINVAL;
+			*(*vector) = -1;	
+		}
 	} else {
 		dev_err(&dev->dev, "pci frontend enable msi failed for dev "
 				    "%x:%x\n", op.bus, op.devfn);
-- 
cgit v0.10.2


From 55cb8cd45e0600df1473489518d7f12ce1bbe973 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 16 Feb 2011 13:43:04 -0500
Subject: pci/xen: Use xen_allocate_pirq_msi instead of xen_allocate_pirq

xen_allocate_pirq -> xen_map_pirq_gsi -> PHYSDEVOP_alloc_irq_vector IFF
xen_initial_domain() in addition to the kernel side book-keeping side of
things (set chip and handler, update irq_info etc) whereas
xen_allocate_pirq_msi just does the kernel book keeping.

Also xen_allocate_pirq allocates an IRQ in the 1-1 GSI space whereas
xen_allocate_pirq_msi allocates a dynamic one in the >GSI IRQ space.

All of this is uneccessary as this code path is only executed
when we run as a domU PV guest with an MSI/MSI-X PCI card passed in.
Hence we can jump straight to allocating an dynamic IRQ (and
binding it to the proper PIRQ) and skip the rest.

In short: this change is a cosmetic one.

Reviewed-by: Ian Campbell <Ian.Campbell@citrix.com>
Reviewed-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a0..6432f75 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -157,14 +157,14 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		goto error;
 	i = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = xen_allocate_pirq(v[i], 0, /* not sharable */
+		xen_allocate_pirq_msi(
 			(type == PCI_CAP_ID_MSIX) ?
-			"pcifront-msi-x" : "pcifront-msi");
+			"pcifront-msi-x" : "pcifront-msi",
+			&irq, &v[i], XEN_ALLOC_IRQ);
 		if (irq < 0) {
 			ret = -1;
 			goto free;
 		}
-
 		ret = set_irq_msi(irq, msidesc);
 		if (ret)
 			goto error_while;
-- 
cgit v0.10.2


From 13884c6680973f0ce3483dc59b636b4962d6dafe Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 17 Dec 2010 16:33:53 +0100
Subject: x86/pci: Remove unused variable

|arch/x86/pci/ce4100.c: In function `ce4100_conf_read':
|arch/x86/pci/ce4100.c:257:9: warning: unused variable `retval'

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: dirk.brandewie@gmail.com
LKML-Reference: <1292600033-12271-16-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 85b68ef..c63c6d3 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -254,7 +254,7 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
 static int ce4100_conf_read(unsigned int seg, unsigned int bus,
 			    unsigned int devfn, int reg, int len, u32 *value)
 {
-	int i, retval = 1;
+	int i;
 
 	if (bus == 1) {
 		for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
-- 
cgit v0.10.2


From db1c1cce4a653dcbe6949c72ae7b9f42cab1b929 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Fri, 18 Feb 2011 10:07:25 +0100
Subject: ntp: Remove redundant and incorrect parameter check

The ADJ_SETOFFSET code redundantly checks the range of the nanoseconds
field of the time value. This field is checked again in the subsequent
call to timekeeping_inject_offset(). Also, as is, the check will not
detect whether the number of microseconds is out of range.

Let timekeeping_inject_offset() do the error checking.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Cc: johnstul@us.ibm.com
LKML-Reference: <20110218090724.GA2924@riccoc20.at.omicron.at>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5ac5932..5f1bb8e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -650,13 +650,13 @@ int do_adjtimex(struct timex *txc)
 
 	if (txc->modes & ADJ_SETOFFSET) {
 		struct timespec delta;
-		if ((unsigned long)txc->time.tv_usec >= NSEC_PER_SEC)
-			return -EINVAL;
 		delta.tv_sec  = txc->time.tv_sec;
 		delta.tv_nsec = txc->time.tv_usec;
 		if (!(txc->modes & ADJ_NANO))
 			delta.tv_nsec *= 1000;
-		timekeeping_inject_offset(&delta);
+		result = timekeeping_inject_offset(&delta);
+		if (result)
+			return result;
 	}
 
 	getnstimeofday(&ts);
-- 
cgit v0.10.2


From cc0f89c4a426fcd6400a89e9e34e4a8851abef76 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 17 Feb 2011 12:02:23 -0500
Subject: pci/xen: Cleanup: convert int** to int[]

Cleanup code. Cosmetic change to make the code look easier
to read.

Reviewed-by: Ian Campbell <Ian.Campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 2329b3e..aa86209 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -27,16 +27,16 @@ static inline void __init xen_setup_pirqs(void)
  * its own functions.
  */
 struct xen_pci_frontend_ops {
-	int (*enable_msi)(struct pci_dev *dev, int **vectors);
+	int (*enable_msi)(struct pci_dev *dev, int vectors[]);
 	void (*disable_msi)(struct pci_dev *dev);
-	int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+	int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
 	void (*disable_msix)(struct pci_dev *dev);
 };
 
 extern struct xen_pci_frontend_ops *xen_pci_frontend;
 
 static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
-					      int **vectors)
+					      int vectors[])
 {
 	if (xen_pci_frontend && xen_pci_frontend->enable_msi)
 		return xen_pci_frontend->enable_msi(dev, vectors);
@@ -48,7 +48,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
 			xen_pci_frontend->disable_msi(dev);
 }
 static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
-					       int **vectors, int nvec)
+					       int vectors[], int nvec)
 {
 	if (xen_pci_frontend && xen_pci_frontend->enable_msix)
 		return xen_pci_frontend->enable_msix(dev, vectors, nvec);
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 6432f75..30fdd09 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -150,9 +150,9 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		return -ENOMEM;
 
 	if (type == PCI_CAP_ID_MSIX)
-		ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
+		ret = xen_pci_frontend_enable_msix(dev, v, nvec);
 	else
-		ret = xen_pci_frontend_enable_msi(dev, &v);
+		ret = xen_pci_frontend_enable_msi(dev, v);
 	if (ret)
 		goto error;
 	i = 0;
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 5c7b6ad..492b7d8 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -243,7 +243,7 @@ struct pci_ops pcifront_bus_ops = {
 
 #ifdef CONFIG_PCI_MSI
 static int pci_frontend_enable_msix(struct pci_dev *dev,
-				    int **vector, int nvec)
+				    int vector[], int nvec)
 {
 	int err;
 	int i;
@@ -282,10 +282,10 @@ static int pci_frontend_enable_msix(struct pci_dev *dev,
 					dev_warn(&dev->dev, "MSI-X entry %d is invalid: %d!\n",
 						i, op.msix_entries[i].vector);
 					err = -EINVAL;
-					*(*vector+i) = -1;
+					vector[i] = -1;
 					continue;
 				}
-				*(*vector+i) = op.msix_entries[i].vector;
+				vector[i] = op.msix_entries[i].vector;
 			}
 		} else {
 			printk(KERN_DEBUG "enable msix get value %x\n",
@@ -316,7 +316,7 @@ static void pci_frontend_disable_msix(struct pci_dev *dev)
 		dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
 }
 
-static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
+static int pci_frontend_enable_msi(struct pci_dev *dev, int vector[])
 {
 	int err;
 	struct xen_pci_op op = {
@@ -330,12 +330,12 @@ static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
 
 	err = do_pci_op(pdev, &op);
 	if (likely(!err)) {
-		*(*vector) = op.value;
+		vector[0] = op.value;
 		if (op.value <= 0) {
 			dev_warn(&dev->dev, "MSI entry is invalid: %d!\n",
 				op.value);
 			err = -EINVAL;
-			*(*vector) = -1;	
+			vector[0] = -1;
 		}
 	} else {
 		dev_err(&dev->dev, "pci frontend enable msi failed for dev "
-- 
cgit v0.10.2


From 3d74a539ae07a8f3c061332e426fc07b2310cf05 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 17 Feb 2011 16:12:51 -0500
Subject: pci/xen: When free-ing MSI-X/MSI irq->desc also use generic code.

This code path is only run when an MSI/MSI-X PCI device is passed
in to PV DomU.

In 2.6.37 time-frame we over-wrote the default cleanup handler for
MSI/MSI-X irq->desc to be "xen_teardown_msi_irqs". That function
calls the the xen-pcifront driver which can tell the backend to
cleanup/take back the MSI/MSI-X device.

However, we forgot to continue the process of free-ing the MSI/MSI-X
device resources (irq->desc) in the PV domU side. Which is what
the default cleanup handler: default_teardown_msi_irqs did.

Hence we would leak IRQ descriptors.

Without this patch, doing "rmmod igbvf;modprobe igbvf" multiple
times ends with abandoned IRQ descriptors:

 28:          5  xen-pirq-pcifront-msi-x
 29:          8  xen-pirq-pcifront-msi-x
...
130:         10  xen-pirq-pcifront-msi-x

with the end result of running out of IRQ descriptors.

Reviewed-by: Ian Campbell <Ian.Campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 30fdd09..57afd1d 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -193,6 +193,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
 		xen_pci_frontend_disable_msix(dev);
 	else
 		xen_pci_frontend_disable_msi(dev);
+
+	/* Free the IRQ's and the msidesc using the generic code. */
+	default_teardown_msi_irqs(dev);
 }
 
 static void xen_teardown_msi_irq(unsigned int irq)
-- 
cgit v0.10.2


From 5df91509d324d44cfb11e55d9cb02fe18b53b045 Mon Sep 17 00:00:00 2001
From: "jacob.jun.pan@linux.intel.com" <jacob.jun.pan@linux.intel.com>
Date: Fri, 18 Feb 2011 13:42:54 -0800
Subject: x86: mrst: Remove apb timer read workaround

APB timer current count was unreliable in the earlier silicon, which
could result in time going backwards. This problem has been fixed in
the current silicon stepping. This patch removes the workaround which
was used to check and prevent timer rolling back when APB timer is
used as clocksource device.

The workaround code was also flawed by potential race condition
around the cached read value last_read. Though a fix can be done
by assigning last_read to a local variable at the beginning of
apbt_read_clocksource(), but this is not necessary anymore.

[ tglx: A sane timer on an Intel chip - I can't believe it ]

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Alan Cox <alan@linux.intel.com>
LKML-Reference: <1298065374-25532-1-git-send-email-jacob.jun.pan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 7c9ab59..afc4064 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -506,64 +506,12 @@ static int apbt_next_event(unsigned long delta,
 	return 0;
 }
 
-/*
- * APB timer clock is not in sync with pclk on Langwell, which translates to
- * unreliable read value caused by sampling error. the error does not add up
- * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
- * would go backwards. the following code is trying to prevent time traveling
- * backwards. little bit paranoid.
- */
 static cycle_t apbt_read_clocksource(struct clocksource *cs)
 {
-	unsigned long t0, t1, t2;
-	static unsigned long last_read;
-
-bad_count:
-	t1 = apbt_readl(phy_cs_timer_id,
-			APBTMR_N_CURRENT_VALUE);
-	t2 = apbt_readl(phy_cs_timer_id,
-			APBTMR_N_CURRENT_VALUE);
-	if (unlikely(t1 < t2)) {
-		pr_debug("APBT: read current count error %lx:%lx:%lx\n",
-			 t1, t2, t2 - t1);
-		goto bad_count;
-	}
-	/*
-	 * check against cached last read, makes sure time does not go back.
-	 * it could be a normal rollover but we will do tripple check anyway
-	 */
-	if (unlikely(t2 > last_read)) {
-		/* check if we have a normal rollover */
-		unsigned long raw_intr_status =
-			apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
-		/*
-		 * cs timer interrupt is masked but raw intr bit is set if
-		 * rollover occurs. then we read EOI reg to clear it.
-		 */
-		if (raw_intr_status & (1 << phy_cs_timer_id)) {
-			apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
-			goto out;
-		}
-		pr_debug("APB CS going back %lx:%lx:%lx ",
-			 t2, last_read, t2 - last_read);
-bad_count_x3:
-		pr_debug("triple check enforced\n");
-		t0 = apbt_readl(phy_cs_timer_id,
-				APBTMR_N_CURRENT_VALUE);
-		udelay(1);
-		t1 = apbt_readl(phy_cs_timer_id,
-				APBTMR_N_CURRENT_VALUE);
-		udelay(1);
-		t2 = apbt_readl(phy_cs_timer_id,
-				APBTMR_N_CURRENT_VALUE);
-		if ((t2 > t1) || (t1 > t0)) {
-			printk(KERN_ERR "Error: APB CS tripple check failed\n");
-			goto bad_count_x3;
-		}
-	}
-out:
-	last_read = t2;
-	return (cycle_t)~t2;
+	unsigned long current_count;
+
+	current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
+	return (cycle_t)~current_count;
 }
 
 static int apbt_clocksource_register(void)
-- 
cgit v0.10.2


From e7bcecb7b1d29b9ad5af939149a945658620ca8f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 16 Feb 2011 17:12:57 +0100
Subject: genirq: Make nr_irqs runtime expandable

We face more and more the requirement to expand nr_irqs at
runtime. The reason are irq expanders which can not be detected in the
early boot stage. So we speculate nr_irqs to have enough room. Further
Xen needs extra irq numbers and we really want to avoid adding more
"detection" code into the early boot. There is no real good reason why
we need to limit nr_irqs at early boot.

Allow the allocation code to expand nr_irqs. We have already 8k extra
number space in the allocation bitmap, so lets use it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a250d3a..6f6644f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -206,6 +206,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
 	return NULL;
 }
 
+static int irq_expand_nr_irqs(unsigned int cnt)
+{
+	if (nr_irqs + cnt > IRQ_BITMAP_BITS)
+		return -ENOMEM;
+	nr_irqs += cnt;
+	return 0;
+}
+
 int __init early_irq_init(void)
 {
 	int i, initcnt, node = first_online_node;
@@ -287,6 +295,12 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
 {
 	return start;
 }
+
+static int irq_expand_nr_irqs(unsigned int cnt)
+{
+	return -ENOMEM;
+}
+
 #endif /* !CONFIG_SPARSE_IRQ */
 
 /* Dynamic interrupt handling */
@@ -335,9 +349,11 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
 	if (irq >=0 && start != irq)
 		goto err;
 
-	ret = -ENOMEM;
-	if (start >= nr_irqs)
-		goto err;
+	if (start >= nr_irqs) {
+		ret = irq_expand_nr_irqs(cnt);
+		if (ret)
+			goto err;
+	}
 
 	bitmap_set(allocated_irqs, start, cnt);
 	mutex_unlock(&sparse_irq_lock);
-- 
cgit v0.10.2


From 43abe43ce0619d744c7a5bb15cce075e532b53b7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 12 Feb 2011 12:10:49 +0100
Subject: genirq: Add missing buslock to set_irq_type(), set_irq_wake()

chips behind a slow bus cannot update the chip under desc->lock, but
we miss the chip_buslock/chip_bus_sync_unlock() calls around the set
type and set wake functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4a..9639ab8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -65,9 +65,11 @@ int set_irq_type(unsigned int irq, unsigned int type)
 	if (type == IRQ_TYPE_NONE)
 		return 0;
 
+	chip_bus_lock(desc);
 	raw_spin_lock_irqsave(&desc->lock, flags);
 	ret = __irq_set_trigger(desc, irq, type);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	chip_bus_sync_unlock(desc);
 	return ret;
 }
 EXPORT_SYMBOL(set_irq_type);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ba84307..a400db2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -454,6 +454,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
 	/* wakeup-capable irqs can be shared between drivers that
 	 * don't need to have the same sleep mode behaviors.
 	 */
+	chip_bus_lock(desc);
 	raw_spin_lock_irqsave(&desc->lock, flags);
 	if (on) {
 		if (desc->wake_depth++ == 0) {
@@ -476,6 +477,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
 	}
 
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	chip_bus_sync_unlock(desc);
 	return ret;
 }
 EXPORT_SYMBOL(set_irq_wake);
-- 
cgit v0.10.2


From a0cd9ca2b907d7ee26575e7b63ac92dad768a75e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 11:36:33 +0100
Subject: genirq: Namespace cleanup

The irq namespace has become quite convoluted. My bad.  Clean it up
and deprecate the old functions. All new functions follow the scheme:

irq number based:
    irq_set/get/xxx/_xxx(unsigned int irq, ...)

irq_data based:
	 irq_data_set/get/xxx/_xxx(struct irq_data *d, ....)

irq_desc based:
	 irq_desc_get_xxx(struct irq_desc *desc)

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 56b7c97..7834726 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -346,16 +346,24 @@ static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long
 }
 
 /* IRQ wakeup (PM) control: */
-extern int set_irq_wake(unsigned int irq, unsigned int on);
+extern int irq_set_irq_wake(unsigned int irq, unsigned int on);
+
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
+/* Please do not use: Use the replacement functions instead */
+static inline int set_irq_wake(unsigned int irq, unsigned int on)
+{
+	return irq_set_irq_wake(irq, on);
+}
+#endif
 
 static inline int enable_irq_wake(unsigned int irq)
 {
-	return set_irq_wake(irq, 1);
+	return irq_set_irq_wake(irq, 1);
 }
 
 static inline int disable_irq_wake(unsigned int irq)
 {
-	return set_irq_wake(irq, 0);
+	return irq_set_irq_wake(irq, 0);
 }
 
 #else /* !CONFIG_GENERIC_HARDIRQS */
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 80fcb53..e9f847d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -292,8 +292,7 @@ set_irq_handler(unsigned int irq, irq_flow_handler_t handle)
  *  IRQ_NOREQUEST and IRQ_NOPROBE)
  */
 static inline void
-set_irq_chained_handler(unsigned int irq,
-			irq_flow_handler_t handle)
+set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle)
 {
 	__set_irq_handler(irq, handle, 1, NULL);
 }
@@ -312,12 +311,12 @@ static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr)
 	irq_modify_status(irq, clr, 0);
 }
 
-static inline void set_irq_noprobe(unsigned int irq)
+static inline void irq_set_noprobe(unsigned int irq)
 {
 	irq_modify_status(irq, 0, IRQ_NOPROBE);
 }
 
-static inline void set_irq_probe(unsigned int irq)
+static inline void irq_set_probe(unsigned int irq)
 {
 	irq_modify_status(irq, IRQ_NOPROBE, 0);
 }
@@ -338,14 +337,14 @@ static inline void dynamic_irq_init(unsigned int irq)
 }
 
 /* Set/get chip/data for an IRQ: */
-extern int set_irq_chip(unsigned int irq, struct irq_chip *chip);
-extern int set_irq_data(unsigned int irq, void *data);
-extern int set_irq_chip_data(unsigned int irq, void *data);
-extern int set_irq_type(unsigned int irq, unsigned int type);
-extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
+extern int irq_set_chip(unsigned int irq, struct irq_chip *chip);
+extern int irq_set_handler_data(unsigned int irq, void *data);
+extern int irq_set_chip_data(unsigned int irq, void *data);
+extern int irq_set_irq_type(unsigned int irq, unsigned int type);
+extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry);
 extern struct irq_data *irq_get_irq_data(unsigned int irq);
 
-static inline struct irq_chip *get_irq_chip(unsigned int irq)
+static inline struct irq_chip *irq_get_chip(unsigned int irq)
 {
 	struct irq_data *d = irq_get_irq_data(irq);
 	return d ? d->chip : NULL;
@@ -356,7 +355,7 @@ static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d)
 	return d->chip;
 }
 
-static inline void *get_irq_chip_data(unsigned int irq)
+static inline void *irq_get_chip_data(unsigned int irq)
 {
 	struct irq_data *d = irq_get_irq_data(irq);
 	return d ? d->chip_data : NULL;
@@ -367,18 +366,18 @@ static inline void *irq_data_get_irq_chip_data(struct irq_data *d)
 	return d->chip_data;
 }
 
-static inline void *get_irq_data(unsigned int irq)
+static inline void *irq_get_handler_data(unsigned int irq)
 {
 	struct irq_data *d = irq_get_irq_data(irq);
 	return d ? d->handler_data : NULL;
 }
 
-static inline void *irq_data_get_irq_data(struct irq_data *d)
+static inline void *irq_data_get_irq_handler_data(struct irq_data *d)
 {
 	return d->handler_data;
 }
 
-static inline struct msi_desc *get_irq_msi(unsigned int irq)
+static inline struct msi_desc *irq_get_msi_desc(unsigned int irq)
 {
 	struct irq_data *d = irq_get_irq_data(irq);
 	return d ? d->msi_desc : NULL;
@@ -389,6 +388,58 @@ static inline struct msi_desc *irq_data_get_msi(struct irq_data *d)
 	return d->msi_desc;
 }
 
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
+/* Please do not use: Use the replacement functions instead */
+static inline int set_irq_chip(unsigned int irq, struct irq_chip *chip)
+{
+	return irq_set_chip(irq, chip);
+}
+static inline int set_irq_data(unsigned int irq, void *data)
+{
+	return irq_set_handler_data(irq, data);
+}
+static inline int set_irq_chip_data(unsigned int irq, void *data)
+{
+	return irq_set_chip_data(irq, data);
+}
+static inline int set_irq_type(unsigned int irq, unsigned int type)
+{
+	return irq_set_irq_type(irq, type);
+}
+static inline int set_irq_msi(unsigned int irq, struct msi_desc *entry)
+{
+	return irq_set_msi_desc(irq, entry);
+}
+static inline struct irq_chip *get_irq_chip(unsigned int irq)
+{
+	return irq_get_chip(irq);
+}
+static inline void *get_irq_chip_data(unsigned int irq)
+{
+	return irq_get_chip_data(irq);
+}
+static inline void *get_irq_data(unsigned int irq)
+{
+	return irq_get_handler_data(irq);
+}
+static inline void *irq_data_get_irq_data(struct irq_data *d)
+{
+	return irq_data_get_irq_handler_data(d);
+}
+static inline struct msi_desc *get_irq_msi(unsigned int irq)
+{
+	return irq_get_msi_desc(irq);
+}
+static inline void set_irq_noprobe(unsigned int irq)
+{
+	irq_set_noprobe(irq);
+}
+static inline void set_irq_probe(unsigned int irq)
+{
+	irq_set_probe(irq);
+}
+#endif
+
 int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node);
 void irq_free_descs(unsigned int irq, unsigned int cnt);
 int irq_reserve_irqs(unsigned int from, unsigned int cnt);
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index bfef56d..64794de 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -98,10 +98,46 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 
-#define get_irq_desc_chip(desc)		((desc)->irq_data.chip)
-#define get_irq_desc_chip_data(desc)	((desc)->irq_data.chip_data)
-#define get_irq_desc_data(desc)		((desc)->irq_data.handler_data)
-#define get_irq_desc_msi(desc)		((desc)->irq_data.msi_desc)
+static inline struct irq_chip *irq_desc_get_chip(struct irq_desc *desc)
+{
+	return desc->irq_data.chip;
+}
+
+static inline void *irq_desc_get_chip_data(struct irq_desc *desc)
+{
+	return desc->irq_data.chip_data;
+}
+
+static inline void *irq_desc_get_handler_data(struct irq_desc *desc)
+{
+	return desc->irq_data.handler_data;
+}
+
+static inline struct msi_desc *irq_desc_get_msi_desc(struct irq_desc *desc)
+{
+	return desc->irq_data.msi_desc;
+}
+
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
+static inline struct irq_chip *get_irq_desc_chip(struct irq_desc *desc)
+{
+	return irq_desc_get_chip(desc);
+}
+static inline void *get_irq_desc_data(struct irq_desc *desc)
+{
+	return irq_desc_get_handler_data(desc);
+}
+
+static inline void *get_irq_desc_chip_data(struct irq_desc *desc)
+{
+	return irq_desc_get_chip_data(desc);
+}
+
+static inline struct msi_desc *get_irq_desc_msi(struct irq_desc *desc)
+{
+	return irq_desc_get_msi_desc(desc);
+}
+#endif
 
 /*
  * Architectures call this to let the generic IRQ layer
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 9639ab8..622b55a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,11 +19,11 @@
 #include "internals.h"
 
 /**
- *	set_irq_chip - set the irq chip for an irq
+ *	irq_set_chip - set the irq chip for an irq
  *	@irq:	irq number
  *	@chip:	pointer to irq chip description structure
  */
-int set_irq_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, struct irq_chip *chip)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
@@ -43,14 +43,14 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
 
 	return 0;
 }
-EXPORT_SYMBOL(set_irq_chip);
+EXPORT_SYMBOL(irq_set_chip);
 
 /**
- *	set_irq_type - set the irq trigger type for an irq
+ *	irq_set_type - set the irq trigger type for an irq
  *	@irq:	irq number
  *	@type:	IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
  */
-int set_irq_type(unsigned int irq, unsigned int type)
+int irq_set_irq_type(unsigned int irq, unsigned int type)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
@@ -72,16 +72,16 @@ int set_irq_type(unsigned int irq, unsigned int type)
 	chip_bus_sync_unlock(desc);
 	return ret;
 }
-EXPORT_SYMBOL(set_irq_type);
+EXPORT_SYMBOL(irq_set_irq_type);
 
 /**
- *	set_irq_data - set irq type data for an irq
+ *	irq_set_handler_data - set irq handler data for an irq
  *	@irq:	Interrupt number
  *	@data:	Pointer to interrupt specific data
  *
  *	Set the hardware irq controller data for an irq
  */
-int set_irq_data(unsigned int irq, void *data)
+int irq_set_handler_data(unsigned int irq, void *data)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
@@ -97,16 +97,16 @@ int set_irq_data(unsigned int irq, void *data)
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
-EXPORT_SYMBOL(set_irq_data);
+EXPORT_SYMBOL(irq_set_handler_data);
 
 /**
- *	set_irq_msi - set MSI descriptor data for an irq
+ *	irq_set_msi_desc - set MSI descriptor data for an irq
  *	@irq:	Interrupt number
  *	@entry:	Pointer to MSI descriptor data
  *
  *	Set the MSI descriptor entry for an irq
  */
-int set_irq_msi(unsigned int irq, struct msi_desc *entry)
+int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
@@ -126,13 +126,13 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
 }
 
 /**
- *	set_irq_chip_data - set irq chip data for an irq
+ *	irq_set_chip_data - set irq chip data for an irq
  *	@irq:	Interrupt number
  *	@data:	Pointer to chip specific data
  *
  *	Set the hardware irq chip data for an irq
  */
-int set_irq_chip_data(unsigned int irq, void *data)
+int irq_set_chip_data(unsigned int irq, void *data)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
@@ -154,7 +154,7 @@ int set_irq_chip_data(unsigned int irq, void *data)
 
 	return 0;
 }
-EXPORT_SYMBOL(set_irq_chip_data);
+EXPORT_SYMBOL(irq_set_chip_data);
 
 struct irq_data *irq_get_irq_data(unsigned int irq)
 {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a400db2..b1b4da9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -434,7 +434,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
 }
 
 /**
- *	set_irq_wake - control irq power management wakeup
+ *	irq_set_irq_wake - control irq power management wakeup
  *	@irq:	interrupt to control
  *	@on:	enable/disable power management wakeup
  *
@@ -445,7 +445,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
  *	Wakeup mode lets this IRQ wake the system from sleep
  *	states like "suspend to RAM".
  */
-int set_irq_wake(unsigned int irq, unsigned int on)
+int irq_set_irq_wake(unsigned int irq, unsigned int on)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
@@ -480,7 +480,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
 	chip_bus_sync_unlock(desc);
 	return ret;
 }
-EXPORT_SYMBOL(set_irq_wake);
+EXPORT_SYMBOL(irq_set_irq_wake);
 
 /*
  * Internal function that tells the architecture code whether a
-- 
cgit v0.10.2


From 1fa46f1f070961783661ae640cd2f6b2557f3885 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 16:46:58 +0100
Subject: genirq: Simplify affinity related code

There is lot of #ifdef CONFIG_GENERIC_PENDING_IRQ along with
duplicated code in the irq core. Move the #ifdeffery into one place
and cleanup the code so it's readable. No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b1b4da9..99f3e9a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -100,47 +100,70 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 	}
 }
 
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline bool irq_can_move_pcntxt(struct irq_desc *desc)
+{
+	return desc->status & IRQ_MOVE_PCNTXT;
+}
+static inline bool irq_move_pending(struct irq_desc *desc)
+{
+	return desc->status & IRQ_MOVE_PENDING;
+}
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
+{
+	cpumask_copy(desc->pending_mask, mask);
+}
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
+{
+	cpumask_copy(mask, desc->pending_mask);
+}
+#else
+static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; }
+static inline bool irq_move_pending(struct irq_desc *desc) { return false; }
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
+#endif
+
 /**
  *	irq_set_affinity - Set the irq affinity of a given irq
  *	@irq:		Interrupt to set affinity
  *	@cpumask:	cpumask
  *
  */
-int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_chip *chip = desc->irq_data.chip;
 	unsigned long flags;
+	int ret = 0;
 
 	if (!chip->irq_set_affinity)
 		return -EINVAL;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
 
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PCNTXT) {
-		if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
-			cpumask_copy(desc->irq_data.affinity, cpumask);
+	if (irq_can_move_pcntxt(desc)) {
+		ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
+		if (!ret) {
+			cpumask_copy(desc->irq_data.affinity, mask);
 			irq_set_thread_affinity(desc);
 		}
-	}
-	else {
+	} else {
 		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(desc->pending_mask, cpumask);
+		irq_copy_pending(desc, mask);
 	}
-#else
-	if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
-		cpumask_copy(desc->irq_data.affinity, cpumask);
-		irq_set_thread_affinity(desc);
-	}
-#endif
+
 	if (desc->affinity_notify) {
 		kref_get(&desc->affinity_notify->kref);
 		schedule_work(&desc->affinity_notify->work);
 	}
 	desc->status |= IRQ_AFFINITY_SET;
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	return 0;
+	return ret;
 }
 
 int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
@@ -167,18 +190,13 @@ static void irq_affinity_notify(struct work_struct *work)
 	cpumask_var_t cpumask;
 	unsigned long flags;
 
-	if (!desc)
-		goto out;
-
-	if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+	if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
 		goto out;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PENDING)
-		cpumask_copy(cpumask, desc->pending_mask);
+	if (irq_move_pending(desc))
+		irq_get_pending(cpumask, desc);
 	else
-#endif
 		cpumask_copy(cpumask, desc->irq_data.affinity);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
-- 
cgit v0.10.2


From b008207cbd0d5ce606a1a2ac52826e0ab37d0b99 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 17:30:50 +0100
Subject: genirq: Rremove redundant check

IRQ_NO_BALANCING is already checked in irq_can_set_affinity() above,
no need to check it again.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 99f3e9a..591c927 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
  */
 static int setup_affinity(unsigned int irq, struct irq_desc *desc)
 {
+	/* Excludes PER_CPU and NO_BALANCE interrupts */
 	if (!irq_can_set_affinity(irq))
 		return 0;
 
@@ -263,7 +264,7 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
 	 * Preserve an userspace affinity setup, but make sure that
 	 * one of the targets is online.
 	 */
-	if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
+	if (desc->status & (IRQ_AFFINITY_SET)) {
 		if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
 		    < nr_cpu_ids)
 			goto set_affinity;
-- 
cgit v0.10.2


From 569bda8df11effa03e618729293c7961696abb10 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 17:05:08 +0100
Subject: genirq: Always apply cpu online mask

If the affinity had been set by the user, then a later request_irq()
will honour that setting. But online cpus can have changed. So apply
the online mask and for this case as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 591c927..ade65bf 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
  */
 static int setup_affinity(unsigned int irq, struct irq_desc *desc)
 {
+	struct cpumask *set = irq_default_affinity;
+
 	/* Excludes PER_CPU and NO_BALANCE interrupts */
 	if (!irq_can_set_affinity(irq))
 		return 0;
@@ -265,15 +267,13 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
 	 * one of the targets is online.
 	 */
 	if (desc->status & (IRQ_AFFINITY_SET)) {
-		if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
-		    < nr_cpu_ids)
-			goto set_affinity;
+		if (cpumask_intersects(desc->irq_data.affinity,
+				       cpu_online_mask))
+			set = desc->irq_data.affinity;
 		else
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
-
-	cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
-set_affinity:
+	cpumask_and(desc->irq_data.affinity, cpu_online_mask, set);
 	desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
 
 	return 0;
-- 
cgit v0.10.2


From 3b8249e759c701c4a82f99d957be651a7657bf6f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 16:02:20 +0100
Subject: genirq: Do not copy affinity before set

While rumaging through arch code I found that there are a few
workarounds which deal with the fact that the initial affinity setting
from request_irq() copies the mask into irq_data->affinity before the
chip code is called. In the normal path we unconditionally copy the
mask when the chip code returns 0.

Copy after the code is called and add a return code
IRQ_SET_MASK_OK_NOCOPY for the chip functions, which prevents the
copy. That way we see the real mask when the chip function decided to
truncate it further as some arches do. IRQ_SET_MASK_OK is 0, which is
the current behaviour.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index e9f847d..f5e9003 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -85,6 +85,17 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 # define IRQ_NO_BALANCING_MASK	IRQ_NO_BALANCING
 #endif
 
+/*
+ * Return value for chip->irq_set_affinity()
+ *
+ * IRQ_SET_MASK_OK	- OK, core updates irq_data.affinity
+ * IRQ_SET_MASK_NOCPY	- OK, chip did update irq_data.affinity
+ */
+enum {
+	IRQ_SET_MASK_OK = 0,
+	IRQ_SET_MASK_OK_NOCOPY,
+};
+
 struct msi_desc;
 
 /**
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 99c3bc8..b5bfa24 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -43,7 +43,7 @@ static inline void unregister_handler_proc(unsigned int irq,
 					   struct irqaction *action) { }
 #endif
 
-extern int irq_select_affinity_usr(unsigned int irq);
+extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ade65bf..dc95d53 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -148,9 +148,12 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	if (irq_can_move_pcntxt(desc)) {
 		ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
-		if (!ret) {
+		switch (ret) {
+		case IRQ_SET_MASK_OK:
 			cpumask_copy(desc->irq_data.affinity, mask);
+		case IRQ_SET_MASK_OK_NOCOPY:
 			irq_set_thread_affinity(desc);
+			ret = 0;
 		}
 	} else {
 		desc->status |= IRQ_MOVE_PENDING;
@@ -254,9 +257,12 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
 /*
  * Generic version of the affinity autoselector.
  */
-static int setup_affinity(unsigned int irq, struct irq_desc *desc)
+static int
+setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 {
+	struct irq_chip *chip = get_irq_desc_chip(desc);
 	struct cpumask *set = irq_default_affinity;
+	int ret;
 
 	/* Excludes PER_CPU and NO_BALANCE interrupts */
 	if (!irq_can_set_affinity(irq))
@@ -273,13 +279,20 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
 		else
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
-	cpumask_and(desc->irq_data.affinity, cpu_online_mask, set);
-	desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
 
+	cpumask_and(mask, cpu_online_mask, set);
+	ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
+	switch (ret) {
+	case IRQ_SET_MASK_OK:
+		cpumask_copy(desc->irq_data.affinity, mask);
+	case IRQ_SET_MASK_OK_NOCOPY:
+		irq_set_thread_affinity(desc);
+	}
 	return 0;
 }
 #else
-static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
+static inline int
+setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
 {
 	return irq_select_affinity(irq);
 }
@@ -288,23 +301,23 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
 /*
  * Called when affinity is set via /proc/irq
  */
-int irq_select_affinity_usr(unsigned int irq)
+int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
 	int ret;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
-	ret = setup_affinity(irq, desc);
+	ret = setup_affinity(irq, desc, mask);
 	if (!ret)
 		irq_set_thread_affinity(desc);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
-
 	return ret;
 }
 
 #else
-static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
+static inline int
+setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 {
 	return 0;
 }
@@ -765,8 +778,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	struct irqaction *old, **old_ptr;
 	const char *old_name = NULL;
 	unsigned long flags;
-	int nested, shared = 0;
-	int ret;
+	int ret, nested, shared = 0;
+	cpumask_var_t mask;
 
 	if (!desc)
 		return -EINVAL;
@@ -831,6 +844,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		new->thread = t;
 	}
 
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto out_thread;
+	}
+
 	/*
 	 * The following block of code has to be executed atomically
 	 */
@@ -876,7 +894,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 					new->flags & IRQF_TRIGGER_MASK);
 
 			if (ret)
-				goto out_thread;
+				goto out_mask;
 		} else
 			compat_irq_chip_set_default_handler(desc);
 #if defined(CONFIG_IRQ_PER_CPU)
@@ -903,7 +921,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			desc->status |= IRQ_NO_BALANCING;
 
 		/* Set default affinity mask once everything is setup */
-		setup_affinity(irq, desc);
+		setup_affinity(irq, desc, mask);
 
 	} else if ((new->flags & IRQF_TRIGGER_MASK)
 			&& (new->flags & IRQF_TRIGGER_MASK)
@@ -956,6 +974,9 @@ mismatch:
 #endif
 	ret = -EBUSY;
 
+out_mask:
+	free_cpumask_var(mask);
+
 out_thread:
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	if (new->thread) {
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c8a2a9..a46bd76 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
 	if (!cpumask_intersects(new_value, cpu_online_mask)) {
 		/* Special case for empty set - allow the architecture
 		   code to set default SMP affinity. */
-		err = irq_select_affinity_usr(irq) ? -EINVAL : count;
+		err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
 	} else {
 		irq_set_affinity(irq, new_value);
 		err = count;
-- 
cgit v0.10.2


From 2b879eaf095878430c38cbd95e5c0fc4ce65ad8e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 14 Feb 2011 11:25:02 +0100
Subject: genirq: Remove redundant thread affinity setting

Thread affinity is already set by setup_affinity().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index dc95d53..33a6ee0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -309,8 +309,6 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
 	ret = setup_affinity(irq, desc, mask);
-	if (!ret)
-		irq_set_thread_affinity(desc);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return ret;
 }
-- 
cgit v0.10.2


From 1082687e8d6292a61759eb83358e7db39fed1bf4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 09:05:05 +0100
Subject: genirq: Plug race in report_bad_irq()

We cannot walk the action chain unlocked. Even if IRQ_INPROGRESS is
set an action can be removed and we follow a null pointer. It's safe
to take the lock there, because the code which removes the action will
call synchronize_irq() which waits unlocked for IRQ_INPROGRESS going
away.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b..2fbfda2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -139,15 +139,13 @@ static void poll_spurious_irqs(unsigned long dummy)
  *
  * (The other 100-of-100,000 interrupts may have been a correctly
  *  functioning device sharing an IRQ with the failing one)
- *
- * Called under desc->lock
  */
-
 static void
 __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 		 irqreturn_t action_ret)
 {
 	struct irqaction *action;
+	unsigned long flags;
 
 	if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
 		printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +157,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 	dump_stack();
 	printk(KERN_ERR "handlers:\n");
 
+	/*
+	 * We need to take desc->lock here. note_interrupt() is called
+	 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
+	 * with something else removing an action. It's ok to take
+	 * desc->lock here. See synchronize_irq().
+	 */
+	raw_spin_lock_irqsave(&desc->lock, flags);
 	action = desc->action;
 	while (action) {
 		printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +172,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 		printk("\n");
 		action = action->next;
 	}
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 
 static void
-- 
cgit v0.10.2


From b738a50a202639614c98b5763b01bf9201779e50 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 2 Feb 2011 23:58:19 +0100
Subject: genirq: Warn when handler enables interrupts

We run all handlers with interrupts disabled and expect them not to
enable them. Warn when we catch one who does.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a71..cdd6fbb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -68,6 +68,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 		ret = action->handler(irq, action->dev_id);
 		trace_irq_handler_exit(irq, action, ret);
 
+		if (WARN_ON_ONCE(!irqs_disabled()))
+			local_irq_disable();
+
 		switch (ret) {
 		case IRQ_WAKE_THREAD:
 			/*
@@ -114,7 +117,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 
 	if (status & IRQF_SAMPLE_RANDOM)
 		add_interrupt_randomness(irq);
-	local_irq_disable();
 
 	return retval;
 }
-- 
cgit v0.10.2


From fa27271bc8d230355c1f24ddea103824fdc12de6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 09:10:39 +0100
Subject: genirq: Fixup poll handling

try_one_irq() contains redundant code and lots of useless checks for
shared interrupts. Check for shared before setting IRQ_INPROGRESS and
then call handle_IRQ_event() while pending. Shorter version with the
same functionality.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 2fbfda2..0af9e59 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -42,48 +42,36 @@ static int try_one_irq(int irq, struct irq_desc *desc)
 		raw_spin_unlock(&desc->lock);
 		return ok;
 	}
-	/* Honour the normal IRQ locking */
-	desc->status |= IRQ_INPROGRESS;
-	action = desc->action;
-	raw_spin_unlock(&desc->lock);
-
-	while (action) {
-		/* Only shared IRQ handlers are safe to call */
-		if (action->flags & IRQF_SHARED) {
-			if (action->handler(irq, action->dev_id) ==
-				IRQ_HANDLED)
-				ok = 1;
-		}
-		action = action->next;
-	}
-	local_irq_disable();
-	/* Now clean up the flags */
-	raw_spin_lock(&desc->lock);
-	action = desc->action;
-
 	/*
-	 * While we were looking for a fixup someone queued a real
-	 * IRQ clashing with our walk:
+	 * All handlers must agree on IRQF_SHARED, so we test just the
+	 * first. Check for action->next as well.
 	 */
-	while ((desc->status & IRQ_PENDING) && action) {
-		/*
-		 * Perform real IRQ processing for the IRQ we deferred
-		 */
-		work = 1;
+	action = desc->action;
+	if (!action || !(action->flags & IRQF_SHARED) || !action->next)
+		goto out;
+
+	/* Honour the normal IRQ locking */
+	desc->status |= IRQ_INPROGRESS;
+	do {
+		work++;
+		desc->status &= ~IRQ_PENDING;
 		raw_spin_unlock(&desc->lock);
-		handle_IRQ_event(irq, action);
+		if (handle_IRQ_event(irq, action) != IRQ_NONE)
+			ok = 1;
 		raw_spin_lock(&desc->lock);
-		desc->status &= ~IRQ_PENDING;
-	}
+		action = desc->action;
+	}  while ((desc->status & IRQ_PENDING) && action);
+
 	desc->status &= ~IRQ_INPROGRESS;
 	/*
 	 * If we did actual work for the real IRQ line we must let the
 	 * IRQ controller clean up too
 	 */
-	if (work)
+	if (work > 1)
 		irq_end(irq, desc);
-	raw_spin_unlock(&desc->lock);
 
+out:
+	raw_spin_unlock(&desc->lock);
 	return ok;
 }
 
-- 
cgit v0.10.2


From c7259cd7af757ddcd65701c37099dcddae2054f0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 09:52:27 +0100
Subject: genirq: Do not poll disabled, percpu and timer interrupts

There is no point in polling disabled lines.

percpu does not make sense at all because we only poll on the cpu
we're currently running on. Also polling per_cpu interrupts is racy as
hell. The handler runs without locking so we might get a huge
surprise.

If the timer interrupt needs polling, then we wont get there anyway.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 0af9e59..bd0e42d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -25,30 +25,42 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
 /*
  * Recovery handler for misrouted interrupts.
  */
-static int try_one_irq(int irq, struct irq_desc *desc)
+static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 {
 	struct irqaction *action;
 	int ok = 0, work = 0;
 
 	raw_spin_lock(&desc->lock);
+
+	/* PER_CPU and nested thread interrupts are never polled */
+	if (desc->status & (IRQ_PER_CPU | IRQ_NESTED_THREAD))
+		goto out;
+
+	/*
+	 * Do not poll disabled interrupts unless the spurious
+	 * disabled poller asks explicitely.
+	 */
+	if ((desc->status & IRQ_DISABLED) && !force)
+		goto out;
+
+	/*
+	 * All handlers must agree on IRQF_SHARED, so we test just the
+	 * first. Check for action->next as well.
+	 */
+	action = desc->action;
+	if (!action || !(action->flags & IRQF_SHARED) ||
+	    (action->flags & __IRQF_TIMER) || !action->next)
+		goto out;
+
 	/* Already running on another processor */
 	if (desc->status & IRQ_INPROGRESS) {
 		/*
 		 * Already running: If it is shared get the other
 		 * CPU to go looking for our mystery interrupt too
 		 */
-		if (desc->action && (desc->action->flags & IRQF_SHARED))
-			desc->status |= IRQ_PENDING;
-		raw_spin_unlock(&desc->lock);
-		return ok;
-	}
-	/*
-	 * All handlers must agree on IRQF_SHARED, so we test just the
-	 * first. Check for action->next as well.
-	 */
-	action = desc->action;
-	if (!action || !(action->flags & IRQF_SHARED) || !action->next)
+		desc->status |= IRQ_PENDING;
 		goto out;
+	}
 
 	/* Honour the normal IRQ locking */
 	desc->status |= IRQ_INPROGRESS;
@@ -87,7 +99,7 @@ static int misrouted_irq(int irq)
 		if (i == irq)	/* Already tried */
 			continue;
 
-		if (try_one_irq(i, desc))
+		if (try_one_irq(i, desc, false))
 			ok = 1;
 	}
 	/* So the caller can adjust the irq error counts */
@@ -112,7 +124,7 @@ static void poll_spurious_irqs(unsigned long dummy)
 			continue;
 
 		local_irq_disable();
-		try_one_irq(i, desc);
+		try_one_irq(i, desc, true);
 		local_irq_enable();
 	}
 
-- 
cgit v0.10.2


From d05c65fff0ef672be75429266751f0e015b54d94 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 14:31:37 +0100
Subject: genirq: spurious: Run only one poller at a time

No point in running concurrent pollers which confuse each other by
setting PENDING.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd0e42d..56ff8ff 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,6 +21,8 @@ static int irqfixup __read_mostly;
 #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
 static void poll_spurious_irqs(unsigned long dummy);
 static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+static int irq_poll_cpu;
+static atomic_t irq_poll_active;
 
 /*
  * Recovery handler for misrouted interrupts.
@@ -92,6 +94,11 @@ static int misrouted_irq(int irq)
 	struct irq_desc *desc;
 	int i, ok = 0;
 
+	if (atomic_inc_return(&irq_poll_active) == 1)
+		goto out;
+
+	irq_poll_cpu = smp_processor_id();
+
 	for_each_irq_desc(i, desc) {
 		if (!i)
 			 continue;
@@ -102,6 +109,8 @@ static int misrouted_irq(int irq)
 		if (try_one_irq(i, desc, false))
 			ok = 1;
 	}
+out:
+	atomic_dec(&irq_poll_active);
 	/* So the caller can adjust the irq error counts */
 	return ok;
 }
@@ -111,6 +120,10 @@ static void poll_spurious_irqs(unsigned long dummy)
 	struct irq_desc *desc;
 	int i;
 
+	if (atomic_inc_return(&irq_poll_active) != 1)
+		goto out;
+	irq_poll_cpu = smp_processor_id();
+
 	for_each_irq_desc(i, desc) {
 		unsigned int status;
 
@@ -127,7 +140,8 @@ static void poll_spurious_irqs(unsigned long dummy)
 		try_one_irq(i, desc, true);
 		local_irq_enable();
 	}
-
+out:
+	atomic_dec(&irq_poll_active);
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
-- 
cgit v0.10.2


From fe200ae48ef5c79bf7941fe8046ff9505c570ff6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 10:34:30 +0100
Subject: genirq: Mark polled irqs and defer the real handler

With the chip.end() function gone we might run into a situation where
a poll call runs and the real interrupt comes in, sees IRQ_INPROGRESS
and disables the line. That might be a perfect working one, which will
then be masked forever.

So mark them polled while the poll runs. When the real handler sees
IRQ_INPROGRESS it checks the poll flag and waits for the polling to
complete. Add the necessary amount of sanity checks to it to avoid
deadlocks.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index f5e9003..e32b64c 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,6 +71,7 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_SUSPENDED		0x04000000	/* IRQ has gone through suspend sequence */
 #define IRQ_ONESHOT		0x08000000	/* IRQ is not unmasked after hardirq */
 #define IRQ_NESTED_THREAD	0x10000000	/* IRQ is nested into another, no own handler thread */
+#define IRQ_POLL_INPROGRESS	0x20000000	/* IRQ poll is in progress */
 
 #define IRQF_MODIFY_MASK	\
 	(IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 622b55a..3125878 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -448,6 +448,13 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(handle_nested_irq);
 
+static bool irq_check_poll(struct irq_desc *desc)
+{
+	if (!(desc->status & IRQ_POLL_INPROGRESS))
+		return false;
+	return irq_wait_for_poll(desc);
+}
+
 /**
  *	handle_simple_irq - Simple and software-decoded IRQs.
  *	@irq:	the interrupt number
@@ -469,7 +476,9 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 	raw_spin_lock(&desc->lock);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
-		goto out_unlock;
+		if (!irq_check_poll(desc))
+			goto out_unlock;
+
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
 
@@ -510,7 +519,9 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	mask_ack_irq(desc);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
-		goto out_unlock;
+		if (!irq_check_poll(desc))
+			goto out_unlock;
+
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
 
@@ -558,7 +569,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 	raw_spin_lock(&desc->lock);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
-		goto out;
+		if (!irq_check_poll(desc))
+			goto out;
 
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
@@ -620,9 +632,11 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	 */
 	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
 		    !desc->action)) {
-		desc->status |= (IRQ_PENDING | IRQ_MASKED);
-		mask_ack_irq(desc);
-		goto out_unlock;
+		if (!irq_check_poll(desc)) {
+			desc->status |= (IRQ_PENDING | IRQ_MASKED);
+			mask_ack_irq(desc);
+			goto out_unlock;
+		}
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b5bfa24..0eff7e9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -28,6 +28,7 @@ extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 
 /* Resending of interrupts :*/
 void check_irq_resend(struct irq_desc *desc, unsigned int irq);
+bool irq_wait_for_poll(struct irq_desc *desc);
 
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -47,16 +48,6 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-static inline void irq_end(unsigned int irq, struct irq_desc *desc)
-{
-	if (desc->irq_data.chip && desc->irq_data.chip->end)
-		desc->irq_data.chip->end(irq);
-}
-#else
-static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
-#endif
-
 /* Inline functions for support of irq chips on slow busses */
 static inline void chip_bus_lock(struct irq_desc *desc)
 {
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 56ff8ff..f749d29 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -25,12 +25,44 @@ static int irq_poll_cpu;
 static atomic_t irq_poll_active;
 
 /*
+ * We wait here for a poller to finish.
+ *
+ * If the poll runs on this CPU, then we yell loudly and return
+ * false. That will leave the interrupt line disabled in the worst
+ * case, but it should never happen.
+ *
+ * We wait until the poller is done and then recheck disabled and
+ * action (about to be disabled). Only if it's still active, we return
+ * true and let the handler run.
+ */
+bool irq_wait_for_poll(struct irq_desc *desc)
+{
+	if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+		      "irq poll in progress on cpu %d for irq %d\n",
+		      smp_processor_id(), desc->irq_data.irq))
+		return false;
+
+#ifdef CONFIG_SMP
+	do {
+		raw_spin_unlock(&desc->lock);
+		while (desc->status & IRQ_INPROGRESS)
+			cpu_relax();
+		raw_spin_lock(&desc->lock);
+	} while (desc->status & IRQ_INPROGRESS);
+	/* Might have been disabled in meantime */
+	return !(desc->status & IRQ_DISABLED) && desc->action;
+#else
+	return false;
+#endif
+}
+
+/*
  * Recovery handler for misrouted interrupts.
  */
 static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 {
 	struct irqaction *action;
-	int ok = 0, work = 0;
+	int ok = 0;
 
 	raw_spin_lock(&desc->lock);
 
@@ -64,10 +96,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 		goto out;
 	}
 
-	/* Honour the normal IRQ locking */
-	desc->status |= IRQ_INPROGRESS;
+	/* Honour the normal IRQ locking and mark it poll in progress */
+	desc->status |= IRQ_INPROGRESS | IRQ_POLL_INPROGRESS;
 	do {
-		work++;
 		desc->status &= ~IRQ_PENDING;
 		raw_spin_unlock(&desc->lock);
 		if (handle_IRQ_event(irq, action) != IRQ_NONE)
@@ -76,14 +107,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 		action = desc->action;
 	}  while ((desc->status & IRQ_PENDING) && action);
 
-	desc->status &= ~IRQ_INPROGRESS;
-	/*
-	 * If we did actual work for the real IRQ line we must let the
-	 * IRQ controller clean up too
-	 */
-	if (work > 1)
-		irq_end(irq, desc);
-
+	desc->status &= ~(IRQ_INPROGRESS | IRQ_POLL_INPROGRESS);
 out:
 	raw_spin_unlock(&desc->lock);
 	return ok;
@@ -238,6 +262,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
 void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		    irqreturn_t action_ret)
 {
+	if (desc->status & IRQ_POLL_INPROGRESS)
+		return;
+
 	if (unlikely(action_ret != IRQ_HANDLED)) {
 		/*
 		 * If we are seeing only the odd spurious IRQ caused by
-- 
cgit v0.10.2


From 1535dfacbf21c4da1b73fcf07c39913da5bd5581 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 01:55:43 +0100
Subject: genirq: Move irq thread flags to core

Soleley used in core code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7834726..de97b95 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -74,20 +74,6 @@
 #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
 
 /*
- * Bits used by threaded handlers:
- * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
- * IRQTF_DIED      - handler thread died
- * IRQTF_WARNED    - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
- * IRQTF_AFFINITY  - irq thread is requested to adjust affinity
- */
-enum {
-	IRQTF_RUNTHREAD,
-	IRQTF_DIED,
-	IRQTF_WARNED,
-	IRQTF_AFFINITY,
-};
-
-/*
  * These values can be returned by request_any_context_irq() and
  * describe the context the interrupt will be run in.
  *
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 0eff7e9..b17c984 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -11,6 +11,20 @@
 
 extern int noirqdebug;
 
+/*
+ * Bits used by threaded handlers:
+ * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
+ * IRQTF_DIED      - handler thread died
+ * IRQTF_WARNED    - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
+ * IRQTF_AFFINITY  - irq thread is requested to adjust affinity
+ */
+enum {
+	IRQTF_RUNTHREAD,
+	IRQTF_DIED,
+	IRQTF_WARNED,
+	IRQTF_AFFINITY,
+};
+
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
 
 /* Set default functions for irq_chip structures: */
-- 
cgit v0.10.2


From 3b56f0585fd4c02d047dc406668cb40159b2d340 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 2 Feb 2011 21:41:12 +0000
Subject: genirq: Remove bogus conditional

The if (chip->irq_shutdown) check will always evaluate to true, as we
fill in chip->irq_shutdown with default_shutdown in
irq_chip_set_defaults() if the chip does not provide its own function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110202212551.667607458@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 33a6ee0..30bc8de 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1057,10 +1057,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	/* If this was the last handler, shut down the IRQ line: */
 	if (!desc->action) {
 		desc->status |= IRQ_DISABLED;
-		if (desc->irq_data.chip->irq_shutdown)
-			desc->irq_data.chip->irq_shutdown(&desc->irq_data);
-		else
-			desc->irq_data.chip->irq_disable(&desc->irq_data);
+		desc->irq_data.chip->irq_shutdown(&desc->irq_data);
 	}
 
 #ifdef CONFIG_SMP
-- 
cgit v0.10.2


From 4699923861513671d3f6ade8efb4e56a9a7ecadf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 2 Feb 2011 21:41:14 +0000
Subject: genirq: Consolidate startup/shutdown of interrupts

Aside of duplicated code some of the startup/shutdown sites do not
handle the MASKED/DISABLED flags and the depth field at all. Move that
to a helper function and take care of it there.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110202212551.787481468@linutronix.de>

diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f..08947cb 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -60,7 +60,7 @@ unsigned long probe_irq_on(void)
 			if (desc->irq_data.chip->irq_set_type)
 				desc->irq_data.chip->irq_set_type(&desc->irq_data,
 							 IRQ_TYPE_PROBE);
-			desc->irq_data.chip->irq_startup(&desc->irq_data);
+			irq_startup(desc);
 		}
 		raw_spin_unlock_irq(&desc->lock);
 	}
@@ -77,7 +77,7 @@ unsigned long probe_irq_on(void)
 		raw_spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
 			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
-			if (desc->irq_data.chip->irq_startup(&desc->irq_data))
+			if (irq_startup(desc))
 				desc->status |= IRQ_PENDING;
 		}
 		raw_spin_unlock_irq(&desc->lock);
@@ -99,7 +99,7 @@ unsigned long probe_irq_on(void)
 			/* It triggered already - consider it spurious. */
 			if (!(status & IRQ_WAITING)) {
 				desc->status = status & ~IRQ_AUTODETECT;
-				desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+				irq_shutdown(desc);
 			} else
 				if (i < 32)
 					mask |= 1 << i;
@@ -138,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val)
 				mask |= 1 << i;
 
 			desc->status = status & ~IRQ_AUTODETECT;
-			desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+			irq_shutdown(desc);
 		}
 		raw_spin_unlock_irq(&desc->lock);
 	}
@@ -182,7 +182,7 @@ int probe_irq_off(unsigned long val)
 				nr_of_irqs++;
 			}
 			desc->status = status & ~IRQ_AUTODETECT;
-			desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+			irq_shutdown(desc);
 		}
 		raw_spin_unlock_irq(&desc->lock);
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3125878..988fe7a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -192,6 +192,25 @@ void set_irq_nested_thread(unsigned int irq, int nest)
 }
 EXPORT_SYMBOL_GPL(set_irq_nested_thread);
 
+int irq_startup(struct irq_desc *desc)
+{
+	desc->status &= ~(IRQ_MASKED | IRQ_DISABLED);
+	desc->depth = 0;
+
+	if (desc->irq_data.chip->irq_startup)
+		return desc->irq_data.chip->irq_startup(&desc->irq_data);
+
+	desc->irq_data.chip->irq_enable(&desc->irq_data);
+	return 0;
+}
+
+void irq_shutdown(struct irq_desc *desc)
+{
+	desc->status |= IRQ_MASKED | IRQ_DISABLED;
+	desc->depth = 1;
+	desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+}
+
 /*
  * default enable function
  */
@@ -211,17 +230,6 @@ static void default_disable(struct irq_data *data)
 }
 
 /*
- * default startup function
- */
-static unsigned int default_startup(struct irq_data *data)
-{
-	struct irq_desc *desc = irq_data_to_desc(data);
-
-	desc->irq_data.chip->irq_enable(data);
-	return 0;
-}
-
-/*
  * default shutdown function
  */
 static void default_shutdown(struct irq_data *data)
@@ -229,7 +237,6 @@ static void default_shutdown(struct irq_data *data)
 	struct irq_desc *desc = irq_data_to_desc(data);
 
 	desc->irq_data.chip->irq_mask(&desc->irq_data);
-	desc->status |= IRQ_MASKED;
 }
 
 #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
@@ -337,8 +344,6 @@ void irq_chip_set_defaults(struct irq_chip *chip)
 		chip->irq_enable = default_enable;
 	if (!chip->irq_disable)
 		chip->irq_disable = default_disable;
-	if (!chip->irq_startup)
-		chip->irq_startup = default_startup;
 	/*
 	 * We use chip->irq_disable, when the user provided its own. When
 	 * we have default_disable set for chip->irq_disable, then we need
@@ -747,10 +752,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 	desc->name = name;
 
 	if (handle != handle_bad_irq && is_chained) {
-		desc->status &= ~IRQ_DISABLED;
 		desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
-		desc->depth = 0;
-		desc->irq_data.chip->irq_startup(&desc->irq_data);
+		irq_startup(desc);
 	}
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	chip_bus_sync_unlock(desc);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b17c984..5cbfc93 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -38,6 +38,9 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
 extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
 
+extern int irq_startup(struct irq_desc *desc);
+extern void irq_shutdown(struct irq_desc *desc);
+
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 
 /* Resending of interrupts :*/
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 30bc8de..9c56247 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -906,11 +906,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		if (new->flags & IRQF_ONESHOT)
 			desc->status |= IRQ_ONESHOT;
 
-		if (!(desc->status & IRQ_NOAUTOEN)) {
-			desc->depth = 0;
-			desc->status &= ~IRQ_DISABLED;
-			desc->irq_data.chip->irq_startup(&desc->irq_data);
-		} else
+		if (!(desc->status & IRQ_NOAUTOEN))
+			irq_startup(desc);
+		else
 			/* Undo nested disables: */
 			desc->depth = 1;
 
@@ -1055,10 +1053,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 #endif
 
 	/* If this was the last handler, shut down the IRQ line: */
-	if (!desc->action) {
-		desc->status |= IRQ_DISABLED;
-		desc->irq_data.chip->irq_shutdown(&desc->irq_data);
-	}
+	if (!desc->action)
+		irq_shutdown(desc);
 
 #ifdef CONFIG_SMP
 	/* make sure affinity_hint is cleaned up */
-- 
cgit v0.10.2


From 87923470c712dff00b101ffb6b6fbc27bd7a6df5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 3 Feb 2011 12:27:44 +0100
Subject: genirq: Consolidate disable/enable

Create irq_disable/enable and use them to keep the flags consistent.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 988fe7a..86c8e42 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -200,7 +200,7 @@ int irq_startup(struct irq_desc *desc)
 	if (desc->irq_data.chip->irq_startup)
 		return desc->irq_data.chip->irq_startup(&desc->irq_data);
 
-	desc->irq_data.chip->irq_enable(&desc->irq_data);
+	irq_enable(desc);
 	return 0;
 }
 
@@ -211,6 +211,16 @@ void irq_shutdown(struct irq_desc *desc)
 	desc->irq_data.chip->irq_shutdown(&desc->irq_data);
 }
 
+void irq_enable(struct irq_desc *desc)
+{
+	desc->irq_data.chip->irq_enable(&desc->irq_data);
+}
+
+void irq_disable(struct irq_desc *desc)
+{
+	desc->irq_data.chip->irq_disable(&desc->irq_data);
+}
+
 /*
  * default enable function
  */
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 5cbfc93..c71fc4de0 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -40,6 +40,8 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
 
 extern int irq_startup(struct irq_desc *desc);
 extern void irq_shutdown(struct irq_desc *desc);
+extern void irq_enable(struct irq_desc *desc);
+extern void irq_disable(struct irq_desc *desc);
 
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9c56247..0078023 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -331,7 +331,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
 
 	if (!desc->depth++) {
 		desc->status |= IRQ_DISABLED;
-		desc->irq_data.chip->irq_disable(&desc->irq_data);
+		irq_disable(desc);
 	}
 }
 
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index dc49358..4bfe268 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,20 +55,20 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
  */
 void check_irq_resend(struct irq_desc *desc, unsigned int irq)
 {
-	unsigned int status = desc->status;
-
 	/*
 	 * Make sure the interrupt is enabled, before resending it:
 	 */
-	desc->irq_data.chip->irq_enable(&desc->irq_data);
+	irq_enable(desc);
 
 	/*
 	 * We do not resend level type interrupts. Level type
 	 * interrupts are resent by hardware when they are still
 	 * active.
 	 */
-	if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
-		desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
+	if (desc->status & IRQ_LEVEL)
+		return;
+	if ((desc->status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+		desc->status = (desc->status & ~IRQ_PENDING) | IRQ_REPLAY;
 
 		if (!desc->irq_data.chip->irq_retrigger ||
 		    !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f749d29..c300b8f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -303,7 +303,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
 		desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
 		desc->depth++;
-		desc->irq_data.chip->irq_disable(&desc->irq_data);
+		irq_disable(desc);
 
 		mod_timer(&poll_spurious_irq_timer,
 			  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
-- 
cgit v0.10.2


From 50f7c0327513d5acefbe26fd33498af18d1ffac5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 3 Feb 2011 13:23:54 +0100
Subject: genirq: Remove default magic

Now that everything uses the wrappers, we can remove the default
functions. None of those functions is performance critical.

That makes the IRQ_MASKED flag tracking fully consistent.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 86c8e42..1a239a8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -208,45 +208,29 @@ void irq_shutdown(struct irq_desc *desc)
 {
 	desc->status |= IRQ_MASKED | IRQ_DISABLED;
 	desc->depth = 1;
-	desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+	if (desc->irq_data.chip->irq_shutdown)
+		desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+	if (desc->irq_data.chip->irq_disable)
+		desc->irq_data.chip->irq_disable(&desc->irq_data);
+	else
+		desc->irq_data.chip->irq_mask(&desc->irq_data);
 }
 
 void irq_enable(struct irq_desc *desc)
 {
-	desc->irq_data.chip->irq_enable(&desc->irq_data);
-}
-
-void irq_disable(struct irq_desc *desc)
-{
-	desc->irq_data.chip->irq_disable(&desc->irq_data);
-}
-
-/*
- * default enable function
- */
-static void default_enable(struct irq_data *data)
-{
-	struct irq_desc *desc = irq_data_to_desc(data);
-
-	desc->irq_data.chip->irq_unmask(&desc->irq_data);
+	if (desc->irq_data.chip->irq_enable)
+		desc->irq_data.chip->irq_enable(&desc->irq_data);
+	else
+		desc->irq_data.chip->irq_unmask(&desc->irq_data);
 	desc->status &= ~IRQ_MASKED;
 }
 
-/*
- * default disable function
- */
-static void default_disable(struct irq_data *data)
-{
-}
-
-/*
- * default shutdown function
- */
-static void default_shutdown(struct irq_data *data)
+void irq_disable(struct irq_desc *desc)
 {
-	struct irq_desc *desc = irq_data_to_desc(data);
-
-	desc->irq_data.chip->irq_mask(&desc->irq_data);
+	if (desc->irq_data.chip->irq_disable) {
+		desc->irq_data.chip->irq_disable(&desc->irq_data);
+		desc->status |= IRQ_MASKED;
+	}
 }
 
 #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
@@ -334,10 +318,6 @@ static void compat_bus_sync_unlock(struct irq_data *data)
 void irq_chip_set_defaults(struct irq_chip *chip)
 {
 #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-	/*
-	 * Compat fixup functions need to be before we set the
-	 * defaults for enable/disable/startup/shutdown
-	 */
 	if (chip->enable)
 		chip->irq_enable = compat_irq_enable;
 	if (chip->disable)
@@ -346,31 +326,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
 		chip->irq_shutdown = compat_irq_shutdown;
 	if (chip->startup)
 		chip->irq_startup = compat_irq_startup;
-#endif
-	/*
-	 * The real defaults
-	 */
-	if (!chip->irq_enable)
-		chip->irq_enable = default_enable;
-	if (!chip->irq_disable)
-		chip->irq_disable = default_disable;
-	/*
-	 * We use chip->irq_disable, when the user provided its own. When
-	 * we have default_disable set for chip->irq_disable, then we need
-	 * to use default_shutdown, otherwise the irq line is not
-	 * disabled on free_irq():
-	 */
-	if (!chip->irq_shutdown)
-		chip->irq_shutdown = chip->irq_disable != default_disable ?
-			chip->irq_disable : default_shutdown;
-
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
 	if (!chip->end)
 		chip->end = dummy_irq_chip.end;
-
-	/*
-	 * Now fix up the remaining compat handlers
-	 */
 	if (chip->bus_lock)
 		chip->irq_bus_lock = compat_bus_lock;
 	if (chip->bus_sync_unlock)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0078023..2a6c6ee 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -440,8 +440,8 @@ void enable_irq(unsigned int irq)
 	if (!desc)
 		return;
 
-	if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
-	    KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
+	if (WARN(!desc->irq_data.chip,
+		 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
 		return;
 
 	chip_bus_lock(desc);
-- 
cgit v0.10.2


From 3aae994fb0f43f6d94a31c33536a83869504abdf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 4 Feb 2011 10:17:52 +0100
Subject: genirq: Consolidate IRQ_DISABLED

Handle IRQ_DISABLED consistent.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 1a239a8..43c62ca 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -194,11 +194,14 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread);
 
 int irq_startup(struct irq_desc *desc)
 {
-	desc->status &= ~(IRQ_MASKED | IRQ_DISABLED);
+	desc->status &= ~IRQ_DISABLED;
 	desc->depth = 0;
 
-	if (desc->irq_data.chip->irq_startup)
-		return desc->irq_data.chip->irq_startup(&desc->irq_data);
+	if (desc->irq_data.chip->irq_startup) {
+		int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
+		desc->status &= ~IRQ_MASKED;
+		return ret;
+	}
 
 	irq_enable(desc);
 	return 0;
@@ -206,7 +209,7 @@ int irq_startup(struct irq_desc *desc)
 
 void irq_shutdown(struct irq_desc *desc)
 {
-	desc->status |= IRQ_MASKED | IRQ_DISABLED;
+	desc->status |= IRQ_DISABLED;
 	desc->depth = 1;
 	if (desc->irq_data.chip->irq_shutdown)
 		desc->irq_data.chip->irq_shutdown(&desc->irq_data);
@@ -214,10 +217,12 @@ void irq_shutdown(struct irq_desc *desc)
 		desc->irq_data.chip->irq_disable(&desc->irq_data);
 	else
 		desc->irq_data.chip->irq_mask(&desc->irq_data);
+	desc->status |= IRQ_MASKED;
 }
 
 void irq_enable(struct irq_desc *desc)
 {
+	desc->status &= ~IRQ_DISABLED;
 	if (desc->irq_data.chip->irq_enable)
 		desc->irq_data.chip->irq_enable(&desc->irq_data);
 	else
@@ -227,6 +232,7 @@ void irq_enable(struct irq_desc *desc)
 
 void irq_disable(struct irq_desc *desc)
 {
+	desc->status |= IRQ_DISABLED;
 	if (desc->irq_data.chip->irq_disable) {
 		desc->irq_data.chip->irq_disable(&desc->irq_data);
 		desc->status |= IRQ_MASKED;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2a6c6ee..78a566a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -329,10 +329,8 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
 		desc->status |= IRQ_SUSPENDED;
 	}
 
-	if (!desc->depth++) {
-		desc->status |= IRQ_DISABLED;
+	if (!desc->depth++)
 		irq_disable(desc);
-	}
 }
 
 /**
@@ -407,12 +405,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 		WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
 		break;
 	case 1: {
-		unsigned int status = desc->status & ~IRQ_DISABLED;
-
 		if (desc->status & IRQ_SUSPENDED)
 			goto err_out;
 		/* Prevent probing on this irq: */
-		desc->status = status | IRQ_NOPROBE;
+		desc->status |= IRQ_NOPROBE;
+		irq_enable(desc);
 		check_irq_resend(desc, irq);
 		/* fall-through */
 	}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 4bfe268..60b2026 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -56,11 +56,6 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
 void check_irq_resend(struct irq_desc *desc, unsigned int irq)
 {
 	/*
-	 * Make sure the interrupt is enabled, before resending it:
-	 */
-	irq_enable(desc);
-
-	/*
 	 * We do not resend level type interrupts. Level type
 	 * interrupts are resent by hardware when they are still
 	 * active.
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index c300b8f..89e5e16 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -301,7 +301,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		 * Now kill the IRQ
 		 */
 		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
-		desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
+		desc->status |= IRQ_SPURIOUS_DISABLED;
 		desc->depth++;
 		irq_disable(desc);
 
-- 
cgit v0.10.2


From d78f8dd36b90626106ce19cb2e6828b0dc39447e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 2 Feb 2011 21:41:17 +0000
Subject: genirq: Do not fiddle with IRQ_MASKED in handle_edge_irq()

IRQ_MASKED is set in mask_ack_irq() anyway. Remove it from
handle_edge_irq() to allow simpler ab^HHreuse of that function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110202212551.918484270@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 43c62ca..2c30b78 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -611,7 +611,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
 		    !desc->action)) {
 		if (!irq_check_poll(desc)) {
-			desc->status |= (IRQ_PENDING | IRQ_MASKED);
+			desc->status |= IRQ_PENDING;
 			mask_ack_irq(desc);
 			goto out_unlock;
 		}
-- 
cgit v0.10.2


From 4912609f228da4a3d2bfbdf0f31de3d9eab2b7f8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 01:08:49 +0100
Subject: genirq: Implement handle_irq_event()

Core code replacement for the ugly camel case. It contains all the
code which is shared in all handlers.

     clear status flags
     set INPROGRESS flag
     unlock
     call action chain
     note_interrupt
     lock
     clr INPROGRESS flag

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index cdd6fbb..4ef0594 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,14 +51,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
 	       "but no thread function available.", irq, action->name);
 }
 
-/**
- * handle_IRQ_event - irq action chain handler
- * @irq:	the interrupt number
- * @action:	the interrupt action chain for this irq
- *
- * Handles the action chain of an irq event
- */
-irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
+static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action)
 {
 	irqreturn_t ret, retval = IRQ_NONE;
 	unsigned int status = 0;
@@ -120,3 +113,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 
 	return retval;
 }
+
+irqreturn_t
+handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
+{
+	irqreturn_t ret = __handle_irq_event(desc->irq_data.irq, action);
+
+	if (!noirqdebug)
+		note_interrupt(desc->irq_data.irq, desc, ret);
+	return ret;
+}
+
+irqreturn_t handle_irq_event(struct irq_desc *desc)
+{
+	struct irqaction *action = desc->action;
+	irqreturn_t ret;
+
+	desc->status &= ~IRQ_PENDING;
+	desc->status |= IRQ_INPROGRESS;
+	raw_spin_unlock(&desc->lock);
+
+	ret = handle_irq_event_percpu(desc, action);
+
+	raw_spin_lock(&desc->lock);
+	desc->status &= ~IRQ_INPROGRESS;
+	return ret;
+}
+
+/**
+ * handle_IRQ_event - irq action chain handler
+ * @irq:	the interrupt number
+ * @action:	the interrupt action chain for this irq
+ *
+ * Handles the action chain of an irq event
+ */
+irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
+{
+	return __handle_irq_event(irq, action);
+}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c71fc4de0..b61824c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -45,6 +45,9 @@ extern void irq_disable(struct irq_desc *desc);
 
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
+irqreturn_t handle_irq_event(struct irq_desc *desc);
+
 /* Resending of interrupts :*/
 void check_irq_resend(struct irq_desc *desc, unsigned int irq);
 bool irq_wait_for_poll(struct irq_desc *desc);
-- 
cgit v0.10.2


From 107781e72192067b95a7d373bfa460434a13c6ae Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 01:21:02 +0100
Subject: genirq: Use handle_irq_event() in handle_simple_irq()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2c30b78..809a03f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -448,9 +448,6 @@ static bool irq_check_poll(struct irq_desc *desc)
 void
 handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 {
-	struct irqaction *action;
-	irqreturn_t action_ret;
-
 	raw_spin_lock(&desc->lock);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -460,19 +457,11 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
 
-	action = desc->action;
-	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+	if (unlikely(!desc->action || (desc->status & IRQ_DISABLED)))
 		goto out_unlock;
 
-	desc->status |= IRQ_INPROGRESS;
-	raw_spin_unlock(&desc->lock);
+	handle_irq_event(desc);
 
-	action_ret = handle_IRQ_event(irq, action);
-	if (!noirqdebug)
-		note_interrupt(irq, desc, action_ret);
-
-	raw_spin_lock(&desc->lock);
-	desc->status &= ~IRQ_INPROGRESS;
 out_unlock:
 	raw_spin_unlock(&desc->lock);
 }
-- 
cgit v0.10.2


From 1529866c63d789925de9b4250646d82d033e4b95 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 01:22:17 +0100
Subject: genirq: Use handle_irq_event() in handle_level_irq()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 809a03f..2d2ba4a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -479,9 +479,6 @@ out_unlock:
 void
 handle_level_irq(unsigned int irq, struct irq_desc *desc)
 {
-	struct irqaction *action;
-	irqreturn_t action_ret;
-
 	raw_spin_lock(&desc->lock);
 	mask_ack_irq(desc);
 
@@ -496,19 +493,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	 * If its disabled or no action available
 	 * keep it masked and get out of here
 	 */
-	action = desc->action;
-	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+	if (unlikely(!desc->action || (desc->status & IRQ_DISABLED)))
 		goto out_unlock;
 
-	desc->status |= IRQ_INPROGRESS;
-	raw_spin_unlock(&desc->lock);
-
-	action_ret = handle_IRQ_event(irq, action);
-	if (!noirqdebug)
-		note_interrupt(irq, desc, action_ret);
-
-	raw_spin_lock(&desc->lock);
-	desc->status &= ~IRQ_INPROGRESS;
+	handle_irq_event(desc);
 
 	if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
 		unmask_irq(desc);
-- 
cgit v0.10.2


From a7ae4de5c8ae8110556f0f9c7241093ef984605c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 01:23:07 +0100
Subject: genirq: Use handle_irq_event() in handle_fasteoi_irq()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2d2ba4a..a499ca5 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -518,9 +518,6 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
 void
 handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 {
-	struct irqaction *action;
-	irqreturn_t action_ret;
-
 	raw_spin_lock(&desc->lock);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -534,26 +531,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 	 * If its disabled or no action available
 	 * then mask it and get out of here:
 	 */
-	action = desc->action;
-	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
+	if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) {
 		desc->status |= IRQ_PENDING;
 		mask_irq(desc);
 		goto out;
 	}
-
-	desc->status |= IRQ_INPROGRESS;
-	desc->status &= ~IRQ_PENDING;
-	raw_spin_unlock(&desc->lock);
-
-	action_ret = handle_IRQ_event(irq, action);
-	if (!noirqdebug)
-		note_interrupt(irq, desc, action_ret);
-
-	raw_spin_lock(&desc->lock);
-	desc->status &= ~IRQ_INPROGRESS;
+	handle_irq_event(desc);
 out:
 	desc->irq_data.chip->irq_eoi(&desc->irq_data);
-
 	raw_spin_unlock(&desc->lock);
 }
 
-- 
cgit v0.10.2


From a60a5dc2db3b08b3c2900614c43b1262410c2d8c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 01:24:07 +0100
Subject: genirq: Use handle_irq_event() in handle_edge_irq()

It's safe to drop the IRQ_INPROGRESS flag between action chain walks
as we are protected by desc->lock.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a499ca5..3ccff4d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -583,14 +583,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	/* Start handling the irq */
 	desc->irq_data.chip->irq_ack(&desc->irq_data);
 
-	/* Mark the IRQ currently in progress.*/
-	desc->status |= IRQ_INPROGRESS;
-
 	do {
-		struct irqaction *action = desc->action;
-		irqreturn_t action_ret;
-
-		if (unlikely(!action)) {
+		if (unlikely(!desc->action)) {
 			mask_irq(desc);
 			goto out_unlock;
 		}
@@ -606,16 +600,10 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 			unmask_irq(desc);
 		}
 
-		desc->status &= ~IRQ_PENDING;
-		raw_spin_unlock(&desc->lock);
-		action_ret = handle_IRQ_event(irq, action);
-		if (!noirqdebug)
-			note_interrupt(irq, desc, action_ret);
-		raw_spin_lock(&desc->lock);
+		handle_irq_event(desc);
 
 	} while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
 
-	desc->status &= ~IRQ_INPROGRESS;
 out_unlock:
 	raw_spin_unlock(&desc->lock);
 }
-- 
cgit v0.10.2


From 849f061c25f8951d11c7dd88f44950ccde296392 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 01:25:41 +0100
Subject: genirq: Use handle_perpcu_event() in handle_percpu_irq()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3ccff4d..52b10ad 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -618,19 +618,17 @@ out_unlock:
 void
 handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 {
-	irqreturn_t action_ret;
+	struct irq_chip *chip = get_irq_desc_chip(desc);
 
 	kstat_incr_irqs_this_cpu(irq, desc);
 
-	if (desc->irq_data.chip->irq_ack)
-		desc->irq_data.chip->irq_ack(&desc->irq_data);
+	if (chip->irq_ack)
+		chip->irq_ack(&desc->irq_data);
 
-	action_ret = handle_IRQ_event(irq, desc->action);
-	if (!noirqdebug)
-		note_interrupt(irq, desc, action_ret);
+	handle_irq_event_percpu(desc, desc->action);
 
-	if (desc->irq_data.chip->irq_eoi)
-		desc->irq_data.chip->irq_eoi(&desc->irq_data);
+	if (chip->irq_eoi)
+		chip->irq_eoi(&desc->irq_data);
 }
 
 void
-- 
cgit v0.10.2


From 0877d66257082ce86fca8f9826b91870575b272c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 01:29:15 +0100
Subject: genirq: Use handle_irq_event() in the spurious poll code

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89e5e16..bc06207 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -56,13 +56,14 @@ bool irq_wait_for_poll(struct irq_desc *desc)
 #endif
 }
 
+
 /*
  * Recovery handler for misrouted interrupts.
  */
 static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 {
+	irqreturn_t ret = IRQ_NONE;
 	struct irqaction *action;
-	int ok = 0;
 
 	raw_spin_lock(&desc->lock);
 
@@ -96,21 +97,17 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 		goto out;
 	}
 
-	/* Honour the normal IRQ locking and mark it poll in progress */
-	desc->status |= IRQ_INPROGRESS | IRQ_POLL_INPROGRESS;
+	/* Mark it poll in progress */
+	desc->status |= IRQ_POLL_INPROGRESS;
 	do {
-		desc->status &= ~IRQ_PENDING;
-		raw_spin_unlock(&desc->lock);
-		if (handle_IRQ_event(irq, action) != IRQ_NONE)
-			ok = 1;
-		raw_spin_lock(&desc->lock);
+		if (handle_irq_event(desc) == IRQ_HANDLED)
+			ret = IRQ_HANDLED;
 		action = desc->action;
-	}  while ((desc->status & IRQ_PENDING) && action);
-
-	desc->status &= ~(IRQ_INPROGRESS | IRQ_POLL_INPROGRESS);
+	} while ((desc->status & IRQ_PENDING) && action);
+	desc->status &= ~IRQ_POLL_INPROGRESS;
 out:
 	raw_spin_unlock(&desc->lock);
-	return ok;
+	return ret == IRQ_HANDLED;
 }
 
 static int misrouted_irq(int irq)
-- 
cgit v0.10.2


From 1277a5325adfc53caac7dd3dac5d3d2fd2a125b4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 01:40:27 +0100
Subject: genirq: Simplify handle_irq_event()

Now that all core users are converted one layer can go.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 4ef0594..ff40e0f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,10 +51,11 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
 	       "but no thread function available.", irq, action->name);
 }
 
-static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action)
+irqreturn_t
+handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 {
 	irqreturn_t ret, retval = IRQ_NONE;
-	unsigned int status = 0;
+	unsigned int status = 0, irq = desc->irq_data.irq;
 
 	do {
 		trace_irq_handler_entry(irq, action);
@@ -111,17 +112,9 @@ static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action
 	if (status & IRQF_SAMPLE_RANDOM)
 		add_interrupt_randomness(irq);
 
-	return retval;
-}
-
-irqreturn_t
-handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
-{
-	irqreturn_t ret = __handle_irq_event(desc->irq_data.irq, action);
-
 	if (!noirqdebug)
-		note_interrupt(desc->irq_data.irq, desc, ret);
-	return ret;
+		note_interrupt(irq, desc, ret);
+	return retval;
 }
 
 irqreturn_t handle_irq_event(struct irq_desc *desc)
@@ -149,5 +142,5 @@ irqreturn_t handle_irq_event(struct irq_desc *desc)
  */
 irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 {
-	return __handle_irq_event(irq, action);
+	return handle_irq_event_percpu(irq_to_desc(irq), action);
 }
-- 
cgit v0.10.2


From c78b9b65faa291def628dbd8539649f58299f0f3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Dec 2010 17:21:47 +0100
Subject: genirq: Implement generic irq_show_interrupts()

All archs implement show_interrupts() in more or less the same
way. That's tons of duplicated code with different bugs with no
value. Implement a generic version and deprecate show_interrupts()

Unfortunately we need some ifdeffery for !GENERIC_HARDIRQ archs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index de97b95..8da6643 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -671,6 +671,7 @@ static inline void init_irq_proc(void)
 
 struct seq_file;
 int show_interrupts(struct seq_file *p, void *v);
+int arch_show_interrupts(struct seq_file *p, int prec);
 
 extern int early_irq_init(void);
 extern int arch_probe_nr_irqs(void);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8e42fec..4cd5d71 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -20,6 +20,9 @@ config HAVE_SPARSE_IRQ
 config GENERIC_IRQ_PROBE
 	def_bool n
 
+config GENERIC_IRQ_SHOW
+       def_bool n
+
 config GENERIC_PENDING_IRQ
 	def_bool n
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a46bd76..2644923 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
 
 #include "internals.h"
 
@@ -357,3 +358,65 @@ void init_irq_proc(void)
 	}
 }
 
+#ifdef CONFIG_GENERIC_IRQ_SHOW
+
+int __weak arch_show_interrupts(struct seq_file *p, int prec)
+{
+	return 0;
+}
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+	static int prec;
+
+	unsigned long flags, any_count = 0;
+	int i = *(loff_t *) v, j;
+	struct irqaction *action;
+	struct irq_desc *desc;
+
+	if (i > nr_irqs)
+		return 0;
+
+	if (i == nr_irqs)
+		return arch_show_interrupts(p, prec);
+
+	/* print header and calculate the width of the first column */
+	if (i == 0) {
+		for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+			j *= 10;
+
+		seq_printf(p, "%*s", prec + 8, "");
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%-8d", j);
+		seq_putc(p, '\n');
+	}
+
+	desc = irq_to_desc(i);
+	if (!desc)
+		return 0;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	for_each_online_cpu(j)
+		any_count |= kstat_irqs_cpu(i, j);
+	action = desc->action;
+	if (!action && !any_count)
+		goto out;
+
+	seq_printf(p, "%*d: ", prec, i);
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+	seq_printf(p, " %8s", desc->irq_data.chip->name);
+	seq_printf(p, "-%-8s", desc->name);
+
+	if (action) {
+		seq_printf(p, "  %s", action->name);
+		while ((action = action->next) != NULL)
+			seq_printf(p, ", %s", action->name);
+	}
+
+	seq_putc(p, '\n');
+out:
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	return 0;
+}
+#endif
-- 
cgit v0.10.2


From 35e857cbeb24e75c6f9a9312ac30454eee8c5950 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 12:20:23 +0100
Subject: genirq: Fixup core code namespace fallout

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 52b10ad..143eb2a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -618,7 +618,7 @@ out_unlock:
 void
 handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 {
-	struct irq_chip *chip = get_irq_desc_chip(desc);
+	struct irq_chip *chip = irq_desc_get_chip(desc);
 
 	kstat_incr_irqs_this_cpu(irq, desc);
 
@@ -685,7 +685,7 @@ void
 set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
 			 irq_flow_handler_t handle)
 {
-	set_irq_chip(irq, chip);
+	irq_set_chip(irq, chip);
 	__set_irq_handler(irq, handle, 0, NULL);
 }
 
@@ -693,7 +693,7 @@ void
 set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
 			      irq_flow_handler_t handle, const char *name)
 {
-	set_irq_chip(irq, chip);
+	irq_set_chip(irq, chip);
 	__set_irq_handler(irq, handle, 0, name);
 }
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 78a566a..dd4e5c2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
 static int
 setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 {
-	struct irq_chip *chip = get_irq_desc_chip(desc);
+	struct irq_chip *chip = irq_desc_get_chip(desc);
 	struct cpumask *set = irq_default_affinity;
 	int ret;
 
-- 
cgit v0.10.2


From dbec07bac614a61e3392c1e7c08cc6a49ad43f7a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 20:19:55 +0100
Subject: genirq: Add internal state field to irq_desc

That field will contain internal state information which is not going
to be exposed to anything outside the core code - except via accessor
functions. I'm tired of everyone fiddling in irq_desc.status.

core_internal_state__do_not_mess_with_it is clear enough, annoying to
type and easy to grep for. Offenders will be tracked down and slapped
with stinking trouts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 64794de..782bf98 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -19,6 +19,7 @@ struct timer_rand_state;
  * @handle_irq:		highlevel irq-events handler [if NULL, __do_IRQ()]
  * @action:		the irq action chain
  * @status:		status information
+ * @core_internal_state__do_not_mess_with_it: core internal status information
  * @depth:		disable-depth, for nested irq_disable() calls
  * @wake_depth:		enable depth, for multiple set_irq_wake() callers
  * @irq_count:		stats field to detect stalled irqs
@@ -63,7 +64,7 @@ struct irq_desc {
 	irq_flow_handler_t	handle_irq;
 	struct irqaction	*action;	/* IRQ action list */
 	unsigned int		status;		/* IRQ status */
-
+	unsigned int		core_internal_state__do_not_mess_with_it;
 	unsigned int		depth;		/* nested irq disables */
 	unsigned int		wake_depth;	/* nested wake enables */
 	unsigned int		irq_count;	/* For detecting broken IRQs */
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b61824c..ae96e68 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,5 +1,9 @@
 /*
  * IRQ subsystem internal functions and variables:
+ *
+ * Do not ever include this file from anything else than
+ * kernel/irq/. Do not even think about using any information outside
+ * of this file for your non core code.
  */
 #include <linux/irqdesc.h>
 
@@ -9,6 +13,8 @@
 # define IRQ_BITMAP_BITS	NR_IRQS
 #endif
 
+#define istate core_internal_state__do_not_mess_with_it
+
 extern int noirqdebug;
 
 /*
-- 
cgit v0.10.2


From e6bea9c404699223322d7411c6f2ceaec02fa83c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 9 Feb 2011 13:16:52 +0100
Subject: genirq: Protect tglx from tripping over his own feet

The irq_desc.status field will either go away or renamed to
settings. Anyway we need to maintain compatibility to avoid breaking
the world and some more. While moving bits into the core, I need to
avoid that I use any of the still existing IRQ_ bits in the core code
by typos. So that file will hold the inline wrappers and some nasty
CPP tricks to break the build when typoed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ae96e68..8f20031 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -13,6 +13,8 @@
 # define IRQ_BITMAP_BITS	NR_IRQS
 #endif
 
+#include "settings.h"
+
 #define istate core_internal_state__do_not_mess_with_it
 
 extern int noirqdebug;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 6f6644f..8b87f2c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -79,7 +79,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
 	desc->irq_data.chip_data = NULL;
 	desc->irq_data.handler_data = NULL;
 	desc->irq_data.msi_desc = NULL;
-	desc->status = IRQ_DEFAULT_INIT_FLAGS;
+	desc->status = _IRQ_DEFAULT_INIT_FLAGS;
 	desc->handle_irq = handle_bad_irq;
 	desc->depth = 1;
 	desc->irq_count = 0;
@@ -246,7 +246,7 @@ int __init early_irq_init(void)
 
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
-		.status		= IRQ_DEFAULT_INIT_FLAGS,
+		.status		= _IRQ_DEFAULT_INIT_FLAGS,
 		.handle_irq	= handle_bad_irq,
 		.depth		= 1,
 		.lock		= __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 0000000..610f555
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,7 @@
+/*
+ * Internal header to deal with irq_desc->status which will be renamed
+ * to irq_desc->settings.
+ */
+enum {
+	_IRQ_DEFAULT_INIT_FLAGS	= IRQ_DEFAULT_INIT_FLAGS,
+};
-- 
cgit v0.10.2


From bd062e7667ac173afef57fbfe9327f3b914a9d4c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 20:25:25 +0100
Subject: genirq: Move IRQ_AUTODETECT to internal state

No users outside of core

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index e32b64c..d1f9c35 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -54,7 +54,6 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_DISABLED		0x00000200	/* IRQ disabled - do not enter! */
 #define IRQ_PENDING		0x00000400	/* IRQ pending - replay on enable */
 #define IRQ_REPLAY		0x00000800	/* IRQ has been replayed but not acked yet */
-#define IRQ_AUTODETECT		0x00001000	/* IRQ is being autodetected */
 #define IRQ_WAITING		0x00002000	/* IRQ not yet seen - for autodetection */
 #define IRQ_LEVEL		0x00004000	/* IRQ level triggered */
 #define IRQ_MASKED		0x00008000	/* IRQ masked - shouldn't be seen again */
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 08947cb..916e56e 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
 {
 	struct irq_desc *desc;
 	unsigned long mask = 0;
-	unsigned int status;
 	int i;
 
 	/*
@@ -76,7 +75,8 @@ unsigned long probe_irq_on(void)
 	for_each_irq_desc_reverse(i, desc) {
 		raw_spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
-			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
+			desc->istate |= IRQS_AUTODETECT;
+			desc->status |= IRQ_WAITING;
 			if (irq_startup(desc))
 				desc->status |= IRQ_PENDING;
 		}
@@ -93,12 +93,11 @@ unsigned long probe_irq_on(void)
 	 */
 	for_each_irq_desc(i, desc) {
 		raw_spin_lock_irq(&desc->lock);
-		status = desc->status;
 
-		if (status & IRQ_AUTODETECT) {
+		if (desc->istate & IRQS_AUTODETECT) {
 			/* It triggered already - consider it spurious. */
-			if (!(status & IRQ_WAITING)) {
-				desc->status = status & ~IRQ_AUTODETECT;
+			if (!(desc->status & IRQ_WAITING)) {
+				desc->istate &= ~IRQS_AUTODETECT;
 				irq_shutdown(desc);
 			} else
 				if (i < 32)
@@ -125,19 +124,17 @@ EXPORT_SYMBOL(probe_irq_on);
  */
 unsigned int probe_irq_mask(unsigned long val)
 {
-	unsigned int status, mask = 0;
+	unsigned int mask = 0;
 	struct irq_desc *desc;
 	int i;
 
 	for_each_irq_desc(i, desc) {
 		raw_spin_lock_irq(&desc->lock);
-		status = desc->status;
-
-		if (status & IRQ_AUTODETECT) {
-			if (i < 16 && !(status & IRQ_WAITING))
+		if (desc->istate & IRQS_AUTODETECT) {
+			if (i < 16 && !(desc->status & IRQ_WAITING))
 				mask |= 1 << i;
 
-			desc->status = status & ~IRQ_AUTODETECT;
+			desc->istate &= ~IRQS_AUTODETECT;
 			irq_shutdown(desc);
 		}
 		raw_spin_unlock_irq(&desc->lock);
@@ -169,19 +166,17 @@ int probe_irq_off(unsigned long val)
 {
 	int i, irq_found = 0, nr_of_irqs = 0;
 	struct irq_desc *desc;
-	unsigned int status;
 
 	for_each_irq_desc(i, desc) {
 		raw_spin_lock_irq(&desc->lock);
-		status = desc->status;
 
-		if (status & IRQ_AUTODETECT) {
-			if (!(status & IRQ_WAITING)) {
+		if (desc->istate & IRQS_AUTODETECT) {
+			if (!(desc->status & IRQ_WAITING)) {
 				if (!nr_of_irqs)
 					irq_found = i;
 				nr_of_irqs++;
 			}
-			desc->status = status & ~IRQ_AUTODETECT;
+			desc->istate &= ~IRQS_AUTODETECT;
 			irq_shutdown(desc);
 		}
 		raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8f20031..7ffd4f4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -33,6 +33,15 @@ enum {
 	IRQTF_AFFINITY,
 };
 
+/*
+ * Bit masks for desc->state
+ *
+ * IRQS_AUTODETECT		- autodetection in progress
+ */
+enum {
+	IRQS_AUTODETECT		= 0x00000001,
+};
+
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
 
 /* Set default functions for irq_chip structures: */
@@ -98,6 +107,7 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
 #include <linux/kallsyms.h>
 
 #define P(f) if (desc->status & f) printk("%14s set\n", #f)
+#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
 
 static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 {
@@ -117,7 +127,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 	P(IRQ_DISABLED);
 	P(IRQ_PENDING);
 	P(IRQ_REPLAY);
-	P(IRQ_AUTODETECT);
 	P(IRQ_WAITING);
 	P(IRQ_LEVEL);
 	P(IRQ_MASKED);
@@ -127,7 +136,9 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 	P(IRQ_NOPROBE);
 	P(IRQ_NOREQUEST);
 	P(IRQ_NOAUTOEN);
+
+	PS(IRQS_AUTODETECT);
 }
 
 #undef P
-
+#undef PS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index dd4e5c2..abe852c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -897,8 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			desc->status |= IRQ_PER_CPU;
 #endif
 
-		desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
+		desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT |
 				  IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
+		desc->istate &= ~IRQS_AUTODETECT;
 
 		if (new->flags & IRQF_ONESHOT)
 			desc->status |= IRQ_ONESHOT;
-- 
cgit v0.10.2


From 7acdd53e5b2c55b6f7e3427e85e2f91fa814a4f9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 20:40:54 +0100
Subject: genirq: Move IRQ_SPURIOUS_DISABLED to core state

No users outside.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d1f9c35..a900741 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -64,7 +64,6 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_WAKEUP		0x00100000	/* IRQ triggers system wakeup */
 #define IRQ_MOVE_PENDING	0x00200000	/* need to re-target IRQ destination */
 #define IRQ_NO_BALANCING	0x00400000	/* IRQ is excluded from balancing */
-#define IRQ_SPURIOUS_DISABLED	0x00800000	/* IRQ was disabled by the spurious trap */
 #define IRQ_MOVE_PCNTXT		0x01000000	/* IRQ migration from process context */
 #define IRQ_AFFINITY_SET	0x02000000	/* IRQ affinity was set from userspace*/
 #define IRQ_SUSPENDED		0x04000000	/* IRQ has gone through suspend sequence */
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 7ffd4f4..dc5e21b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -37,9 +37,12 @@ enum {
  * Bit masks for desc->state
  *
  * IRQS_AUTODETECT		- autodetection in progress
+ * IRQS_SPURIOUS_DISABLED	- was disabled due to spurious interrupt
+ *				  detection
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
+	IRQS_SPURIOUS_DISABLED	= 0x00000002,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index abe852c..5b918ff 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -897,9 +897,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			desc->status |= IRQ_PER_CPU;
 #endif
 
-		desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT |
-				  IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
-		desc->istate &= ~IRQS_AUTODETECT;
+		desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS);
+		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED);
 
 		if (new->flags & IRQF_ONESHOT)
 			desc->status |= IRQ_ONESHOT;
@@ -937,8 +936,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	 * Check whether we disabled the irq via the spurious handler
 	 * before. Reenable it and give it another chance.
 	 */
-	if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
-		desc->status &= ~IRQ_SPURIOUS_DISABLED;
+	if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
+		desc->istate &= ~IRQS_SPURIOUS_DISABLED;
 		__enable_irq(desc, irq, false);
 	}
 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bc06207..2941d8a 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -146,15 +146,15 @@ static void poll_spurious_irqs(unsigned long dummy)
 	irq_poll_cpu = smp_processor_id();
 
 	for_each_irq_desc(i, desc) {
-		unsigned int status;
+		unsigned int state;
 
 		if (!i)
 			 continue;
 
 		/* Racy but it doesn't matter */
-		status = desc->status;
+		state = desc->istate;
 		barrier();
-		if (!(status & IRQ_SPURIOUS_DISABLED))
+		if (!(state & IRQS_SPURIOUS_DISABLED))
 			continue;
 
 		local_irq_disable();
@@ -298,7 +298,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		 * Now kill the IRQ
 		 */
 		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
-		desc->status |= IRQ_SPURIOUS_DISABLED;
+		desc->istate |= IRQS_SPURIOUS_DISABLED;
 		desc->depth++;
 		irq_disable(desc);
 
-- 
cgit v0.10.2


From 6f91a52d9bb28396177662f1da0f2e2cef9cf5d0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 14 Feb 2011 13:33:16 +0100
Subject: genirq: Use modify_status for set_irq_nested_thread

No need for a separate function in the core code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index a900741..67b77cf 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -74,7 +74,7 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQF_MODIFY_MASK	\
 	(IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
 	 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
-	 IRQ_PER_CPU)
+	 IRQ_PER_CPU | IRQ_NESTED_THREAD)
 
 #ifdef CONFIG_IRQ_PER_CPU
 # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
@@ -307,8 +307,6 @@ set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle)
 	__set_irq_handler(irq, handle, 1, NULL);
 }
 
-extern void set_irq_nested_thread(unsigned int irq, int nest);
-
 void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set);
 
 static inline void irq_set_status_flags(unsigned int irq, unsigned long set)
@@ -331,6 +329,14 @@ static inline void irq_set_probe(unsigned int irq)
 	irq_modify_status(irq, IRQ_NOPROBE, 0);
 }
 
+static inline void irq_set_nested_thread(unsigned int irq, bool nest)
+{
+	if (nest)
+		irq_set_status_flags(irq, IRQ_NESTED_THREAD);
+	else
+		irq_clear_status_flags(irq, IRQ_NESTED_THREAD);
+}
+
 /* Handle dynamic irq creation and destruction */
 extern unsigned int create_irq_nr(unsigned int irq_want, int node);
 extern int create_irq(void);
@@ -448,6 +454,10 @@ static inline void set_irq_probe(unsigned int irq)
 {
 	irq_set_probe(irq);
 }
+static inline void set_irq_nested_thread(unsigned int irq, int nest)
+{
+	irq_set_nested_thread(irq, nest);
+}
 #endif
 
 int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 143eb2a..bff21f2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -164,34 +164,6 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
 }
 EXPORT_SYMBOL_GPL(irq_get_irq_data);
 
-/**
- *	set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
- *
- *	@irq:	Interrupt number
- *	@nest:	0 to clear / 1 to set the IRQ_NESTED_THREAD flag
- *
- *	The IRQ_NESTED_THREAD flag indicates that on
- *	request_threaded_irq() no separate interrupt thread should be
- *	created for the irq as the handler are called nested in the
- *	context of a demultiplexing interrupt handler thread.
- */
-void set_irq_nested_thread(unsigned int irq, int nest)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-	unsigned long flags;
-
-	if (!desc)
-		return;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	if (nest)
-		desc->status |= IRQ_NESTED_THREAD;
-	else
-		desc->status &= ~IRQ_NESTED_THREAD;
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-EXPORT_SYMBOL_GPL(set_irq_nested_thread);
-
 int irq_startup(struct irq_desc *desc)
 {
 	desc->status &= ~IRQ_DISABLED;
-- 
cgit v0.10.2


From 6954b75b488dd740950573f244ddd66fd28620aa Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 20:55:35 +0100
Subject: genirq: Move IRQ_POLL_INPROGRESS to core

No users outside of core.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 67b77cf..047a695 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -69,7 +69,6 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_SUSPENDED		0x04000000	/* IRQ has gone through suspend sequence */
 #define IRQ_ONESHOT		0x08000000	/* IRQ is not unmasked after hardirq */
 #define IRQ_NESTED_THREAD	0x10000000	/* IRQ is nested into another, no own handler thread */
-#define IRQ_POLL_INPROGRESS	0x20000000	/* IRQ poll is in progress */
 
 #define IRQF_MODIFY_MASK	\
 	(IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index bff21f2..34245e7 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -400,7 +400,7 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
 
 static bool irq_check_poll(struct irq_desc *desc)
 {
-	if (!(desc->status & IRQ_POLL_INPROGRESS))
+	if (!(desc->istate & IRQS_POLL_INPROGRESS))
 		return false;
 	return irq_wait_for_poll(desc);
 }
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index dc5e21b..f5d28e1 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -39,10 +39,12 @@ enum {
  * IRQS_AUTODETECT		- autodetection in progress
  * IRQS_SPURIOUS_DISABLED	- was disabled due to spurious interrupt
  *				  detection
+ * IRQS_POLL_INPROGRESS		- polling in progress
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
 	IRQS_SPURIOUS_DISABLED	= 0x00000002,
+	IRQS_POLL_INPROGRESS	= 0x00000008,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 2941d8a..21c4617 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -98,13 +98,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 	}
 
 	/* Mark it poll in progress */
-	desc->status |= IRQ_POLL_INPROGRESS;
+	desc->istate |= IRQS_POLL_INPROGRESS;
 	do {
 		if (handle_irq_event(desc) == IRQ_HANDLED)
 			ret = IRQ_HANDLED;
 		action = desc->action;
 	} while ((desc->status & IRQ_PENDING) && action);
-	desc->status &= ~IRQ_POLL_INPROGRESS;
+	desc->istate &= ~IRQS_POLL_INPROGRESS;
 out:
 	raw_spin_unlock(&desc->lock);
 	return ret == IRQ_HANDLED;
@@ -259,7 +259,7 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
 void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		    irqreturn_t action_ret)
 {
-	if (desc->status & IRQ_POLL_INPROGRESS)
+	if (desc->istate & IRQS_POLL_INPROGRESS)
 		return;
 
 	if (unlikely(action_ret != IRQ_HANDLED)) {
-- 
cgit v0.10.2


From 009b4c3b8ad584b3462734127a5bec680d5d6af4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 21:48:49 +0100
Subject: genirq: Add IRQ_INPROGRESS to core

We need to maintain the flag for now in both fields status and istate.
Add a CONFIG_GENERIC_HARDIRQS_NO_COMPAT switch to allow testing w/o
the status one. Wrap the access to status IRQ_INPROGRESS in a inline
which can be turned of with CONFIG_GENERIC_HARDIRQS_NO_COMPAT along
with the define.

There is no reason that anything outside of core looks at this. That
needs some modifications, but we'll get there.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 047a695..274590f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -50,7 +50,11 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_TYPE_PROBE		0x00000010	/* Probing in progress */
 
 /* Internal flags */
-#define IRQ_INPROGRESS		0x00000100	/* IRQ handler active - do not enter! */
+
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
+#define IRQ_INPROGRESS		0x00000100	/* DEPRECATED */
+#endif
+
 #define IRQ_DISABLED		0x00000200	/* IRQ disabled - do not enter! */
 #define IRQ_PENDING		0x00000400	/* IRQ pending - replay on enable */
 #define IRQ_REPLAY		0x00000800	/* IRQ has been replayed but not acked yet */
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 4cd5d71..9e2256d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -13,6 +13,9 @@ config GENERIC_HARDIRQS
 config GENERIC_HARDIRQS_NO_DEPRECATED
        def_bool n
 
+config GENERIC_HARDIRQS_NO_COMPAT
+       def_bool n
+
 # Options selectable by the architecture code
 config HAVE_SPARSE_IRQ
        def_bool n
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 34245e7..0753855 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -383,7 +383,8 @@ void handle_nested_irq(unsigned int irq)
 	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
 		goto out_unlock;
 
-	desc->status |= IRQ_INPROGRESS;
+	irq_compat_set_progress(desc);
+	desc->istate |= IRQS_INPROGRESS;
 	raw_spin_unlock_irq(&desc->lock);
 
 	action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -391,7 +392,8 @@ void handle_nested_irq(unsigned int irq)
 		note_interrupt(irq, desc, action_ret);
 
 	raw_spin_lock_irq(&desc->lock);
-	desc->status &= ~IRQ_INPROGRESS;
+	desc->istate &= ~IRQS_INPROGRESS;
+	irq_compat_clr_progress(desc);
 
 out_unlock:
 	raw_spin_unlock_irq(&desc->lock);
@@ -422,7 +424,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 {
 	raw_spin_lock(&desc->lock);
 
-	if (unlikely(desc->status & IRQ_INPROGRESS))
+	if (unlikely(desc->istate & IRQS_INPROGRESS))
 		if (!irq_check_poll(desc))
 			goto out_unlock;
 
@@ -454,7 +456,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	raw_spin_lock(&desc->lock);
 	mask_ack_irq(desc);
 
-	if (unlikely(desc->status & IRQ_INPROGRESS))
+	if (unlikely(desc->istate & IRQS_INPROGRESS))
 		if (!irq_check_poll(desc))
 			goto out_unlock;
 
@@ -492,7 +494,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 {
 	raw_spin_lock(&desc->lock);
 
-	if (unlikely(desc->status & IRQ_INPROGRESS))
+	if (unlikely(desc->istate & IRQS_INPROGRESS))
 		if (!irq_check_poll(desc))
 			goto out;
 
@@ -542,8 +544,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	 * we shouldn't process the IRQ. Mark it pending, handle
 	 * the necessary masking and go out
 	 */
-	if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
-		    !desc->action)) {
+	if (unlikely((desc->istate & (IRQS_INPROGRESS) ||
+		      (desc->status & IRQ_DISABLED) || !desc->action))) {
 		if (!irq_check_poll(desc)) {
 			desc->status |= IRQ_PENDING;
 			mask_ack_irq(desc);
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
new file mode 100644
index 0000000..aac6e40
--- /dev/null
+++ b/kernel/irq/compat.h
@@ -0,0 +1,17 @@
+/*
+ * Compat layer for transition period
+ */
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
+static inline void irq_compat_set_progress(struct irq_desc *desc)
+{
+	desc->status |= IRQ_INPROGRESS;
+}
+
+static inline void irq_compat_clr_progress(struct irq_desc *desc)
+{
+	desc->status &= ~IRQ_INPROGRESS;
+}
+#else
+static inline void irq_compat_set_progress(struct irq_desc *desc) { }
+static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
+#endif
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index ff40e0f..d4ae0b1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -123,13 +123,15 @@ irqreturn_t handle_irq_event(struct irq_desc *desc)
 	irqreturn_t ret;
 
 	desc->status &= ~IRQ_PENDING;
-	desc->status |= IRQ_INPROGRESS;
+	irq_compat_set_progress(desc);
+	desc->istate |= IRQS_INPROGRESS;
 	raw_spin_unlock(&desc->lock);
 
 	ret = handle_irq_event_percpu(desc, action);
 
 	raw_spin_lock(&desc->lock);
-	desc->status &= ~IRQ_INPROGRESS;
+	desc->istate &= ~IRQS_INPROGRESS;
+	irq_compat_clr_progress(desc);
 	return ret;
 }
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index f5d28e1..d1cb1f8 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -13,6 +13,7 @@
 # define IRQ_BITMAP_BITS	NR_IRQS
 #endif
 
+#include "compat.h"
 #include "settings.h"
 
 #define istate core_internal_state__do_not_mess_with_it
@@ -40,11 +41,13 @@ enum {
  * IRQS_SPURIOUS_DISABLED	- was disabled due to spurious interrupt
  *				  detection
  * IRQS_POLL_INPROGRESS		- polling in progress
+ * IRQS_INPROGRESS		- Interrupt in progress
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
 	IRQS_SPURIOUS_DISABLED	= 0x00000002,
 	IRQS_POLL_INPROGRESS	= 0x00000008,
+	IRQS_INPROGRESS		= 0x00000010,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
@@ -128,7 +131,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 		print_symbol("%s\n", (unsigned long)desc->action->handler);
 	}
 
-	P(IRQ_INPROGRESS);
 	P(IRQ_DISABLED);
 	P(IRQ_PENDING);
 	P(IRQ_REPLAY);
@@ -143,6 +145,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 	P(IRQ_NOAUTOEN);
 
 	PS(IRQS_AUTODETECT);
+	PS(IRQS_INPROGRESS);
 }
 
 #undef P
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5b918ff..7e5a508 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -30,7 +30,7 @@
 void synchronize_irq(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	unsigned int status;
+	unsigned int state;
 
 	if (!desc)
 		return;
@@ -42,16 +42,16 @@ void synchronize_irq(unsigned int irq)
 		 * Wait until we're out of the critical section.  This might
 		 * give the wrong answer due to the lack of memory barriers.
 		 */
-		while (desc->status & IRQ_INPROGRESS)
+		while (desc->istate & IRQS_INPROGRESS)
 			cpu_relax();
 
 		/* Ok, that indicated we're done: double-check carefully. */
 		raw_spin_lock_irqsave(&desc->lock, flags);
-		status = desc->status;
+		state = desc->istate;
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
 
 		/* Oops, that failed? */
-	} while (status & IRQ_INPROGRESS);
+	} while (state & IRQS_INPROGRESS);
 
 	/*
 	 * We made sure that no hardirq handler is running. Now verify
@@ -637,9 +637,9 @@ again:
 	 * The thread is faster done than the hard interrupt handler
 	 * on the other CPU. If we unmask the irq line then the
 	 * interrupt can come in again and masks the line, leaves due
-	 * to IRQ_INPROGRESS and the irq line is masked forever.
+	 * to IRQS_INPROGRESS and the irq line is masked forever.
 	 */
-	if (unlikely(desc->status & IRQ_INPROGRESS)) {
+	if (unlikely(desc->istate & IRQS_INPROGRESS)) {
 		raw_spin_unlock_irq(&desc->lock);
 		chip_bus_sync_unlock(desc);
 		cpu_relax();
@@ -897,8 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			desc->status |= IRQ_PER_CPU;
 #endif
 
-		desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS);
-		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED);
+		desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT);
+		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
+				  IRQS_INPROGRESS);
 
 		if (new->flags & IRQF_ONESHOT)
 			desc->status |= IRQ_ONESHOT;
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 610f555..a96140e 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -5,3 +5,6 @@
 enum {
 	_IRQ_DEFAULT_INIT_FLAGS	= IRQ_DEFAULT_INIT_FLAGS,
 };
+
+#undef IRQ_INPROGRESS
+#define IRQ_INPROGRESS		GOT_YOU_MORON
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 21c4617..5150483 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -45,10 +45,10 @@ bool irq_wait_for_poll(struct irq_desc *desc)
 #ifdef CONFIG_SMP
 	do {
 		raw_spin_unlock(&desc->lock);
-		while (desc->status & IRQ_INPROGRESS)
+		while (desc->istate & IRQS_INPROGRESS)
 			cpu_relax();
 		raw_spin_lock(&desc->lock);
-	} while (desc->status & IRQ_INPROGRESS);
+	} while (desc->istate & IRQS_INPROGRESS);
 	/* Might have been disabled in meantime */
 	return !(desc->status & IRQ_DISABLED) && desc->action;
 #else
@@ -88,7 +88,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 		goto out;
 
 	/* Already running on another processor */
-	if (desc->status & IRQ_INPROGRESS) {
+	if (desc->istate & IRQS_INPROGRESS) {
 		/*
 		 * Already running: If it is shared get the other
 		 * CPU to go looking for our mystery interrupt too
-- 
cgit v0.10.2


From 3d67baec7f1b01fc289ac1a2f1a7e6d5e43391c6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 21:02:10 +0100
Subject: genirq: Move IRQ_ONESHOT to core

No users outside of core.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 274590f..1a4c723 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,7 +71,6 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_MOVE_PCNTXT		0x01000000	/* IRQ migration from process context */
 #define IRQ_AFFINITY_SET	0x02000000	/* IRQ affinity was set from userspace*/
 #define IRQ_SUSPENDED		0x04000000	/* IRQ has gone through suspend sequence */
-#define IRQ_ONESHOT		0x08000000	/* IRQ is not unmasked after hardirq */
 #define IRQ_NESTED_THREAD	0x10000000	/* IRQ is nested into another, no own handler thread */
 
 #define IRQF_MODIFY_MASK	\
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 0753855..420fa6b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -472,7 +472,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 
 	handle_irq_event(desc);
 
-	if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
+	if (!(desc->status & IRQ_DISABLED) && !(desc->istate & IRQS_ONESHOT))
 		unmask_irq(desc);
 out_unlock:
 	raw_spin_unlock(&desc->lock);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index d1cb1f8..36563f73 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,12 +42,14 @@ enum {
  *				  detection
  * IRQS_POLL_INPROGRESS		- polling in progress
  * IRQS_INPROGRESS		- Interrupt in progress
+ * IRQS_ONESHOT			- irq is not unmasked in primary handler
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
 	IRQS_SPURIOUS_DISABLED	= 0x00000002,
 	IRQS_POLL_INPROGRESS	= 0x00000008,
 	IRQS_INPROGRESS		= 0x00000010,
+	IRQS_ONESHOT		= 0x00000020,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7e5a508..aca4208 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -697,7 +697,7 @@ static int irq_thread(void *data)
 	};
 	struct irqaction *action = data;
 	struct irq_desc *desc = irq_to_desc(action->irq);
-	int wake, oneshot = desc->status & IRQ_ONESHOT;
+	int wake, oneshot = desc->istate & IRQS_ONESHOT;
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
 	current->irqaction = action;
@@ -897,12 +897,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			desc->status |= IRQ_PER_CPU;
 #endif
 
-		desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT);
+		desc->status &= ~IRQ_WAITING;
 		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
-				  IRQS_INPROGRESS);
+				  IRQS_INPROGRESS | IRQS_ONESHOT);
 
 		if (new->flags & IRQF_ONESHOT)
-			desc->status |= IRQ_ONESHOT;
+			desc->istate |= IRQS_ONESHOT;
 
 		if (!(desc->status & IRQ_NOAUTOEN))
 			irq_startup(desc);
-- 
cgit v0.10.2


From 163ef3091195f514a06f064b12914597d2644c55 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 11:39:15 +0100
Subject: genirq: Move IRQ_REPLAY and IRQ_WAITING to core

No users outside of core.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 1a4c723..c38dbd5 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -53,12 +53,13 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 
 #ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
 #define IRQ_INPROGRESS		0x00000100	/* DEPRECATED */
+#define IRQ_REPLAY		0x00000200	/* DEPRECATED */
+#define IRQ_WAITING		0x00000400	/* DEPRECATED */
 #endif
 
-#define IRQ_DISABLED		0x00000200	/* IRQ disabled - do not enter! */
-#define IRQ_PENDING		0x00000400	/* IRQ pending - replay on enable */
-#define IRQ_REPLAY		0x00000800	/* IRQ has been replayed but not acked yet */
-#define IRQ_WAITING		0x00002000	/* IRQ not yet seen - for autodetection */
+#define IRQ_DISABLED		0x00000800	/* IRQ disabled - do not enter! */
+#define IRQ_PENDING		0x00001000	/* IRQ pending - replay on enable */
+
 #define IRQ_LEVEL		0x00004000	/* IRQ level triggered */
 #define IRQ_MASKED		0x00008000	/* IRQ masked - shouldn't be seen again */
 #define IRQ_PER_CPU		0x00010000	/* IRQ is per CPU */
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 916e56e..9ea8bb9 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
 /*
  * Autodetection depends on the fact that any interrupt that
  * comes in on to an unassigned handler will get stuck with
- * "IRQ_WAITING" cleared and the interrupt disabled.
+ * "IRQS_WAITING" cleared and the interrupt disabled.
  */
 static DEFINE_MUTEX(probing_active);
 
@@ -75,8 +75,7 @@ unsigned long probe_irq_on(void)
 	for_each_irq_desc_reverse(i, desc) {
 		raw_spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
-			desc->istate |= IRQS_AUTODETECT;
-			desc->status |= IRQ_WAITING;
+			desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
 			if (irq_startup(desc))
 				desc->status |= IRQ_PENDING;
 		}
@@ -96,7 +95,7 @@ unsigned long probe_irq_on(void)
 
 		if (desc->istate & IRQS_AUTODETECT) {
 			/* It triggered already - consider it spurious. */
-			if (!(desc->status & IRQ_WAITING)) {
+			if (!(desc->istate & IRQS_WAITING)) {
 				desc->istate &= ~IRQS_AUTODETECT;
 				irq_shutdown(desc);
 			} else
@@ -131,7 +130,7 @@ unsigned int probe_irq_mask(unsigned long val)
 	for_each_irq_desc(i, desc) {
 		raw_spin_lock_irq(&desc->lock);
 		if (desc->istate & IRQS_AUTODETECT) {
-			if (i < 16 && !(desc->status & IRQ_WAITING))
+			if (i < 16 && !(desc->istate & IRQS_WAITING))
 				mask |= 1 << i;
 
 			desc->istate &= ~IRQS_AUTODETECT;
@@ -171,7 +170,7 @@ int probe_irq_off(unsigned long val)
 		raw_spin_lock_irq(&desc->lock);
 
 		if (desc->istate & IRQS_AUTODETECT) {
-			if (!(desc->status & IRQ_WAITING)) {
+			if (!(desc->istate & IRQS_WAITING)) {
 				if (!nr_of_irqs)
 					irq_found = i;
 				nr_of_irqs++;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 420fa6b..59ae145 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -428,7 +428,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 		if (!irq_check_poll(desc))
 			goto out_unlock;
 
-	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	if (unlikely(!desc->action || (desc->status & IRQ_DISABLED)))
@@ -460,7 +460,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 		if (!irq_check_poll(desc))
 			goto out_unlock;
 
-	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/*
@@ -498,7 +498,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 		if (!irq_check_poll(desc))
 			goto out;
 
-	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/*
@@ -537,8 +537,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 {
 	raw_spin_lock(&desc->lock);
 
-	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
-
+	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
 	/*
 	 * If we're currently running this IRQ, or its disabled,
 	 * we shouldn't process the IRQ. Mark it pending, handle
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 36563f73..5403753 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -43,6 +43,8 @@ enum {
  * IRQS_POLL_INPROGRESS		- polling in progress
  * IRQS_INPROGRESS		- Interrupt in progress
  * IRQS_ONESHOT			- irq is not unmasked in primary handler
+ * IRQS_REPLAY			- irq is replayed
+ * IRQS_WAITING			- irq is waiting
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
@@ -50,6 +52,8 @@ enum {
 	IRQS_POLL_INPROGRESS	= 0x00000008,
 	IRQS_INPROGRESS		= 0x00000010,
 	IRQS_ONESHOT		= 0x00000020,
+	IRQS_REPLAY		= 0x00000040,
+	IRQS_WAITING		= 0x00000080,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
@@ -135,8 +139,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 
 	P(IRQ_DISABLED);
 	P(IRQ_PENDING);
-	P(IRQ_REPLAY);
-	P(IRQ_WAITING);
 	P(IRQ_LEVEL);
 	P(IRQ_MASKED);
 #ifdef CONFIG_IRQ_PER_CPU
@@ -148,6 +150,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 
 	PS(IRQS_AUTODETECT);
 	PS(IRQS_INPROGRESS);
+	PS(IRQS_REPLAY);
+	PS(IRQS_WAITING);
 }
 
 #undef P
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index aca4208..7971df5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -897,9 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			desc->status |= IRQ_PER_CPU;
 #endif
 
-		desc->status &= ~IRQ_WAITING;
 		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
-				  IRQS_INPROGRESS | IRQS_ONESHOT);
+				  IRQS_INPROGRESS | IRQS_ONESHOT | \
+				  IRQS_WAITING);
 
 		if (new->flags & IRQF_ONESHOT)
 			desc->istate |= IRQS_ONESHOT;
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 60b2026..f83387c 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -62,8 +62,11 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
 	 */
 	if (desc->status & IRQ_LEVEL)
 		return;
-	if ((desc->status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
-		desc->status = (desc->status & ~IRQ_PENDING) | IRQ_REPLAY;
+	if (desc->istate & IRQS_REPLAY)
+		return;
+	if (desc->status & IRQ_PENDING) {
+		desc->status &= ~IRQ_PENDING;
+		desc->istate |= IRQS_REPLAY;
 
 		if (!desc->irq_data.chip->irq_retrigger ||
 		    !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index a96140e..2e7d08f 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -8,3 +8,7 @@ enum {
 
 #undef IRQ_INPROGRESS
 #define IRQ_INPROGRESS		GOT_YOU_MORON
+#undef IRQ_REPLAY
+#define IRQ_REPLAY		GOT_YOU_MORON
+#undef IRQ_WAITING
+#define IRQ_WAITING		GOT_YOU_MORON
-- 
cgit v0.10.2


From c1594b77e46124bb462f961e536120e471c67446 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 22:11:30 +0100
Subject: genirq: Move IRQ_DISABLED to core

Keep status in sync until all abusers are fixed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c38dbd5..32efca7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -55,9 +55,9 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_INPROGRESS		0x00000100	/* DEPRECATED */
 #define IRQ_REPLAY		0x00000200	/* DEPRECATED */
 #define IRQ_WAITING		0x00000400	/* DEPRECATED */
+#define IRQ_DISABLED		0x00000800	/* DEPRECATED */
 #endif
 
-#define IRQ_DISABLED		0x00000800	/* IRQ disabled - do not enter! */
 #define IRQ_PENDING		0x00001000	/* IRQ pending - replay on enable */
 
 #define IRQ_LEVEL		0x00004000	/* IRQ level triggered */
@@ -231,7 +231,7 @@ struct irq_chip {
 # define ARCH_IRQ_INIT_FLAGS	0
 #endif
 
-#define IRQ_DEFAULT_INIT_FLAGS	(IRQ_DISABLED | ARCH_IRQ_INIT_FLAGS)
+#define IRQ_DEFAULT_INIT_FLAGS	ARCH_IRQ_INIT_FLAGS
 
 struct irqaction;
 extern int setup_irq(unsigned int irq, struct irqaction *new);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 59ae145..527df7a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -164,9 +164,21 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
 }
 EXPORT_SYMBOL_GPL(irq_get_irq_data);
 
+static void irq_state_clr_disabled(struct irq_desc *desc)
+{
+	desc->istate &= ~IRQS_DISABLED;
+	irq_compat_clr_disabled(desc);
+}
+
+static void irq_state_set_disabled(struct irq_desc *desc)
+{
+	desc->istate |= IRQS_DISABLED;
+	irq_compat_set_disabled(desc);
+}
+
 int irq_startup(struct irq_desc *desc)
 {
-	desc->status &= ~IRQ_DISABLED;
+	irq_state_clr_disabled(desc);
 	desc->depth = 0;
 
 	if (desc->irq_data.chip->irq_startup) {
@@ -181,7 +193,7 @@ int irq_startup(struct irq_desc *desc)
 
 void irq_shutdown(struct irq_desc *desc)
 {
-	desc->status |= IRQ_DISABLED;
+	irq_state_set_disabled(desc);
 	desc->depth = 1;
 	if (desc->irq_data.chip->irq_shutdown)
 		desc->irq_data.chip->irq_shutdown(&desc->irq_data);
@@ -194,7 +206,7 @@ void irq_shutdown(struct irq_desc *desc)
 
 void irq_enable(struct irq_desc *desc)
 {
-	desc->status &= ~IRQ_DISABLED;
+	irq_state_clr_disabled(desc);
 	if (desc->irq_data.chip->irq_enable)
 		desc->irq_data.chip->irq_enable(&desc->irq_data);
 	else
@@ -204,7 +216,7 @@ void irq_enable(struct irq_desc *desc)
 
 void irq_disable(struct irq_desc *desc)
 {
-	desc->status |= IRQ_DISABLED;
+	irq_state_set_disabled(desc);
 	if (desc->irq_data.chip->irq_disable) {
 		desc->irq_data.chip->irq_disable(&desc->irq_data);
 		desc->status |= IRQ_MASKED;
@@ -380,7 +392,7 @@ void handle_nested_irq(unsigned int irq)
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	action = desc->action;
-	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+	if (unlikely(!action || (desc->istate & IRQS_DISABLED)))
 		goto out_unlock;
 
 	irq_compat_set_progress(desc);
@@ -431,7 +443,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
 	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
 	kstat_incr_irqs_this_cpu(irq, desc);
 
-	if (unlikely(!desc->action || (desc->status & IRQ_DISABLED)))
+	if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
 		goto out_unlock;
 
 	handle_irq_event(desc);
@@ -467,12 +479,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	 * If its disabled or no action available
 	 * keep it masked and get out of here
 	 */
-	if (unlikely(!desc->action || (desc->status & IRQ_DISABLED)))
+	if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
 		goto out_unlock;
 
 	handle_irq_event(desc);
 
-	if (!(desc->status & IRQ_DISABLED) && !(desc->istate & IRQS_ONESHOT))
+	if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT)))
 		unmask_irq(desc);
 out_unlock:
 	raw_spin_unlock(&desc->lock);
@@ -505,7 +517,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 	 * If its disabled or no action available
 	 * then mask it and get out of here:
 	 */
-	if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) {
+	if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) {
 		desc->status |= IRQ_PENDING;
 		mask_irq(desc);
 		goto out;
@@ -543,8 +555,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	 * we shouldn't process the IRQ. Mark it pending, handle
 	 * the necessary masking and go out
 	 */
-	if (unlikely((desc->istate & (IRQS_INPROGRESS) ||
-		      (desc->status & IRQ_DISABLED) || !desc->action))) {
+	if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) ||
+		      !desc->action))) {
 		if (!irq_check_poll(desc)) {
 			desc->status |= IRQ_PENDING;
 			mask_ack_irq(desc);
@@ -567,15 +579,16 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		 * one, we could have masked the irq.
 		 * Renable it, if it was not disabled in meantime.
 		 */
-		if (unlikely((desc->status &
-			       (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
-			      (IRQ_PENDING | IRQ_MASKED))) {
-			unmask_irq(desc);
+		if (unlikely(desc->status & IRQ_PENDING)) {
+			if (!(desc->istate & IRQS_DISABLED) &&
+			    (desc->status & IRQ_MASKED))
+				unmask_irq(desc);
 		}
 
 		handle_irq_event(desc);
 
-	} while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
+	} while ((desc->status & IRQ_PENDING) &&
+		 !(desc->istate & IRQS_DISABLED));
 
 out_unlock:
 	raw_spin_unlock(&desc->lock);
@@ -639,7 +652,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 	if (handle == handle_bad_irq) {
 		if (desc->irq_data.chip != &no_irq_chip)
 			mask_ack_irq(desc);
-		desc->status |= IRQ_DISABLED;
+		irq_compat_set_disabled(desc);
+		desc->istate |= IRQS_DISABLED;
 		desc->depth = 1;
 	}
 	desc->handle_irq = handle;
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
index aac6e40..bc0c2a5 100644
--- a/kernel/irq/compat.h
+++ b/kernel/irq/compat.h
@@ -11,7 +11,19 @@ static inline void irq_compat_clr_progress(struct irq_desc *desc)
 {
 	desc->status &= ~IRQ_INPROGRESS;
 }
+static inline void irq_compat_set_disabled(struct irq_desc *desc)
+{
+	desc->status |= IRQ_DISABLED;
+}
+
+static inline void irq_compat_clr_disabled(struct irq_desc *desc)
+{
+	desc->status &= ~IRQ_DISABLED;
+}
 #else
 static inline void irq_compat_set_progress(struct irq_desc *desc) { }
 static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
+static inline void irq_compat_set_disabled(struct irq_desc *desc) { }
+static inline void irq_compat_clr_disabled(struct irq_desc *desc) { }
 #endif
+
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 5403753..919d2dd 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -45,6 +45,7 @@ enum {
  * IRQS_ONESHOT			- irq is not unmasked in primary handler
  * IRQS_REPLAY			- irq is replayed
  * IRQS_WAITING			- irq is waiting
+ * IRQS_DISABLED		- irq is disabled
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
@@ -54,6 +55,7 @@ enum {
 	IRQS_ONESHOT		= 0x00000020,
 	IRQS_REPLAY		= 0x00000040,
 	IRQS_WAITING		= 0x00000080,
+	IRQS_DISABLED		= 0x00000100,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
@@ -137,7 +139,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 		print_symbol("%s\n", (unsigned long)desc->action->handler);
 	}
 
-	P(IRQ_DISABLED);
 	P(IRQ_PENDING);
 	P(IRQ_LEVEL);
 	P(IRQ_MASKED);
@@ -152,6 +153,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 	PS(IRQS_INPROGRESS);
 	PS(IRQS_REPLAY);
 	PS(IRQS_WAITING);
+	PS(IRQS_DISABLED);
 }
 
 #undef P
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8b87f2c..78866d0 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -80,6 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
 	desc->irq_data.handler_data = NULL;
 	desc->irq_data.msi_desc = NULL;
 	desc->status = _IRQ_DEFAULT_INIT_FLAGS;
+	desc->istate = IRQS_DISABLED;
 	desc->handle_irq = handle_bad_irq;
 	desc->depth = 1;
 	desc->irq_count = 0;
@@ -247,6 +248,7 @@ int __init early_irq_init(void)
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status		= _IRQ_DEFAULT_INIT_FLAGS,
+		.istate		= IRQS_DISABLED,
 		.handle_irq	= handle_bad_irq,
 		.depth		= 1,
 		.lock		= __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7971df5..77ff275 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -646,7 +646,7 @@ again:
 		goto again;
 	}
 
-	if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
+	if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) {
 		desc->status &= ~IRQ_MASKED;
 		desc->irq_data.chip->irq_unmask(&desc->irq_data);
 	}
@@ -709,7 +709,7 @@ static int irq_thread(void *data)
 		atomic_inc(&desc->threads_active);
 
 		raw_spin_lock_irq(&desc->lock);
-		if (unlikely(desc->status & IRQ_DISABLED)) {
+		if (unlikely(desc->istate & IRQS_DISABLED)) {
 			/*
 			 * CHECKME: We might need a dedicated
 			 * IRQ_THREAD_PENDING flag here, which
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 441fd62..8c68cb8 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -61,7 +61,7 @@ void move_native_irq(int irq)
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
 
-	if (unlikely(desc->status & IRQ_DISABLED))
+	if (unlikely(desc->istate & IRQS_DISABLED))
 		return;
 
 	/*
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 2e7d08f..5e3411c 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -12,3 +12,5 @@ enum {
 #define IRQ_REPLAY		GOT_YOU_MORON
 #undef IRQ_WAITING
 #define IRQ_WAITING		GOT_YOU_MORON
+#undef IRQ_DISABLED
+#define IRQ_DISABLED		GOT_YOU_MORON
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 5150483..367614f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -50,7 +50,7 @@ bool irq_wait_for_poll(struct irq_desc *desc)
 		raw_spin_lock(&desc->lock);
 	} while (desc->istate & IRQS_INPROGRESS);
 	/* Might have been disabled in meantime */
-	return !(desc->status & IRQ_DISABLED) && desc->action;
+	return !(desc->istate & IRQS_DISABLED) && desc->action;
 #else
 	return false;
 #endif
@@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 	 * Do not poll disabled interrupts unless the spurious
 	 * disabled poller asks explicitely.
 	 */
-	if ((desc->status & IRQ_DISABLED) && !force)
+	if ((desc->istate & IRQS_DISABLED) && !force)
 		goto out;
 
 	/*
-- 
cgit v0.10.2


From 2a0d6fb335d4428285dab2d254911748e6040807 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 12:17:57 +0100
Subject: genirq: Move IRQ_PENDING flag to core

Keep status in sync until all users are fixed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 32efca7..7ca55c9 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -56,9 +56,9 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_REPLAY		0x00000200	/* DEPRECATED */
 #define IRQ_WAITING		0x00000400	/* DEPRECATED */
 #define IRQ_DISABLED		0x00000800	/* DEPRECATED */
+#define IRQ_PENDING		0x00001000	/* DEPRECATED */
 #endif
 
-#define IRQ_PENDING		0x00001000	/* IRQ pending - replay on enable */
 
 #define IRQ_LEVEL		0x00004000	/* IRQ level triggered */
 #define IRQ_MASKED		0x00008000	/* IRQ masked - shouldn't be seen again */
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 9ea8bb9..aab64c2 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -76,8 +76,10 @@ unsigned long probe_irq_on(void)
 		raw_spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
 			desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
-			if (irq_startup(desc))
-				desc->status |= IRQ_PENDING;
+			if (irq_startup(desc)) {
+				irq_compat_set_pending(desc);
+				desc->istate |= IRQS_PENDING;
+			}
 		}
 		raw_spin_unlock_irq(&desc->lock);
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 527df7a..17c8786 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -518,7 +518,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 	 * then mask it and get out of here:
 	 */
 	if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) {
-		desc->status |= IRQ_PENDING;
+		irq_compat_set_pending(desc);
+		desc->istate |= IRQS_PENDING;
 		mask_irq(desc);
 		goto out;
 	}
@@ -558,7 +559,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) ||
 		      !desc->action))) {
 		if (!irq_check_poll(desc)) {
-			desc->status |= IRQ_PENDING;
+			irq_compat_set_pending(desc);
+			desc->istate |= IRQS_PENDING;
 			mask_ack_irq(desc);
 			goto out_unlock;
 		}
@@ -579,7 +581,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		 * one, we could have masked the irq.
 		 * Renable it, if it was not disabled in meantime.
 		 */
-		if (unlikely(desc->status & IRQ_PENDING)) {
+		if (unlikely(desc->istate & IRQS_PENDING)) {
 			if (!(desc->istate & IRQS_DISABLED) &&
 			    (desc->status & IRQ_MASKED))
 				unmask_irq(desc);
@@ -587,7 +589,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 
 		handle_irq_event(desc);
 
-	} while ((desc->status & IRQ_PENDING) &&
+	} while ((desc->istate & IRQS_PENDING) &&
 		 !(desc->istate & IRQS_DISABLED));
 
 out_unlock:
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
index bc0c2a5..0067a69 100644
--- a/kernel/irq/compat.h
+++ b/kernel/irq/compat.h
@@ -15,15 +15,25 @@ static inline void irq_compat_set_disabled(struct irq_desc *desc)
 {
 	desc->status |= IRQ_DISABLED;
 }
-
 static inline void irq_compat_clr_disabled(struct irq_desc *desc)
 {
 	desc->status &= ~IRQ_DISABLED;
 }
+static inline void irq_compat_set_pending(struct irq_desc *desc)
+{
+	desc->status |= IRQ_PENDING;
+}
+
+static inline void irq_compat_clr_pending(struct irq_desc *desc)
+{
+	desc->status &= ~IRQ_PENDING;
+}
 #else
 static inline void irq_compat_set_progress(struct irq_desc *desc) { }
 static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
 static inline void irq_compat_set_disabled(struct irq_desc *desc) { }
 static inline void irq_compat_clr_disabled(struct irq_desc *desc) { }
+static inline void irq_compat_set_pending(struct irq_desc *desc) { }
+static inline void irq_compat_clr_pending(struct irq_desc *desc) { }
 #endif
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d4ae0b1..6e34bdb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -122,7 +122,8 @@ irqreturn_t handle_irq_event(struct irq_desc *desc)
 	struct irqaction *action = desc->action;
 	irqreturn_t ret;
 
-	desc->status &= ~IRQ_PENDING;
+	irq_compat_clr_pending(desc);
+	desc->istate &= ~IRQS_PENDING;
 	irq_compat_set_progress(desc);
 	desc->istate |= IRQS_INPROGRESS;
 	raw_spin_unlock(&desc->lock);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 919d2dd..fdf2524 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -46,6 +46,7 @@ enum {
  * IRQS_REPLAY			- irq is replayed
  * IRQS_WAITING			- irq is waiting
  * IRQS_DISABLED		- irq is disabled
+ * IRQS_PENDING			- irq is pending and replayed later
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
@@ -56,6 +57,7 @@ enum {
 	IRQS_REPLAY		= 0x00000040,
 	IRQS_WAITING		= 0x00000080,
 	IRQS_DISABLED		= 0x00000100,
+	IRQS_PENDING		= 0x00000200,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
@@ -139,7 +141,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 		print_symbol("%s\n", (unsigned long)desc->action->handler);
 	}
 
-	P(IRQ_PENDING);
 	P(IRQ_LEVEL);
 	P(IRQ_MASKED);
 #ifdef CONFIG_IRQ_PER_CPU
@@ -154,6 +155,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 	PS(IRQS_REPLAY);
 	PS(IRQS_WAITING);
 	PS(IRQS_DISABLED);
+	PS(IRQS_PENDING);
 }
 
 #undef P
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 77ff275..ac06081 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -714,10 +714,11 @@ static int irq_thread(void *data)
 			 * CHECKME: We might need a dedicated
 			 * IRQ_THREAD_PENDING flag here, which
 			 * retriggers the thread in check_irq_resend()
-			 * but AFAICT IRQ_PENDING should be fine as it
+			 * but AFAICT IRQS_PENDING should be fine as it
 			 * retriggers the interrupt itself --- tglx
 			 */
-			desc->status |= IRQ_PENDING;
+			irq_compat_set_pending(desc);
+			desc->istate |= IRQS_PENDING;
 			raw_spin_unlock_irq(&desc->lock);
 		} else {
 			raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index d6bfb89..d738941 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -69,7 +69,8 @@ int check_wakeup_irqs(void)
 	int irq;
 
 	for_each_irq_desc(irq, desc)
-		if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
+		if ((desc->status & IRQ_WAKEUP) &&
+		    (desc->istate & IRQS_PENDING))
 			return -EBUSY;
 
 	return 0;
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index f83387c..ff1fea0 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -64,8 +64,9 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
 		return;
 	if (desc->istate & IRQS_REPLAY)
 		return;
-	if (desc->status & IRQ_PENDING) {
-		desc->status &= ~IRQ_PENDING;
+	if (desc->istate & IRQS_PENDING) {
+		irq_compat_clr_pending(desc);
+		desc->istate &= ~IRQS_PENDING;
 		desc->istate |= IRQS_REPLAY;
 
 		if (!desc->irq_data.chip->irq_retrigger ||
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 5e3411c..623fcf8 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -14,3 +14,5 @@ enum {
 #define IRQ_WAITING		GOT_YOU_MORON
 #undef IRQ_DISABLED
 #define IRQ_DISABLED		GOT_YOU_MORON
+#undef IRQ_PENDING
+#define IRQ_PENDING		GOT_YOU_MORON
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 367614f..692ce2b 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -93,7 +93,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 		 * Already running: If it is shared get the other
 		 * CPU to go looking for our mystery interrupt too
 		 */
-		desc->status |= IRQ_PENDING;
+		irq_compat_set_pending(desc);
+		desc->istate |= IRQS_PENDING;
 		goto out;
 	}
 
@@ -103,7 +104,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 		if (handle_irq_event(desc) == IRQ_HANDLED)
 			ret = IRQ_HANDLED;
 		action = desc->action;
-	} while ((desc->status & IRQ_PENDING) && action);
+	} while ((desc->istate & IRQS_PENDING) && action);
 	desc->istate &= ~IRQS_POLL_INPROGRESS;
 out:
 	raw_spin_unlock(&desc->lock);
-- 
cgit v0.10.2


From 6e40262ea43c4b0e3f435b3a083e4461ef921c17 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 12:36:06 +0100
Subject: genirq: Move IRQ_MASKED to core

Keep status in sync until all users are fixed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7ca55c9..9800bac 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -57,11 +57,11 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_WAITING		0x00000400	/* DEPRECATED */
 #define IRQ_DISABLED		0x00000800	/* DEPRECATED */
 #define IRQ_PENDING		0x00001000	/* DEPRECATED */
+#define IRQ_MASKED		0x00002000	/* DEPRECATED */
 #endif
 
 
 #define IRQ_LEVEL		0x00004000	/* IRQ level triggered */
-#define IRQ_MASKED		0x00008000	/* IRQ masked - shouldn't be seen again */
 #define IRQ_PER_CPU		0x00010000	/* IRQ is per CPU */
 #define IRQ_NOPROBE		0x00020000	/* IRQ is not valid for probing */
 #define IRQ_NOREQUEST		0x00040000	/* IRQ cannot be requested */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 17c8786..73b2e7e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -176,6 +176,18 @@ static void irq_state_set_disabled(struct irq_desc *desc)
 	irq_compat_set_disabled(desc);
 }
 
+static void irq_state_clr_masked(struct irq_desc *desc)
+{
+	desc->istate &= ~IRQS_MASKED;
+	irq_compat_clr_masked(desc);
+}
+
+static void irq_state_set_masked(struct irq_desc *desc)
+{
+	desc->istate |= IRQS_MASKED;
+	irq_compat_set_masked(desc);
+}
+
 int irq_startup(struct irq_desc *desc)
 {
 	irq_state_clr_disabled(desc);
@@ -183,7 +195,7 @@ int irq_startup(struct irq_desc *desc)
 
 	if (desc->irq_data.chip->irq_startup) {
 		int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
-		desc->status &= ~IRQ_MASKED;
+		irq_state_clr_masked(desc);
 		return ret;
 	}
 
@@ -201,7 +213,7 @@ void irq_shutdown(struct irq_desc *desc)
 		desc->irq_data.chip->irq_disable(&desc->irq_data);
 	else
 		desc->irq_data.chip->irq_mask(&desc->irq_data);
-	desc->status |= IRQ_MASKED;
+	irq_state_set_masked(desc);
 }
 
 void irq_enable(struct irq_desc *desc)
@@ -211,7 +223,7 @@ void irq_enable(struct irq_desc *desc)
 		desc->irq_data.chip->irq_enable(&desc->irq_data);
 	else
 		desc->irq_data.chip->irq_unmask(&desc->irq_data);
-	desc->status &= ~IRQ_MASKED;
+	irq_state_clr_masked(desc);
 }
 
 void irq_disable(struct irq_desc *desc)
@@ -219,8 +231,8 @@ void irq_disable(struct irq_desc *desc)
 	irq_state_set_disabled(desc);
 	if (desc->irq_data.chip->irq_disable) {
 		desc->irq_data.chip->irq_disable(&desc->irq_data);
-		desc->status |= IRQ_MASKED;
 	}
+	irq_state_set_masked(desc);
 }
 
 #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
@@ -352,14 +364,14 @@ static inline void mask_ack_irq(struct irq_desc *desc)
 		if (desc->irq_data.chip->irq_ack)
 			desc->irq_data.chip->irq_ack(&desc->irq_data);
 	}
-	desc->status |= IRQ_MASKED;
+	irq_state_set_masked(desc);
 }
 
 static inline void mask_irq(struct irq_desc *desc)
 {
 	if (desc->irq_data.chip->irq_mask) {
 		desc->irq_data.chip->irq_mask(&desc->irq_data);
-		desc->status |= IRQ_MASKED;
+		irq_state_set_masked(desc);
 	}
 }
 
@@ -367,7 +379,7 @@ static inline void unmask_irq(struct irq_desc *desc)
 {
 	if (desc->irq_data.chip->irq_unmask) {
 		desc->irq_data.chip->irq_unmask(&desc->irq_data);
-		desc->status &= ~IRQ_MASKED;
+		irq_state_clr_masked(desc);
 	}
 }
 
@@ -583,7 +595,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		 */
 		if (unlikely(desc->istate & IRQS_PENDING)) {
 			if (!(desc->istate & IRQS_DISABLED) &&
-			    (desc->status & IRQ_MASKED))
+			    (desc->istate & IRQS_MASKED))
 				unmask_irq(desc);
 		}
 
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
index 0067a69..593abec 100644
--- a/kernel/irq/compat.h
+++ b/kernel/irq/compat.h
@@ -28,6 +28,15 @@ static inline void irq_compat_clr_pending(struct irq_desc *desc)
 {
 	desc->status &= ~IRQ_PENDING;
 }
+static inline void irq_compat_set_masked(struct irq_desc *desc)
+{
+	desc->status |= IRQ_MASKED;
+}
+
+static inline void irq_compat_clr_masked(struct irq_desc *desc)
+{
+	desc->status &= ~IRQ_MASKED;
+}
 #else
 static inline void irq_compat_set_progress(struct irq_desc *desc) { }
 static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
@@ -35,5 +44,7 @@ static inline void irq_compat_set_disabled(struct irq_desc *desc) { }
 static inline void irq_compat_clr_disabled(struct irq_desc *desc) { }
 static inline void irq_compat_set_pending(struct irq_desc *desc) { }
 static inline void irq_compat_clr_pending(struct irq_desc *desc) { }
+static inline void irq_compat_set_masked(struct irq_desc *desc) { }
+static inline void irq_compat_clr_masked(struct irq_desc *desc) { }
 #endif
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index fdf2524..3f2fcc1 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -47,6 +47,7 @@ enum {
  * IRQS_WAITING			- irq is waiting
  * IRQS_DISABLED		- irq is disabled
  * IRQS_PENDING			- irq is pending and replayed later
+ * IRQS_MASKED			- irq is masked
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
@@ -58,6 +59,7 @@ enum {
 	IRQS_WAITING		= 0x00000080,
 	IRQS_DISABLED		= 0x00000100,
 	IRQS_PENDING		= 0x00000200,
+	IRQS_MASKED		= 0x00000400,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
@@ -142,7 +144,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 	}
 
 	P(IRQ_LEVEL);
-	P(IRQ_MASKED);
 #ifdef CONFIG_IRQ_PER_CPU
 	P(IRQ_PER_CPU);
 #endif
@@ -156,6 +157,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 	PS(IRQS_WAITING);
 	PS(IRQS_DISABLED);
 	PS(IRQS_PENDING);
+	PS(IRQS_MASKED);
 }
 
 #undef P
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac06081..83fd201 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -646,8 +646,9 @@ again:
 		goto again;
 	}
 
-	if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) {
-		desc->status &= ~IRQ_MASKED;
+	if (!(desc->istate & IRQS_DISABLED) && (desc->istate & IRQS_MASKED)) {
+		irq_compat_clr_masked(desc);
+		desc->istate &= ~IRQS_MASKED;
 		desc->irq_data.chip->irq_unmask(&desc->irq_data);
 	}
 	raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 8c68cb8..6f2f984 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -69,7 +69,7 @@ void move_native_irq(int irq)
 	 * threaded interrupt with ONESHOT set, we can end up with an
 	 * interrupt storm.
 	 */
-	masked = desc->status & IRQ_MASKED;
+	masked = desc->istate & IRQS_MASKED;
 	if (!masked)
 		desc->irq_data.chip->irq_mask(&desc->irq_data);
 	move_masked_irq(irq);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 623fcf8..2cd45fd 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -16,3 +16,5 @@ enum {
 #define IRQ_DISABLED		GOT_YOU_MORON
 #undef IRQ_PENDING
 #define IRQ_PENDING		GOT_YOU_MORON
+#undef IRQ_MASKED
+#define IRQ_MASKED		GOT_YOU_MORON
-- 
cgit v0.10.2


From c531e8361f1968d664e6e97fbd3bfa4cf0e62e42 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 12:44:58 +0100
Subject: genirq: Move IRQ_SUSPENDED to core

No users outside of core.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 9800bac..3ce45c2 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -60,7 +60,6 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_MASKED		0x00002000	/* DEPRECATED */
 #endif
 
-
 #define IRQ_LEVEL		0x00004000	/* IRQ level triggered */
 #define IRQ_PER_CPU		0x00010000	/* IRQ is per CPU */
 #define IRQ_NOPROBE		0x00020000	/* IRQ is not valid for probing */
@@ -71,7 +70,6 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_NO_BALANCING	0x00400000	/* IRQ is excluded from balancing */
 #define IRQ_MOVE_PCNTXT		0x01000000	/* IRQ migration from process context */
 #define IRQ_AFFINITY_SET	0x02000000	/* IRQ affinity was set from userspace*/
-#define IRQ_SUSPENDED		0x04000000	/* IRQ has gone through suspend sequence */
 #define IRQ_NESTED_THREAD	0x10000000	/* IRQ is nested into another, no own handler thread */
 
 #define IRQF_MODIFY_MASK	\
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 3f2fcc1..4688911 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -48,6 +48,7 @@ enum {
  * IRQS_DISABLED		- irq is disabled
  * IRQS_PENDING			- irq is pending and replayed later
  * IRQS_MASKED			- irq is masked
+ * IRQS_SUSPENDED		- irq is suspended
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
@@ -60,6 +61,7 @@ enum {
 	IRQS_DISABLED		= 0x00000100,
 	IRQS_PENDING		= 0x00000200,
 	IRQS_MASKED		= 0x00000400,
+	IRQS_SUSPENDED		= 0x00000800,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 83fd201..b912de4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -326,7 +326,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
 	if (suspend) {
 		if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
 			return;
-		desc->status |= IRQ_SUSPENDED;
+		desc->istate |= IRQS_SUSPENDED;
 	}
 
 	if (!desc->depth++)
@@ -388,7 +388,7 @@ EXPORT_SYMBOL(disable_irq);
 void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 {
 	if (resume) {
-		if (!(desc->status & IRQ_SUSPENDED)) {
+		if (!(desc->istate & IRQS_SUSPENDED)) {
 			if (!desc->action)
 				return;
 			if (!(desc->action->flags & IRQF_FORCE_RESUME))
@@ -396,7 +396,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 			/* Pretend that it got disabled ! */
 			desc->depth++;
 		}
-		desc->status &= ~IRQ_SUSPENDED;
+		desc->istate &= ~IRQS_SUSPENDED;
 	}
 
 	switch (desc->depth) {
@@ -405,7 +405,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 		WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
 		break;
 	case 1: {
-		if (desc->status & IRQ_SUSPENDED)
+		if (desc->istate & IRQS_SUSPENDED)
 			goto err_out;
 		/* Prevent probing on this irq: */
 		desc->status |= IRQ_NOPROBE;
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index d738941..d81337f 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
  * During system-wide suspend or hibernation device drivers need to be prevented
  * from receiving interrupts and this function is provided for this purpose.
  * It marks all interrupt lines in use, except for the timer ones, as disabled
- * and sets the IRQ_SUSPENDED flag for each of them.
+ * and sets the IRQS_SUSPENDED flag for each of them.
  */
 void suspend_device_irqs(void)
 {
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
 	}
 
 	for_each_irq_desc(irq, desc)
-		if (desc->status & IRQ_SUSPENDED)
+		if (desc->istate & IRQS_SUSPENDED)
 			synchronize_irq(irq);
 }
 EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
  * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
  *
  * Enable all interrupt lines previously disabled by suspend_device_irqs() that
- * have the IRQ_SUSPENDED flag set.
+ * have the IRQS_SUSPENDED flag set.
  */
 void resume_device_irqs(void)
 {
-- 
cgit v0.10.2


From 6d2cd17fde1fc3e93302815f049f255bb2b3123e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 14:34:18 +0100
Subject: genirq: Move IRQ_WAKEUP to core

No users outside of core.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4688911..cef0849 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -49,6 +49,7 @@ enum {
  * IRQS_PENDING			- irq is pending and replayed later
  * IRQS_MASKED			- irq is masked
  * IRQS_SUSPENDED		- irq is suspended
+ * IRQS_WAKEUP			- irq triggers system wakeup from suspend
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
@@ -62,6 +63,7 @@ enum {
 	IRQS_PENDING		= 0x00000200,
 	IRQS_MASKED		= 0x00000400,
 	IRQS_SUSPENDED		= 0x00000800,
+	IRQS_WAKEUP		= 0x00001000,
 };
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b912de4..ccc9389 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -489,7 +489,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
 			if (ret)
 				desc->wake_depth = 0;
 			else
-				desc->status |= IRQ_WAKEUP;
+				desc->istate |= IRQS_WAKEUP;
 		}
 	} else {
 		if (desc->wake_depth == 0) {
@@ -499,7 +499,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
 			if (ret)
 				desc->wake_depth = 1;
 			else
-				desc->status &= ~IRQ_WAKEUP;
+				desc->istate &= ~IRQS_WAKEUP;
 		}
 	}
 
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index d81337f..f39383d 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -69,7 +69,7 @@ int check_wakeup_irqs(void)
 	int irq;
 
 	for_each_irq_desc(irq, desc)
-		if ((desc->status & IRQ_WAKEUP) &&
+		if ((desc->istate & IRQS_WAKEUP) &&
 		    (desc->istate & IRQS_PENDING))
 			return -EBUSY;
 
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 2cd45fd..ef09824 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -18,3 +18,5 @@ enum {
 #define IRQ_PENDING		GOT_YOU_MORON
 #undef IRQ_MASKED
 #define IRQ_MASKED		GOT_YOU_MORON
+#undef IRQ_WAKEUP
+#define IRQ_WAKEUP		GOT_YOU_MORON
-- 
cgit v0.10.2


From 91c499178139d6597e68db19638e4135510a34b8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 3 Feb 2011 20:48:29 +0100
Subject: genirq: Add state field to irq_data

Some chip implementations need to access certain status flags. With
sparse irqs that requires a lookup of the irq descriptor. Add a state
field which contains such flags.

Name it in a way which will make coders happy to access it with the
proper accessor functions. And it's easy to grep for.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3ce45c2..62bb08e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -102,6 +102,8 @@ struct msi_desc;
  * struct irq_data - per irq and irq chip data passed down to chip functions
  * @irq:		interrupt number
  * @node:		node index useful for balancing
+ * @state_use_accessor: status information for irq chip functions.
+ *			Use accessor functions to deal with it
  * @chip:		low level interrupt hardware access
  * @handler_data:	per-IRQ data for the irq_chip methods
  * @chip_data:		platform-specific per-chip private data for the chip
@@ -116,6 +118,7 @@ struct msi_desc;
 struct irq_data {
 	unsigned int		irq;
 	unsigned int		node;
+	unsigned int		state_use_accessors;
 	struct irq_chip		*chip;
 	void			*handler_data;
 	void			*chip_data;
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 782bf98..581d9665 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -48,6 +48,7 @@ struct irq_desc {
 		struct {
 			unsigned int		irq;
 			unsigned int		node;
+			unsigned int		pad_do_not_even_think_about_it;
 			struct irq_chip		*chip;
 			void			*handler_data;
 			void			*chip_data;
-- 
cgit v0.10.2


From f230b6d5c48f8d12f4dfa1f8b5ab0b0320076d21 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 5 Feb 2011 15:20:04 +0100
Subject: genirq: Add IRQ_MOVE_PENDING to irq_data.state

chip implementations need to know about it. Keep status in sync until
all users are fixed.

Accessor function: irqd_is_setaffinity_pending(irqdata)

Coders who access them directly will be tracked down and slapped with
stinking trouts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 62bb08e..2899905 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -58,15 +58,16 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_DISABLED		0x00000800	/* DEPRECATED */
 #define IRQ_PENDING		0x00001000	/* DEPRECATED */
 #define IRQ_MASKED		0x00002000	/* DEPRECATED */
+/* DEPRECATED use irq_setaffinity_pending() instead*/
+#define IRQ_MOVE_PENDING	0x00004000
 #endif
 
-#define IRQ_LEVEL		0x00004000	/* IRQ level triggered */
+#define IRQ_LEVEL		0x00008000	/* IRQ level triggered */
 #define IRQ_PER_CPU		0x00010000	/* IRQ is per CPU */
 #define IRQ_NOPROBE		0x00020000	/* IRQ is not valid for probing */
 #define IRQ_NOREQUEST		0x00040000	/* IRQ cannot be requested */
 #define IRQ_NOAUTOEN		0x00080000	/* IRQ will not be enabled on request irq */
 #define IRQ_WAKEUP		0x00100000	/* IRQ triggers system wakeup */
-#define IRQ_MOVE_PENDING	0x00200000	/* need to re-target IRQ destination */
 #define IRQ_NO_BALANCING	0x00400000	/* IRQ is excluded from balancing */
 #define IRQ_MOVE_PCNTXT		0x01000000	/* IRQ migration from process context */
 #define IRQ_AFFINITY_SET	0x02000000	/* IRQ affinity was set from userspace*/
@@ -128,6 +129,21 @@ struct irq_data {
 #endif
 };
 
+/*
+ * Bit masks for irq_data.state
+ *
+ * IRQD_SETAFFINITY_PENDING	- Affinity setting is pending
+ */
+enum {
+	/* Bit 0 - 7 reserved for TYPE will use later */
+	IRQD_SETAFFINITY_PENDING = (1 << 8),
+};
+
+static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
+{
+	return d->state_use_accessors & IRQD_SETAFFINITY_PENDING;
+}
+
 /**
  * struct irq_chip - hardware interrupt chip descriptor
  *
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
index 593abec..5e33aad 100644
--- a/kernel/irq/compat.h
+++ b/kernel/irq/compat.h
@@ -37,6 +37,15 @@ static inline void irq_compat_clr_masked(struct irq_desc *desc)
 {
 	desc->status &= ~IRQ_MASKED;
 }
+static inline void irq_compat_set_move_pending(struct irq_desc *desc)
+{
+	desc->status |= IRQ_MOVE_PENDING;
+}
+
+static inline void irq_compat_clr_move_pending(struct irq_desc *desc)
+{
+	desc->status &= ~IRQ_MOVE_PENDING;
+}
 #else
 static inline void irq_compat_set_progress(struct irq_desc *desc) { }
 static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
@@ -46,5 +55,7 @@ static inline void irq_compat_set_pending(struct irq_desc *desc) { }
 static inline void irq_compat_clr_pending(struct irq_desc *desc) { }
 static inline void irq_compat_set_masked(struct irq_desc *desc) { }
 static inline void irq_compat_clr_masked(struct irq_desc *desc) { }
+static inline void irq_compat_set_move_pending(struct irq_desc *desc) { }
+static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { }
 #endif
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index cef0849..e93e609 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -125,6 +125,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
 }
 
 /*
+ * Manipulation functions for irq_data.state
+ */
+static inline void irqd_set_move_pending(struct irq_data *d)
+{
+	d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
+	irq_compat_set_move_pending(irq_data_to_desc(d));
+}
+
+static inline void irqd_clr_move_pending(struct irq_data *d)
+{
+	d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
+	irq_compat_clr_move_pending(irq_data_to_desc(d));
+}
+
+/*
  * Debugging printout:
  */
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ccc9389..9a99c47 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -107,7 +107,7 @@ static inline bool irq_can_move_pcntxt(struct irq_desc *desc)
 }
 static inline bool irq_move_pending(struct irq_desc *desc)
 {
-	return desc->status & IRQ_MOVE_PENDING;
+	return irqd_is_setaffinity_pending(&desc->irq_data);
 }
 static inline void
 irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
@@ -156,7 +156,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
 			ret = 0;
 		}
 	} else {
-		desc->status |= IRQ_MOVE_PENDING;
+		irqd_set_move_pending(&desc->irq_data);
 		irq_copy_pending(desc, mask);
 	}
 
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 6f2f984..9485ae0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -9,7 +9,7 @@ void move_masked_irq(int irq)
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_chip *chip = desc->irq_data.chip;
 
-	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+	if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
 		return;
 
 	/*
@@ -20,7 +20,7 @@ void move_masked_irq(int irq)
 		return;
 	}
 
-	desc->status &= ~IRQ_MOVE_PENDING;
+	irqd_clr_move_pending(&desc->irq_data);
 
 	if (unlikely(cpumask_empty(desc->pending_mask)))
 		return;
@@ -58,7 +58,7 @@ void move_native_irq(int irq)
 	struct irq_desc *desc = irq_to_desc(irq);
 	bool masked;
 
-	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+	if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
 		return;
 
 	if (unlikely(desc->istate & IRQS_DISABLED))
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 2644923..afe4e68 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -25,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
 	const struct cpumask *mask = desc->irq_data.affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PENDING)
+	if (irqd_is_setaffinity_pending(&desc->irq_data))
 		mask = desc->pending_mask;
 #endif
 	seq_cpumask(m, mask);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index ef09824..bb104a2 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -20,3 +20,5 @@ enum {
 #define IRQ_MASKED		GOT_YOU_MORON
 #undef IRQ_WAKEUP
 #define IRQ_WAKEUP		GOT_YOU_MORON
+#undef IRQ_MOVE_PENDING
+#define IRQ_MOVE_PENDING	GOT_YOU_MORON
-- 
cgit v0.10.2


From 6a58fb3bad099076f36f0f30f44507bc3275cdb6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 15:40:05 +0100
Subject: genirq: Remove CONFIG_IRQ_PER_CPU

The saving of this switch is minimal versus the ifdef mess it
creates. Simple enable PER_CPU unconditionally and remove the config
switch.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2899905..ab708f2 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -78,13 +78,8 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 	 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
 	 IRQ_PER_CPU | IRQ_NESTED_THREAD)
 
-#ifdef CONFIG_IRQ_PER_CPU
 # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
 # define IRQ_NO_BALANCING_MASK	(IRQ_PER_CPU | IRQ_NO_BALANCING)
-#else
-# define CHECK_IRQ_PER_CPU(var) 0
-# define IRQ_NO_BALANCING_MASK	IRQ_NO_BALANCING
-#endif
 
 /*
  * Return value for chip->irq_set_affinity()
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 9e2256d..48ad25f 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -32,9 +32,6 @@ config GENERIC_PENDING_IRQ
 config AUTO_IRQ_AFFINITY
        def_bool n
 
-config IRQ_PER_CPU
-       def_bool n
-
 config HARDIRQS_SW_RESEND
        def_bool n
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e93e609..9e32b3d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -163,9 +163,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 	}
 
 	P(IRQ_LEVEL);
-#ifdef CONFIG_IRQ_PER_CPU
 	P(IRQ_PER_CPU);
-#endif
 	P(IRQ_NOPROBE);
 	P(IRQ_NOREQUEST);
 	P(IRQ_NOAUTOEN);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9a99c47..056aa49 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -865,12 +865,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			goto mismatch;
 		}
 
-#if defined(CONFIG_IRQ_PER_CPU)
 		/* All handlers must agree on per-cpuness */
 		if ((old->flags & IRQF_PERCPU) !=
 		    (new->flags & IRQF_PERCPU))
 			goto mismatch;
-#endif
 
 		/* add new interrupt at end of irq queue */
 		do {
@@ -894,15 +892,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 				goto out_mask;
 		} else
 			compat_irq_chip_set_default_handler(desc);
-#if defined(CONFIG_IRQ_PER_CPU)
-		if (new->flags & IRQF_PERCPU)
-			desc->status |= IRQ_PER_CPU;
-#endif
 
 		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
 				  IRQS_INPROGRESS | IRQS_ONESHOT | \
 				  IRQS_WAITING);
 
+		if (new->flags & IRQF_PERCPU)
+			desc->status |= IRQ_PER_CPU;
+
 		if (new->flags & IRQF_ONESHOT)
 			desc->istate |= IRQS_ONESHOT;
 
-- 
cgit v0.10.2


From 8f53f92404bead2ab2154d45c8f508880bb5d95d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 16:50:00 +0100
Subject: genirq: Make CHECK_IRQ_PER_CPU an inline and deprecate it

Its' too ugly and needs to go. The only users are core code and
parisc. Core code does not need it and parisc gets a new check once
IRQ_PER_CPU is reflected in irq_data.state.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index ab708f2..3f607ad 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -78,8 +78,12 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 	 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
 	 IRQ_PER_CPU | IRQ_NESTED_THREAD)
 
-# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
-# define IRQ_NO_BALANCING_MASK	(IRQ_PER_CPU | IRQ_NO_BALANCING)
+#define IRQ_NO_BALANCING_MASK	(IRQ_PER_CPU | IRQ_NO_BALANCING)
+
+static inline __deprecated bool CHECK_IRQ_PER_CPU(unsigned int status)
+{
+	return status & IRQ_PER_CPU;
+}
 
 /*
  * Return value for chip->irq_set_affinity()
-- 
cgit v0.10.2


From fae581e588e64a0690f3fc995e404fcacaebe772 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 16:53:24 +0100
Subject: genirq: Remove CHECK_IRQ_PER_CPU from core code

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 056aa49..f1cfa27 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
-	    !desc->irq_data.chip->irq_set_affinity)
+	if ((desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) ||
+	    !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
 		return 0;
 
 	return 1;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 9485ae0..24f53ca 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -15,7 +15,7 @@ void move_masked_irq(int irq)
 	/*
 	 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
 	 */
-	if (CHECK_IRQ_PER_CPU(desc->status)) {
+	if (desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) {
 		WARN_ON(1);
 		return;
 	}
-- 
cgit v0.10.2


From 1ce6068dac1924f7095be5850481e790cbf1b3c1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 9 Feb 2011 20:44:21 +0100
Subject: genirq: Move debug code to separate header

It'll break when I'm going to undefine the constants.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 0000000..d1a33b7
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,40 @@
+/*
+ * Debugging printout:
+ */
+
+#include <linux/kallsyms.h>
+
+#define P(f) if (desc->status & f) printk("%14s set\n", #f)
+#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
+
+static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
+{
+	printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
+		irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
+	printk("->handle_irq():  %p, ", desc->handle_irq);
+	print_symbol("%s\n", (unsigned long)desc->handle_irq);
+	printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
+	print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
+	printk("->action(): %p\n", desc->action);
+	if (desc->action) {
+		printk("->action->handler(): %p, ", desc->action->handler);
+		print_symbol("%s\n", (unsigned long)desc->action->handler);
+	}
+
+	P(IRQ_LEVEL);
+	P(IRQ_PER_CPU);
+	P(IRQ_NOPROBE);
+	P(IRQ_NOREQUEST);
+	P(IRQ_NOAUTOEN);
+
+	PS(IRQS_AUTODETECT);
+	PS(IRQS_INPROGRESS);
+	PS(IRQS_REPLAY);
+	PS(IRQS_WAITING);
+	PS(IRQS_DISABLED);
+	PS(IRQS_PENDING);
+	PS(IRQS_MASKED);
+}
+
+#undef P
+#undef PS
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 9e32b3d..b2ba59e 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -13,9 +13,6 @@
 # define IRQ_BITMAP_BITS	NR_IRQS
 #endif
 
-#include "compat.h"
-#include "settings.h"
-
 #define istate core_internal_state__do_not_mess_with_it
 
 extern int noirqdebug;
@@ -66,6 +63,10 @@ enum {
 	IRQS_WAKEUP		= 0x00001000,
 };
 
+#include "compat.h"
+#include "debug.h"
+#include "settings.h"
+
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
 
 /* Set default functions for irq_chip structures: */
@@ -138,44 +139,3 @@ static inline void irqd_clr_move_pending(struct irq_data *d)
 	d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
 	irq_compat_clr_move_pending(irq_data_to_desc(d));
 }
-
-/*
- * Debugging printout:
- */
-
-#include <linux/kallsyms.h>
-
-#define P(f) if (desc->status & f) printk("%14s set\n", #f)
-#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
-
-static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
-{
-	printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
-		irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
-	printk("->handle_irq():  %p, ", desc->handle_irq);
-	print_symbol("%s\n", (unsigned long)desc->handle_irq);
-	printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
-	print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
-	printk("->action(): %p\n", desc->action);
-	if (desc->action) {
-		printk("->action->handler(): %p, ", desc->action->handler);
-		print_symbol("%s\n", (unsigned long)desc->action->handler);
-	}
-
-	P(IRQ_LEVEL);
-	P(IRQ_PER_CPU);
-	P(IRQ_NOPROBE);
-	P(IRQ_NOREQUEST);
-	P(IRQ_NOAUTOEN);
-
-	PS(IRQS_AUTODETECT);
-	PS(IRQS_INPROGRESS);
-	PS(IRQS_REPLAY);
-	PS(IRQS_WAITING);
-	PS(IRQS_DISABLED);
-	PS(IRQS_PENDING);
-	PS(IRQS_MASKED);
-}
-
-#undef P
-#undef PS
-- 
cgit v0.10.2


From a005677b3dd05decdd8880cf3044ae709856f58f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 17:11:03 +0100
Subject: genirq: Mirror IRQ_PER_CPU and IRQ_NO_BALANCING in irq_data.state

That's the right data structure to look at for arch code.

Accessor functions are provided.

	 irqd_is_per_cpu(irqdata);
	 irqd_can_balance(irqdata);

Coders who access them directly will be tracked down and slapped with
stinking trouts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3f607ad..d5312e6 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -132,10 +132,14 @@ struct irq_data {
  * Bit masks for irq_data.state
  *
  * IRQD_SETAFFINITY_PENDING	- Affinity setting is pending
+ * IRQD_NO_BALANCING		- Balancing disabled for this IRQ
+ * IRQD_PER_CPU			- Interrupt is per cpu
  */
 enum {
 	/* Bit 0 - 7 reserved for TYPE will use later */
-	IRQD_SETAFFINITY_PENDING = (1 << 8),
+	IRQD_SETAFFINITY_PENDING	= (1 <<  8),
+	IRQD_NO_BALANCING		= (1 << 10),
+	IRQD_PER_CPU			= (1 << 11),
 };
 
 static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
@@ -143,6 +147,16 @@ static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
 	return d->state_use_accessors & IRQD_SETAFFINITY_PENDING;
 }
 
+static inline bool irqd_is_per_cpu(struct irq_data *d)
+{
+	return d->state_use_accessors & IRQD_PER_CPU;
+}
+
+static inline bool irqd_can_balance(struct irq_data *d)
+{
+	return !(d->state_use_accessors & (IRQD_PER_CPU | IRQD_NO_BALANCING));
+}
+
 /**
  * struct irq_chip - hardware interrupt chip descriptor
  *
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 73b2e7e..b8aa3df 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -706,12 +706,15 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 	if (!desc)
 		return;
 
-	/* Sanitize flags */
-	set &= IRQF_MODIFY_MASK;
-	clr &= IRQF_MODIFY_MASK;
-
 	raw_spin_lock_irqsave(&desc->lock, flags);
-	desc->status &= ~clr;
-	desc->status |= set;
+
+	irq_settings_clr_and_set(desc, clr, set);
+
+	irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU);
+	if (irq_settings_has_no_balance_set(desc))
+		irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+	if (irq_settings_is_per_cpu(desc))
+		irqd_set(&desc->irq_data, IRQD_PER_CPU);
+
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2ba59e..a80b44d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -139,3 +139,14 @@ static inline void irqd_clr_move_pending(struct irq_data *d)
 	d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
 	irq_compat_clr_move_pending(irq_data_to_desc(d));
 }
+
+static inline void irqd_clear(struct irq_data *d, unsigned int mask)
+{
+	d->state_use_accessors &= ~mask;
+}
+
+static inline void irqd_set(struct irq_data *d, unsigned int mask)
+{
+	d->state_use_accessors |= mask;
+}
+
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f1cfa27..84a0a9c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if ((desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) ||
-	    !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
+	if (!irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip ||
+	    !desc->irq_data.chip->irq_set_affinity)
 		return 0;
 
 	return 1;
@@ -897,8 +897,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 				  IRQS_INPROGRESS | IRQS_ONESHOT | \
 				  IRQS_WAITING);
 
-		if (new->flags & IRQF_PERCPU)
-			desc->status |= IRQ_PER_CPU;
+		if (new->flags & IRQF_PERCPU) {
+			irqd_set(&desc->irq_data, IRQD_PER_CPU);
+			irq_settings_set_per_cpu(desc);
+		}
 
 		if (new->flags & IRQF_ONESHOT)
 			desc->istate |= IRQS_ONESHOT;
@@ -910,8 +912,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			desc->depth = 1;
 
 		/* Exclude IRQ from balancing if requested */
-		if (new->flags & IRQF_NOBALANCING)
-			desc->status |= IRQ_NO_BALANCING;
+		if (new->flags & IRQF_NOBALANCING) {
+			irq_settings_set_no_balancing(desc);
+			irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+		}
 
 		/* Set default affinity mask once everything is setup */
 		setup_affinity(irq, desc, mask);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 24f53ca..7a93c6b 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -15,7 +15,7 @@ void move_masked_irq(int irq)
 	/*
 	 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
 	 */
-	if (desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) {
+	if (!irqd_can_balance(&desc->irq_data)) {
 		WARN_ON(1);
 		return;
 	}
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index bb104a2..ba0fffe 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -4,6 +4,9 @@
  */
 enum {
 	_IRQ_DEFAULT_INIT_FLAGS	= IRQ_DEFAULT_INIT_FLAGS,
+	_IRQ_PER_CPU		= IRQ_PER_CPU,
+	_IRQ_NO_BALANCING	= IRQ_NO_BALANCING,
+	_IRQF_MODIFY_MASK	= IRQF_MODIFY_MASK,
 };
 
 #undef IRQ_INPROGRESS
@@ -22,3 +25,36 @@ enum {
 #define IRQ_WAKEUP		GOT_YOU_MORON
 #undef IRQ_MOVE_PENDING
 #define IRQ_MOVE_PENDING	GOT_YOU_MORON
+#undef IRQ_PER_CPU
+#define IRQ_PER_CPU		GOT_YOU_MORON
+#undef IRQ_NO_BALANCING
+#define IRQ_NO_BALANCING	GOT_YOU_MORON
+#undef IRQF_MODIFY_MASK
+#define IRQF_MODIFY_MASK	GOT_YOU_MORON
+
+static inline void
+irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
+{
+	desc->status &= ~(clr & _IRQF_MODIFY_MASK);
+	desc->status |= (set & _IRQF_MODIFY_MASK);
+}
+
+static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
+{
+	return desc->status & _IRQ_PER_CPU;
+}
+
+static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
+{
+	desc->status |= _IRQ_PER_CPU;
+}
+
+static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
+{
+	desc->status |= _IRQ_NO_BALANCING;
+}
+
+static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
+{
+	return desc->status & _IRQ_NO_BALANCING;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 692ce2b..226ed7d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -68,7 +68,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 	raw_spin_lock(&desc->lock);
 
 	/* PER_CPU and nested thread interrupts are never polled */
-	if (desc->status & (IRQ_PER_CPU | IRQ_NESTED_THREAD))
+	if (irq_settings_is_per_cpu(desc) ||
+	    (desc->status & IRQ_NESTED_THREAD))
 		goto out;
 
 	/*
-- 
cgit v0.10.2


From bce43032ad79fae0ce5b6174ce1321e643ceb54b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 22:37:41 +0100
Subject: genirq: Reuse existing can set affinty check

Add a !desc check while at it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 84a0a9c..550ae97 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (!irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip ||
-	    !desc->irq_data.chip->irq_set_affinity)
+	if (!desc || !irqd_can_balance(&desc->irq_data) ||
+	    !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
 		return 0;
 
 	return 1;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index afe4e68..4cc2e5e 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -66,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
 	cpumask_var_t new_value;
 	int err;
 
-	if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
-	    irq_balancing_disabled(irq))
+	if (!irq_can_set_affinity(irq) || no_irq_affinity)
 		return -EIO;
 
 	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
-- 
cgit v0.10.2


From 2bdd10558c8d93009cb6c32ce9e30800fbb08add Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 17:22:00 +0100
Subject: genirq: Move IRQ_AFFINITY_SET to core

Keep status in sync until last abuser is gone.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d5312e6..8da1782 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -60,6 +60,7 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_MASKED		0x00002000	/* DEPRECATED */
 /* DEPRECATED use irq_setaffinity_pending() instead*/
 #define IRQ_MOVE_PENDING	0x00004000
+#define IRQ_AFFINITY_SET	0x02000000	/* DEPRECATED */
 #endif
 
 #define IRQ_LEVEL		0x00008000	/* IRQ level triggered */
@@ -70,7 +71,6 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_WAKEUP		0x00100000	/* IRQ triggers system wakeup */
 #define IRQ_NO_BALANCING	0x00400000	/* IRQ is excluded from balancing */
 #define IRQ_MOVE_PCNTXT		0x01000000	/* IRQ migration from process context */
-#define IRQ_AFFINITY_SET	0x02000000	/* IRQ affinity was set from userspace*/
 #define IRQ_NESTED_THREAD	0x10000000	/* IRQ is nested into another, no own handler thread */
 
 #define IRQF_MODIFY_MASK	\
@@ -134,12 +134,14 @@ struct irq_data {
  * IRQD_SETAFFINITY_PENDING	- Affinity setting is pending
  * IRQD_NO_BALANCING		- Balancing disabled for this IRQ
  * IRQD_PER_CPU			- Interrupt is per cpu
+ * IRQD_AFFINITY_SET		- Interrupt affinity was set
  */
 enum {
 	/* Bit 0 - 7 reserved for TYPE will use later */
 	IRQD_SETAFFINITY_PENDING	= (1 <<  8),
 	IRQD_NO_BALANCING		= (1 << 10),
 	IRQD_PER_CPU			= (1 << 11),
+	IRQD_AFFINITY_SET		= (1 << 12),
 };
 
 static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
@@ -157,6 +159,11 @@ static inline bool irqd_can_balance(struct irq_data *d)
 	return !(d->state_use_accessors & (IRQD_PER_CPU | IRQD_NO_BALANCING));
 }
 
+static inline bool irqd_affinity_was_set(struct irq_data *d)
+{
+	return d->state_use_accessors & IRQD_AFFINITY_SET;
+}
+
 /**
  * struct irq_chip - hardware interrupt chip descriptor
  *
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
index 5e33aad..6bbaf66 100644
--- a/kernel/irq/compat.h
+++ b/kernel/irq/compat.h
@@ -46,6 +46,15 @@ static inline void irq_compat_clr_move_pending(struct irq_desc *desc)
 {
 	desc->status &= ~IRQ_MOVE_PENDING;
 }
+static inline void irq_compat_set_affinity(struct irq_desc *desc)
+{
+	desc->status |= IRQ_AFFINITY_SET;
+}
+
+static inline void irq_compat_clr_affinity(struct irq_desc *desc)
+{
+	desc->status &= ~IRQ_AFFINITY_SET;
+}
 #else
 static inline void irq_compat_set_progress(struct irq_desc *desc) { }
 static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
@@ -57,5 +66,7 @@ static inline void irq_compat_set_masked(struct irq_desc *desc) { }
 static inline void irq_compat_clr_masked(struct irq_desc *desc) { }
 static inline void irq_compat_set_move_pending(struct irq_desc *desc) { }
 static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { }
+static inline void irq_compat_set_affinity(struct irq_desc *desc) { }
+static inline void irq_compat_clr_affinity(struct irq_desc *desc) { }
 #endif
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a80b44d..6776453 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -150,3 +150,7 @@ static inline void irqd_set(struct irq_data *d, unsigned int mask)
 	d->state_use_accessors |= mask;
 }
 
+static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
+{
+	return d->state_use_accessors & mask;
+}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 550ae97..8246afc 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -164,7 +164,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
 		kref_get(&desc->affinity_notify->kref);
 		schedule_work(&desc->affinity_notify->work);
 	}
-	desc->status |= IRQ_AFFINITY_SET;
+	irq_compat_set_affinity(desc);
+	irqd_set(&desc->irq_data, IRQD_AFFINITY_SET);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return ret;
 }
@@ -272,12 +273,14 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 	 * Preserve an userspace affinity setup, but make sure that
 	 * one of the targets is online.
 	 */
-	if (desc->status & (IRQ_AFFINITY_SET)) {
+	if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
 		if (cpumask_intersects(desc->irq_data.affinity,
 				       cpu_online_mask))
 			set = desc->irq_data.affinity;
-		else
-			desc->status &= ~IRQ_AFFINITY_SET;
+		else {
+			irq_compat_clr_affinity(desc);
+			irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
+		}
 	}
 
 	cpumask_and(mask, cpu_online_mask, set);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index ba0fffe..da5acb4 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -29,6 +29,8 @@ enum {
 #define IRQ_PER_CPU		GOT_YOU_MORON
 #undef IRQ_NO_BALANCING
 #define IRQ_NO_BALANCING	GOT_YOU_MORON
+#undef IRQ_AFFINITY_SET
+#define IRQ_AFFINITY_SET	GOT_YOU_MORON
 #undef IRQF_MODIFY_MASK
 #define IRQF_MODIFY_MASK	GOT_YOU_MORON
 
-- 
cgit v0.10.2


From 876dbd4cc1b35c1a4cb96a2be1d43ea0eabce3b4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 17:28:12 +0100
Subject: genirq: Mirror irq trigger type bits in irq_data.state

That's the data structure chip functions get provided. Also allow them
to signal the core code that they updated the flags in irq_data.state
by returning IRQ_SET_MASK_OK_NOCOPY. The default is unchanged.

The type bits should be accessed via:

val = irqd_get_trigger_type(irqdata);
and
irqd_set_trigger_type(irqdata, val);

Coders who access them directly will be tracked down and slapped with
stinking trouts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8da1782..be73c0a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -46,7 +46,9 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_TYPE_EDGE_BOTH (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING)
 #define IRQ_TYPE_LEVEL_HIGH	0x00000004	/* Level high type */
 #define IRQ_TYPE_LEVEL_LOW	0x00000008	/* Level low type */
+#define IRQ_TYPE_LEVEL_MASK	(IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)
 #define IRQ_TYPE_SENSE_MASK	0x0000000f	/* Mask of the above */
+
 #define IRQ_TYPE_PROBE		0x00000010	/* Probing in progress */
 
 /* Internal flags */
@@ -131,17 +133,20 @@ struct irq_data {
 /*
  * Bit masks for irq_data.state
  *
+ * IRQD_TRIGGER_MASK		- Mask for the trigger type bits
  * IRQD_SETAFFINITY_PENDING	- Affinity setting is pending
  * IRQD_NO_BALANCING		- Balancing disabled for this IRQ
  * IRQD_PER_CPU			- Interrupt is per cpu
  * IRQD_AFFINITY_SET		- Interrupt affinity was set
+ * IRQD_LEVEL			- Interrupt is level triggered
  */
 enum {
-	/* Bit 0 - 7 reserved for TYPE will use later */
+	IRQD_TRIGGER_MASK		= 0xf,
 	IRQD_SETAFFINITY_PENDING	= (1 <<  8),
 	IRQD_NO_BALANCING		= (1 << 10),
 	IRQD_PER_CPU			= (1 << 11),
 	IRQD_AFFINITY_SET		= (1 << 12),
+	IRQD_LEVEL			= (1 << 13),
 };
 
 static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
@@ -164,6 +169,25 @@ static inline bool irqd_affinity_was_set(struct irq_data *d)
 	return d->state_use_accessors & IRQD_AFFINITY_SET;
 }
 
+static inline u32 irqd_get_trigger_type(struct irq_data *d)
+{
+	return d->state_use_accessors & IRQD_TRIGGER_MASK;
+}
+
+/*
+ * Must only be called inside irq_chip.irq_set_type() functions.
+ */
+static inline void irqd_set_trigger_type(struct irq_data *d, u32 type)
+{
+	d->state_use_accessors &= ~IRQD_TRIGGER_MASK;
+	d->state_use_accessors |= type & IRQD_TRIGGER_MASK;
+}
+
+static inline bool irqd_is_level_type(struct irq_data *d)
+{
+	return d->state_use_accessors & IRQD_LEVEL;
+}
+
 /**
  * struct irq_chip - hardware interrupt chip descriptor
  *
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b8aa3df..9c9b573 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -710,11 +710,14 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 
 	irq_settings_clr_and_set(desc, clr, set);
 
-	irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU);
+	irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
+		   IRQD_TRIGGER_MASK | IRQD_LEVEL);
 	if (irq_settings_has_no_balance_set(desc))
 		irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
 	if (irq_settings_is_per_cpu(desc))
 		irqd_set(&desc->irq_data, IRQD_PER_CPU);
 
+	irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8246afc..9ae758e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -567,23 +567,32 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		return 0;
 	}
 
+	flags &= IRQ_TYPE_SENSE_MASK;
 	/* caller masked out all except trigger mode flags */
 	ret = chip->irq_set_type(&desc->irq_data, flags);
 
-	if (ret)
-		pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
-		       flags, irq, chip->irq_set_type);
-	else {
-		if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
-			flags |= IRQ_LEVEL;
-		/* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
-		desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
-		desc->status |= flags;
+	switch (ret) {
+	case IRQ_SET_MASK_OK:
+		irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
+		irqd_set(&desc->irq_data, flags);
+
+	case IRQ_SET_MASK_OK_NOCOPY:
+		flags = irqd_get_trigger_type(&desc->irq_data);
+		irq_settings_set_trigger_mask(desc, flags);
+		irqd_clear(&desc->irq_data, IRQD_LEVEL);
+		irq_settings_clr_level(desc);
+		if (flags & IRQ_TYPE_LEVEL_MASK) {
+			irq_settings_set_level(desc);
+			irqd_set(&desc->irq_data, IRQD_LEVEL);
+		}
 
 		if (chip != desc->irq_data.chip)
 			irq_chip_set_defaults(desc->irq_data.chip);
+		return 0;
+	default:
+		pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
+		       flags, irq, chip->irq_set_type);
 	}
-
 	return ret;
 }
 
@@ -923,13 +932,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		/* Set default affinity mask once everything is setup */
 		setup_affinity(irq, desc, mask);
 
-	} else if ((new->flags & IRQF_TRIGGER_MASK)
-			&& (new->flags & IRQF_TRIGGER_MASK)
-				!= (desc->status & IRQ_TYPE_SENSE_MASK)) {
-		/* hope the handler works with the actual trigger mode... */
-		pr_warning("IRQ %d uses trigger mode %d; requested %d\n",
-				irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK),
-				(int)(new->flags & IRQF_TRIGGER_MASK));
+	} else if (new->flags & IRQF_TRIGGER_MASK) {
+		unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
+		unsigned int omsk = irq_settings_get_trigger_mask(desc);
+
+		if (nmsk != omsk)
+			/* hope the handler works with current  trigger mode */
+			pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
+				   irq, nmsk, omsk);
 	}
 
 	new->irq = irq;
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index ff1fea0..ad683a9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
 	 * interrupts are resent by hardware when they are still
 	 * active.
 	 */
-	if (desc->status & IRQ_LEVEL)
+	if (irq_settings_is_level(desc))
 		return;
 	if (desc->istate & IRQS_REPLAY)
 		return;
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index da5acb4..2201f2a 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -5,6 +5,7 @@
 enum {
 	_IRQ_DEFAULT_INIT_FLAGS	= IRQ_DEFAULT_INIT_FLAGS,
 	_IRQ_PER_CPU		= IRQ_PER_CPU,
+	_IRQ_LEVEL		= IRQ_LEVEL,
 	_IRQ_NO_BALANCING	= IRQ_NO_BALANCING,
 	_IRQF_MODIFY_MASK	= IRQF_MODIFY_MASK,
 };
@@ -31,6 +32,8 @@ enum {
 #define IRQ_NO_BALANCING	GOT_YOU_MORON
 #undef IRQ_AFFINITY_SET
 #define IRQ_AFFINITY_SET	GOT_YOU_MORON
+#undef IRQ_LEVEL
+#define IRQ_LEVEL		GOT_YOU_MORON
 #undef IRQF_MODIFY_MASK
 #define IRQF_MODIFY_MASK	GOT_YOU_MORON
 
@@ -60,3 +63,30 @@ static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
 {
 	return desc->status & _IRQ_NO_BALANCING;
 }
+
+static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
+{
+	return desc->status & IRQ_TYPE_SENSE_MASK;
+}
+
+static inline void
+irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
+{
+	desc->status &= ~IRQ_TYPE_SENSE_MASK;
+	desc->status |= mask & IRQ_TYPE_SENSE_MASK;
+}
+
+static inline bool irq_settings_is_level(struct irq_desc *desc)
+{
+	return desc->status & _IRQ_LEVEL;
+}
+
+static inline void irq_settings_clr_level(struct irq_desc *desc)
+{
+	desc->status &= ~_IRQ_LEVEL;
+}
+
+static inline void irq_settings_set_level(struct irq_desc *desc)
+{
+	desc->status |= _IRQ_LEVEL;
+}
-- 
cgit v0.10.2


From 1ccb4e612f68ceefb888c2c6c1def6294ea8666d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 9 Feb 2011 14:44:17 +0100
Subject: genirq: Wrap the remaning IRQ_* flags

Use wrappers to keep them away from the core code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index aab64c2..c8bbc4f 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -45,7 +45,7 @@ unsigned long probe_irq_on(void)
 	 */
 	for_each_irq_desc_reverse(i, desc) {
 		raw_spin_lock_irq(&desc->lock);
-		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
+		if (!desc->action && irq_settings_can_probe(desc)) {
 			/*
 			 * An old-style architecture might still have
 			 * the handle_bad_irq handler there:
@@ -74,7 +74,7 @@ unsigned long probe_irq_on(void)
 	 */
 	for_each_irq_desc_reverse(i, desc) {
 		raw_spin_lock_irq(&desc->lock);
-		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
+		if (!desc->action && irq_settings_can_probe(desc)) {
 			desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
 			if (irq_startup(desc)) {
 				irq_compat_set_pending(desc);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 9c9b573..9e9220d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -674,7 +674,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 	desc->name = name;
 
 	if (handle != handle_bad_irq && is_chained) {
-		desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
+		irq_settings_set_noprobe(desc);
+		irq_settings_set_norequest(desc);
 		irq_startup(desc);
 	}
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9ae758e..b5de828 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -103,7 +103,7 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 static inline bool irq_can_move_pcntxt(struct irq_desc *desc)
 {
-	return desc->status & IRQ_MOVE_PCNTXT;
+	return irq_settings_can_move_pcntxt(desc);
 }
 static inline bool irq_move_pending(struct irq_desc *desc)
 {
@@ -411,7 +411,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 		if (desc->istate & IRQS_SUSPENDED)
 			goto err_out;
 		/* Prevent probing on this irq: */
-		desc->status |= IRQ_NOPROBE;
+		irq_settings_set_noprobe(desc);
 		irq_enable(desc);
 		check_irq_resend(desc, irq);
 		/* fall-through */
@@ -526,7 +526,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
 	if (!desc)
 		return 0;
 
-	if (desc->status & IRQ_NOREQUEST)
+	if (!irq_settings_can_request(desc))
 		return 0;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
@@ -820,7 +820,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	 * Check whether the interrupt nests into another interrupt
 	 * thread.
 	 */
-	nested = desc->status & IRQ_NESTED_THREAD;
+	nested = irq_settings_is_nested_thread(desc);
 	if (nested) {
 		if (!new->thread_fn)
 			return -EINVAL;
@@ -917,7 +917,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		if (new->flags & IRQF_ONESHOT)
 			desc->istate |= IRQS_ONESHOT;
 
-		if (!(desc->status & IRQ_NOAUTOEN))
+		if (irq_settings_can_autoenable(desc))
 			irq_startup(desc);
 		else
 			/* Undo nested disables: */
@@ -1217,7 +1217,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	if (!desc)
 		return -EINVAL;
 
-	if (desc->status & IRQ_NOREQUEST)
+	if (!irq_settings_can_request(desc))
 		return -EINVAL;
 
 	if (!handler) {
@@ -1292,7 +1292,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
 	if (!desc)
 		return -EINVAL;
 
-	if (desc->status & IRQ_NESTED_THREAD) {
+	if (irq_settings_is_nested_thread(desc)) {
 		ret = request_threaded_irq(irq, NULL, handler,
 					   flags, name, dev_id);
 		return !ret ? IRQC_IS_NESTED : ret;
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 2201f2a..216b6f2 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -6,7 +6,12 @@ enum {
 	_IRQ_DEFAULT_INIT_FLAGS	= IRQ_DEFAULT_INIT_FLAGS,
 	_IRQ_PER_CPU		= IRQ_PER_CPU,
 	_IRQ_LEVEL		= IRQ_LEVEL,
+	_IRQ_NOPROBE		= IRQ_NOPROBE,
+	_IRQ_NOREQUEST		= IRQ_NOREQUEST,
+	_IRQ_NOAUTOEN		= IRQ_NOAUTOEN,
+	_IRQ_MOVE_PCNTXT	= IRQ_MOVE_PCNTXT,
 	_IRQ_NO_BALANCING	= IRQ_NO_BALANCING,
+	_IRQ_NESTED_THREAD	= IRQ_NESTED_THREAD,
 	_IRQF_MODIFY_MASK	= IRQF_MODIFY_MASK,
 };
 
@@ -34,6 +39,14 @@ enum {
 #define IRQ_AFFINITY_SET	GOT_YOU_MORON
 #undef IRQ_LEVEL
 #define IRQ_LEVEL		GOT_YOU_MORON
+#undef IRQ_NOPROBE
+#define IRQ_NOPROBE		GOT_YOU_MORON
+#undef IRQ_NOREQUEST
+#define IRQ_NOREQUEST		GOT_YOU_MORON
+#undef IRQ_NOAUTOEN
+#define IRQ_NOAUTOEN		GOT_YOU_MORON
+#undef IRQ_NESTED_THREAD
+#define IRQ_NESTED_THREAD	GOT_YOU_MORON
 #undef IRQF_MODIFY_MASK
 #define IRQF_MODIFY_MASK	GOT_YOU_MORON
 
@@ -90,3 +103,48 @@ static inline void irq_settings_set_level(struct irq_desc *desc)
 {
 	desc->status |= _IRQ_LEVEL;
 }
+
+static inline bool irq_settings_can_request(struct irq_desc *desc)
+{
+	return !(desc->status & _IRQ_NOREQUEST);
+}
+
+static inline void irq_settings_clr_norequest(struct irq_desc *desc)
+{
+	desc->status &= ~_IRQ_NOREQUEST;
+}
+
+static inline void irq_settings_set_norequest(struct irq_desc *desc)
+{
+	desc->status |= _IRQ_NOREQUEST;
+}
+
+static inline bool irq_settings_can_probe(struct irq_desc *desc)
+{
+	return !(desc->status & _IRQ_NOPROBE);
+}
+
+static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
+{
+	desc->status &= ~_IRQ_NOPROBE;
+}
+
+static inline void irq_settings_set_noprobe(struct irq_desc *desc)
+{
+	desc->status |= _IRQ_NOPROBE;
+}
+
+static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
+{
+	return desc->status & _IRQ_MOVE_PCNTXT;
+}
+
+static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
+{
+	return !(desc->status & _IRQ_NOAUTOEN);
+}
+
+static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
+{
+	return desc->status & _IRQ_NESTED_THREAD;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 226ed7d..dd586eb 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -68,8 +68,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
 	raw_spin_lock(&desc->lock);
 
 	/* PER_CPU and nested thread interrupts are never polled */
-	if (irq_settings_is_per_cpu(desc) ||
-	    (desc->status & IRQ_NESTED_THREAD))
+	if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
 		goto out;
 
 	/*
-- 
cgit v0.10.2


From f9e4989eb8183a1f33581fa1b99274287b0639d2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 9 Feb 2011 14:54:49 +0100
Subject: genirq: Force wrapped access to desc->status in core code

Force the usage of wrappers by another nasty CPP substitution.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6e34bdb..cb62e2d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -55,7 +55,7 @@ irqreturn_t
 handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 {
 	irqreturn_t ret, retval = IRQ_NONE;
-	unsigned int status = 0, irq = desc->irq_data.irq;
+	unsigned int random = 0, irq = desc->irq_data.irq;
 
 	do {
 		trace_irq_handler_entry(irq, action);
@@ -98,7 +98,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 
 			/* Fall through to add to randomness */
 		case IRQ_HANDLED:
-			status |= action->flags;
+			random |= action->flags;
 			break;
 
 		default:
@@ -109,7 +109,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 		action = action->next;
 	} while (action);
 
-	if (status & IRQF_SAMPLE_RANDOM)
+	if (random & IRQF_SAMPLE_RANDOM)
 		add_interrupt_randomness(irq);
 
 	if (!noirqdebug)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 78866d0..3387fbd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -79,7 +79,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
 	desc->irq_data.chip_data = NULL;
 	desc->irq_data.handler_data = NULL;
 	desc->irq_data.msi_desc = NULL;
-	desc->status = _IRQ_DEFAULT_INIT_FLAGS;
+	irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
 	desc->istate = IRQS_DISABLED;
 	desc->handle_irq = handle_bad_irq;
 	desc->depth = 1;
@@ -247,7 +247,6 @@ int __init early_irq_init(void)
 
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
-		.status		= _IRQ_DEFAULT_INIT_FLAGS,
 		.istate		= IRQS_DISABLED,
 		.handle_irq	= handle_bad_irq,
 		.depth		= 1,
@@ -271,6 +270,7 @@ int __init early_irq_init(void)
 		desc[i].irq_data.irq = i;
 		desc[i].irq_data.chip = &no_irq_chip;
 		desc[i].kstat_irqs = alloc_percpu(unsigned int);
+		irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
 		alloc_masks(desc + i, GFP_KERNEL, node);
 		desc_smp_init(desc + i, node);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 216b6f2..47bcd3b 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -148,3 +148,6 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
 {
 	return desc->status & _IRQ_NESTED_THREAD;
 }
+
+/* Nothing should touch desc->status from now on */
+#define status		USE_THE_PROPER_WRAPPERS_YOU_MORON
-- 
cgit v0.10.2


From 5d4d8fc9ac3e9a90bbdf90bae6864cb2c01f2208 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Feb 2011 17:27:18 +0100
Subject: genirq: Cleanup irq.h

Put the constants into an enum and document them.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index be73c0a..2e3d1e5 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -36,44 +36,73 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 /*
  * IRQ line status.
  *
- * Bits 0-7 are reserved for the IRQF_* bits in linux/interrupt.h
+ * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h
+ *
+ * IRQ_TYPE_NONE		- default, unspecified type
+ * IRQ_TYPE_EDGE_RISING		- rising edge triggered
+ * IRQ_TYPE_EDGE_FALLING	- falling edge triggered
+ * IRQ_TYPE_EDGE_BOTH		- rising and falling edge triggered
+ * IRQ_TYPE_LEVEL_HIGH		- high level triggered
+ * IRQ_TYPE_LEVEL_LOW		- low level triggered
+ * IRQ_TYPE_LEVEL_MASK		- Mask to filter out the level bits
+ * IRQ_TYPE_SENSE_MASK		- Mask for all the above bits
+ * IRQ_TYPE_PROBE		- Special flag for probing in progress
+ *
+ * Bits which can be modified via irq_set/clear/modify_status_flags()
+ * IRQ_LEVEL			- Interrupt is level type. Will be also
+ *				  updated in the code when the above trigger
+ *				  bits are modified via set_irq_type()
+ * IRQ_PER_CPU			- Mark an interrupt PER_CPU. Will protect
+ *				  it from affinity setting
+ * IRQ_NOPROBE			- Interrupt cannot be probed by autoprobing
+ * IRQ_NOREQUEST		- Interrupt cannot be requested via
+ *				  request_irq()
+ * IRQ_NOAUTOEN			- Interrupt is not automatically enabled in
+ *				  request/setup_irq()
+ * IRQ_NO_BALANCING		- Interrupt cannot be balanced (affinity set)
+ * IRQ_MOVE_PCNTXT		- Interrupt can be migrated from process context
+ * IRQ_NESTED_TRHEAD		- Interrupt nests into another thread
+ *
+ * Deprecated bits. They are kept updated as long as
+ * CONFIG_GENERIC_HARDIRQS_NO_COMPAT is not set. Will go away soon. These bits
+ * are internal state of the core code and if you really need to acces
+ * them then talk to the genirq maintainer instead of hacking
+ * something weird.
  *
- * IRQ types
  */
-#define IRQ_TYPE_NONE		0x00000000	/* Default, unspecified type */
-#define IRQ_TYPE_EDGE_RISING	0x00000001	/* Edge rising type */
-#define IRQ_TYPE_EDGE_FALLING	0x00000002	/* Edge falling type */
-#define IRQ_TYPE_EDGE_BOTH (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING)
-#define IRQ_TYPE_LEVEL_HIGH	0x00000004	/* Level high type */
-#define IRQ_TYPE_LEVEL_LOW	0x00000008	/* Level low type */
-#define IRQ_TYPE_LEVEL_MASK	(IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)
-#define IRQ_TYPE_SENSE_MASK	0x0000000f	/* Mask of the above */
-
-#define IRQ_TYPE_PROBE		0x00000010	/* Probing in progress */
-
-/* Internal flags */
+enum {
+	IRQ_TYPE_NONE		= 0x00000000,
+	IRQ_TYPE_EDGE_RISING	= 0x00000001,
+	IRQ_TYPE_EDGE_FALLING	= 0x00000002,
+	IRQ_TYPE_EDGE_BOTH	= (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING),
+	IRQ_TYPE_LEVEL_HIGH	= 0x00000004,
+	IRQ_TYPE_LEVEL_LOW	= 0x00000008,
+	IRQ_TYPE_LEVEL_MASK	= (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH),
+	IRQ_TYPE_SENSE_MASK	= 0x0000000f,
+
+	IRQ_TYPE_PROBE		= 0x00000010,
+
+	IRQ_LEVEL		= (1 <<  8),
+	IRQ_PER_CPU		= (1 <<  9),
+	IRQ_NOPROBE		= (1 << 10),
+	IRQ_NOREQUEST		= (1 << 11),
+	IRQ_NOAUTOEN		= (1 << 12),
+	IRQ_NO_BALANCING	= (1 << 13),
+	IRQ_MOVE_PCNTXT		= (1 << 14),
+	IRQ_NESTED_THREAD	= (1 << 15),
 
 #ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
-#define IRQ_INPROGRESS		0x00000100	/* DEPRECATED */
-#define IRQ_REPLAY		0x00000200	/* DEPRECATED */
-#define IRQ_WAITING		0x00000400	/* DEPRECATED */
-#define IRQ_DISABLED		0x00000800	/* DEPRECATED */
-#define IRQ_PENDING		0x00001000	/* DEPRECATED */
-#define IRQ_MASKED		0x00002000	/* DEPRECATED */
-/* DEPRECATED use irq_setaffinity_pending() instead*/
-#define IRQ_MOVE_PENDING	0x00004000
-#define IRQ_AFFINITY_SET	0x02000000	/* DEPRECATED */
+	IRQ_INPROGRESS		= (1 << 16),
+	IRQ_REPLAY		= (1 << 17),
+	IRQ_WAITING		= (1 << 18),
+	IRQ_DISABLED		= (1 << 19),
+	IRQ_PENDING		= (1 << 20),
+	IRQ_MASKED		= (1 << 21),
+	IRQ_MOVE_PENDING	= (1 << 22),
+	IRQ_AFFINITY_SET	= (1 << 23),
+	IRQ_WAKEUP		= (1 << 24),
 #endif
-
-#define IRQ_LEVEL		0x00008000	/* IRQ level triggered */
-#define IRQ_PER_CPU		0x00010000	/* IRQ is per CPU */
-#define IRQ_NOPROBE		0x00020000	/* IRQ is not valid for probing */
-#define IRQ_NOREQUEST		0x00040000	/* IRQ cannot be requested */
-#define IRQ_NOAUTOEN		0x00080000	/* IRQ will not be enabled on request irq */
-#define IRQ_WAKEUP		0x00100000	/* IRQ triggers system wakeup */
-#define IRQ_NO_BALANCING	0x00400000	/* IRQ is excluded from balancing */
-#define IRQ_MOVE_PCNTXT		0x01000000	/* IRQ migration from process context */
-#define IRQ_NESTED_THREAD	0x10000000	/* IRQ is nested into another, no own handler thread */
+};
 
 #define IRQF_MODIFY_MASK	\
 	(IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 47bcd3b..55ebe1e 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -15,37 +15,21 @@ enum {
 	_IRQF_MODIFY_MASK	= IRQF_MODIFY_MASK,
 };
 
-#undef IRQ_INPROGRESS
 #define IRQ_INPROGRESS		GOT_YOU_MORON
-#undef IRQ_REPLAY
 #define IRQ_REPLAY		GOT_YOU_MORON
-#undef IRQ_WAITING
 #define IRQ_WAITING		GOT_YOU_MORON
-#undef IRQ_DISABLED
 #define IRQ_DISABLED		GOT_YOU_MORON
-#undef IRQ_PENDING
 #define IRQ_PENDING		GOT_YOU_MORON
-#undef IRQ_MASKED
 #define IRQ_MASKED		GOT_YOU_MORON
-#undef IRQ_WAKEUP
 #define IRQ_WAKEUP		GOT_YOU_MORON
-#undef IRQ_MOVE_PENDING
 #define IRQ_MOVE_PENDING	GOT_YOU_MORON
-#undef IRQ_PER_CPU
 #define IRQ_PER_CPU		GOT_YOU_MORON
-#undef IRQ_NO_BALANCING
 #define IRQ_NO_BALANCING	GOT_YOU_MORON
-#undef IRQ_AFFINITY_SET
 #define IRQ_AFFINITY_SET	GOT_YOU_MORON
-#undef IRQ_LEVEL
 #define IRQ_LEVEL		GOT_YOU_MORON
-#undef IRQ_NOPROBE
 #define IRQ_NOPROBE		GOT_YOU_MORON
-#undef IRQ_NOREQUEST
 #define IRQ_NOREQUEST		GOT_YOU_MORON
-#undef IRQ_NOAUTOEN
 #define IRQ_NOAUTOEN		GOT_YOU_MORON
-#undef IRQ_NESTED_THREAD
 #define IRQ_NESTED_THREAD	GOT_YOU_MORON
 #undef IRQF_MODIFY_MASK
 #define IRQF_MODIFY_MASK	GOT_YOU_MORON
-- 
cgit v0.10.2


From 2bff17ad2107c66fc8ca96501a7128dd7fa7a390 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 13:08:38 +0100
Subject: genirq: Add flags to irq_chip

Looking through irq_chip implementations I noticed that some of them
have special requirements, like setting the type masked and therefor
fiddle in irq_desc->status. Add a flag field, so the core code can
handle it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2e3d1e5..aefb30b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -253,6 +253,7 @@ static inline bool irqd_is_level_type(struct irq_data *d)
  * @irq_set_wake:	enable/disable power-management wake-on of an IRQ
  * @irq_bus_lock:	function to lock access to slow bus (i2c) chips
  * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips
+ * @flags:		chip specific flags
  *
  * @release:		release function solely used by UML
  */
@@ -299,6 +300,8 @@ struct irq_chip {
 	void		(*irq_bus_lock)(struct irq_data *data);
 	void		(*irq_bus_sync_unlock)(struct irq_data *data);
 
+	unsigned long	flags;
+
 	/* Currently used only by UML, might disappear one day.*/
 #ifdef CONFIG_IRQ_RELEASE_METHOD
 	void		(*release)(unsigned int irq, void *dev_id);
-- 
cgit v0.10.2


From d4d5e08960844a062da8387ee5f16ca7a33200d0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 13:16:14 +0100
Subject: genirq: Add IRQCHIP_SET_TYPE_MASKED flag

irq_chips, which require to mask the chip before changing the trigger
type should set this flag. So the core takes care of it and the
requirement for looking into desc->status in the chip goes away.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Walleij <linus.walleij@stericsson.com>
Cc: Lars-Peter Clausen <lars@metafoo.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index aefb30b..ef6b66d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -308,6 +308,15 @@ struct irq_chip {
 #endif
 };
 
+/*
+ * irq_chip specific flags
+ *
+ * IRQCHIP_SET_TYPE_MASKED:		Mask before calling chip.irq_set_type()
+ */
+enum {
+	IRQCHIP_SET_TYPE_MASKED		= (1 <<  0),
+};
+
 /* This include will go away once we isolated irq_desc usage to core code */
 #include <linux/irqdesc.h>
 
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 9e9220d..4687457 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -367,7 +367,7 @@ static inline void mask_ack_irq(struct irq_desc *desc)
 	irq_state_set_masked(desc);
 }
 
-static inline void mask_irq(struct irq_desc *desc)
+void mask_irq(struct irq_desc *desc)
 {
 	if (desc->irq_data.chip->irq_mask) {
 		desc->irq_data.chip->irq_mask(&desc->irq_data);
@@ -375,7 +375,7 @@ static inline void mask_irq(struct irq_desc *desc)
 	}
 }
 
-static inline void unmask_irq(struct irq_desc *desc)
+void unmask_irq(struct irq_desc *desc)
 {
 	if (desc->irq_data.chip->irq_unmask) {
 		desc->irq_data.chip->irq_unmask(&desc->irq_data);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 6776453..1d500fb 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -84,6 +84,8 @@ extern int irq_startup(struct irq_desc *desc);
 extern void irq_shutdown(struct irq_desc *desc);
 extern void irq_enable(struct irq_desc *desc);
 extern void irq_disable(struct irq_desc *desc);
+extern void mask_irq(struct irq_desc *desc);
+extern void unmask_irq(struct irq_desc *desc);
 
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b5de828..50809c7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -554,8 +554,8 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
 int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		      unsigned long flags)
 {
-	int ret;
 	struct irq_chip *chip = desc->irq_data.chip;
+	int ret, unmask = 0;
 
 	if (!chip || !chip->irq_set_type) {
 		/*
@@ -568,6 +568,14 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 	}
 
 	flags &= IRQ_TYPE_SENSE_MASK;
+
+	if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
+		if (!(desc->istate & IRQS_MASKED))
+			mask_irq(desc);
+		if (!(desc->istate & IRQS_DISABLED))
+			unmask = 1;
+	}
+
 	/* caller masked out all except trigger mode flags */
 	ret = chip->irq_set_type(&desc->irq_data, flags);
 
@@ -588,11 +596,13 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 
 		if (chip != desc->irq_data.chip)
 			irq_chip_set_defaults(desc->irq_data.chip);
-		return 0;
+		ret = 0;
 	default:
 		pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
 		       flags, irq, chip->irq_set_type);
 	}
+	if (unmask)
+		unmask_irq(desc);
 	return ret;
 }
 
@@ -669,7 +679,7 @@ again:
 
 #ifdef CONFIG_SMP
 /*
- * Check whether we need to change the affinity of the interrupt thread.
+ * Check whether we need to chasnge the affinity of the interrupt thread.
  */
 static void
 irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
-- 
cgit v0.10.2


From 7f94226f03299f1ca32f118f02f2a0295e0e5e93 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 19:46:26 +0100
Subject: genirq: Move wakeup state to irq_data

Some irq_chips need to know the state of wakeup mode for
setting the trigger type etc. Reflect it in irq_data state.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index ef6b66d..94c8f5b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -168,6 +168,8 @@ struct irq_data {
  * IRQD_PER_CPU			- Interrupt is per cpu
  * IRQD_AFFINITY_SET		- Interrupt affinity was set
  * IRQD_LEVEL			- Interrupt is level triggered
+ * IRQD_WAKEUP_STATE		- Interrupt is configured for wakeup
+ *				  from suspend
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -176,6 +178,7 @@ enum {
 	IRQD_PER_CPU			= (1 << 11),
 	IRQD_AFFINITY_SET		= (1 << 12),
 	IRQD_LEVEL			= (1 << 13),
+	IRQD_WAKEUP_STATE		= (1 << 14),
 };
 
 static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
@@ -217,6 +220,11 @@ static inline bool irqd_is_level_type(struct irq_data *d)
 	return d->state_use_accessors & IRQD_LEVEL;
 }
 
+static inline bool irqd_is_wakeup_set(struct irq_data *d)
+{
+	return d->state_use_accessors & IRQD_WAKEUP_STATE;
+}
+
 /**
  * struct irq_chip - hardware interrupt chip descriptor
  *
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 1d500fb..5e2366da 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -46,7 +46,6 @@ enum {
  * IRQS_PENDING			- irq is pending and replayed later
  * IRQS_MASKED			- irq is masked
  * IRQS_SUSPENDED		- irq is suspended
- * IRQS_WAKEUP			- irq triggers system wakeup from suspend
  */
 enum {
 	IRQS_AUTODETECT		= 0x00000001,
@@ -60,7 +59,6 @@ enum {
 	IRQS_PENDING		= 0x00000200,
 	IRQS_MASKED		= 0x00000400,
 	IRQS_SUSPENDED		= 0x00000800,
-	IRQS_WAKEUP		= 0x00001000,
 };
 
 #include "compat.h"
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 50809c7..ea6add6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -492,7 +492,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
 			if (ret)
 				desc->wake_depth = 0;
 			else
-				desc->istate |= IRQS_WAKEUP;
+				irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
 		}
 	} else {
 		if (desc->wake_depth == 0) {
@@ -502,7 +502,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
 			if (ret)
 				desc->wake_depth = 1;
 			else
-				desc->istate &= ~IRQS_WAKEUP;
+				irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
 		}
 	}
 
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f39383d..1329f0e 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -69,7 +69,7 @@ int check_wakeup_irqs(void)
 	int irq;
 
 	for_each_irq_desc(irq, desc)
-		if ((desc->istate & IRQS_WAKEUP) &&
+		if (irqd_is_wakeup_set(&desc->irq_data) &&
 		    (desc->istate & IRQS_PENDING))
 			return -EBUSY;
 
-- 
cgit v0.10.2


From e1ef824146131709d7466e37f889f2dab24ca98e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 22:25:31 +0100
Subject: genirq: Reflect IRQ_MOVE_PCNTXT in irq_data state

Required by x86.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 94c8f5b..c101ad4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -170,6 +170,8 @@ struct irq_data {
  * IRQD_LEVEL			- Interrupt is level triggered
  * IRQD_WAKEUP_STATE		- Interrupt is configured for wakeup
  *				  from suspend
+ * IRDQ_MOVE_PCNTXT		- Interrupt can be moved in process
+ *				  context
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -179,6 +181,7 @@ enum {
 	IRQD_AFFINITY_SET		= (1 << 12),
 	IRQD_LEVEL			= (1 << 13),
 	IRQD_WAKEUP_STATE		= (1 << 14),
+	IRQD_MOVE_PCNTXT		= (1 << 15),
 };
 
 static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
@@ -225,6 +228,11 @@ static inline bool irqd_is_wakeup_set(struct irq_data *d)
 	return d->state_use_accessors & IRQD_WAKEUP_STATE;
 }
 
+static inline bool irqd_can_move_in_process_context(struct irq_data *d)
+{
+	return d->state_use_accessors & IRQD_MOVE_PCNTXT;
+}
+
 /**
  * struct irq_chip - hardware interrupt chip descriptor
  *
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 4687457..2b0f919 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -712,11 +712,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 	irq_settings_clr_and_set(desc, clr, set);
 
 	irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
-		   IRQD_TRIGGER_MASK | IRQD_LEVEL);
+		   IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
 	if (irq_settings_has_no_balance_set(desc))
 		irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
 	if (irq_settings_is_per_cpu(desc))
 		irqd_set(&desc->irq_data, IRQD_PER_CPU);
+	if (irq_settings_can_move_pcntxt(desc))
+		irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
 
 	irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
 
-- 
cgit v0.10.2


From a6967caf00ebbb2d4acdebcb72a25f2e9ba43fd2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 22:01:25 +0100
Subject: genirq: Remove desc->status when GENERIC_HARDIRQS_NO_COMPAT=y

If everything uses the right accessors, then enabling
GENERIC_HARDIRQS_NO_COMPAT should just work. If not it will tell you.

Don't be lazy and use the trick which I use in the core code!

git grep status_use_accessors

will unearth it in a split second. Offenders are tracked down and not
slapped with stinking trouts. This time we use frozen shark for a
better educational value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 581d9665..36c95f0 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -64,7 +64,11 @@ struct irq_desc {
 	unsigned int __percpu	*kstat_irqs;
 	irq_flow_handler_t	handle_irq;
 	struct irqaction	*action;	/* IRQ action list */
+#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
+	unsigned int		status_use_accessors;
+#else
 	unsigned int		status;		/* IRQ status */
+#endif
 	unsigned int		core_internal_state__do_not_mess_with_it;
 	unsigned int		depth;		/* nested irq disables */
 	unsigned int		wake_depth;	/* nested wake enables */
@@ -164,6 +168,7 @@ static inline int irq_has_action(unsigned int irq)
 	return desc->action != NULL;
 }
 
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
 static inline int irq_balancing_disabled(unsigned int irq)
 {
 	struct irq_desc *desc;
@@ -171,6 +176,7 @@ static inline int irq_balancing_disabled(unsigned int irq)
 	desc = irq_to_desc(irq);
 	return desc->status & IRQ_NO_BALANCING_MASK;
 }
+#endif
 
 /* caller has locked the irq_desc and both params are valid */
 static inline void __set_irq_handler_unlocked(int irq,
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 5e2366da..fd5777a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,6 +15,10 @@
 
 #define istate core_internal_state__do_not_mess_with_it
 
+#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
+# define status status_use_accessors
+#endif
+
 extern int noirqdebug;
 
 /*
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 55ebe1e..0227ad3 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -134,4 +134,5 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
 }
 
 /* Nothing should touch desc->status from now on */
+#undef status
 #define status		USE_THE_PROPER_WRAPPERS_YOU_MORON
-- 
cgit v0.10.2


From 091738a266fc74329ae186f22ff2b3f01319112d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 14 Feb 2011 20:16:43 +0100
Subject: genirq: Remove real old transition functions

These transition helpers are stale for years now. Remove them.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index c8bbc4f..394784c 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -47,12 +47,6 @@ unsigned long probe_irq_on(void)
 		raw_spin_lock_irq(&desc->lock);
 		if (!desc->action && irq_settings_can_probe(desc)) {
 			/*
-			 * An old-style architecture might still have
-			 * the handle_bad_irq handler there:
-			 */
-			compat_irq_chip_set_default_handler(desc);
-
-			/*
 			 * Some chips need to know about probing in
 			 * progress:
 			 */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2b0f919..c19c0b5 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -644,19 +644,11 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		return;
 	}
 
-	if (!handle)
+	if (!handle) {
 		handle = handle_bad_irq;
-	else if (desc->irq_data.chip == &no_irq_chip) {
-		printk(KERN_WARNING "Trying to install %sinterrupt handler "
-		       "for IRQ%d\n", is_chained ? "chained " : "", irq);
-		/*
-		 * Some ARM implementations install a handler for really dumb
-		 * interrupt hardware without setting an irq_chip. This worked
-		 * with the ARM no_irq_chip but the check in setup_irq would
-		 * prevent us to setup the interrupt at all. Switch it to
-		 * dummy_irq_chip for easy transition.
-		 */
-		desc->irq_data.chip = &dummy_irq_chip;
+	} else {
+		if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
+			return;
 	}
 
 	chip_bus_lock(desc);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index fd5777a..f80a774 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -74,9 +74,6 @@ enum {
 /* Set default functions for irq_chip structures: */
 extern void irq_chip_set_defaults(struct irq_chip *chip);
 
-/* Set default handler: */
-extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
-
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		unsigned long flags);
 extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ea6add6..99395a2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -540,17 +540,6 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
 	return !action;
 }
 
-void compat_irq_chip_set_default_handler(struct irq_desc *desc)
-{
-	/*
-	 * If the architecture still has not overriden
-	 * the flow handler then zap the default. This
-	 * should catch incorrect flow-type setting.
-	 */
-	if (desc->handle_irq == &handle_bad_irq)
-		desc->handle_irq = NULL;
-}
-
 int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		      unsigned long flags)
 {
@@ -912,8 +901,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
 			if (ret)
 				goto out_mask;
-		} else
-			compat_irq_chip_set_default_handler(desc);
+		}
 
 		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
 				  IRQS_INPROGRESS | IRQS_ONESHOT | \
-- 
cgit v0.10.2


From d5eb4ad2dfb2dfae43fd51bc8630b4fc3ef00e92 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 12 Feb 2011 12:16:16 +0100
Subject: genirq: Implement irq_get/put_desc_[bus]locked/unlock()

Most of the managing functions get the irq descriptor and lock it -
either with or without buslock. Instead of open coding this over and
over provide a common function to do that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index f80a774..935bec4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -126,6 +126,34 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
 		desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
 }
 
+struct irq_desc *
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
+void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
+
+static inline struct irq_desc *
+irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
+{
+	return __irq_get_desc_lock(irq, flags, true);
+}
+
+static inline void
+irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
+{
+	__irq_put_desc_unlock(desc, flags, true);
+}
+
+static inline struct irq_desc *
+irq_get_desc_lock(unsigned int irq, unsigned long *flags)
+{
+	return __irq_get_desc_lock(irq, flags, false);
+}
+
+static inline void
+irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
+{
+	__irq_put_desc_unlock(desc, flags, false);
+}
+
 /*
  * Manipulation functions for irq_data.state
  */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 3387fbd..394ab6a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -402,6 +402,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
 	return find_next_bit(allocated_irqs, nr_irqs, offset);
 }
 
+struct irq_desc *
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (desc) {
+		if (bus)
+			chip_bus_lock(desc);
+		raw_spin_lock_irqsave(&desc->lock, *flags);
+	}
+	return desc;
+}
+
+void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
+{
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	if (bus)
+		chip_bus_sync_unlock(desc);
+}
+
 /**
  * dynamic_irq_cleanup - cleanup a dynamically allocated irq
  * @irq:	irq number to initialize
-- 
cgit v0.10.2


From 02725e7471b8dd58fa96f6604bdb5dde45405a2e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 12 Feb 2011 10:37:36 +0100
Subject: genirq: Use irq_get/put functions

Convert the management functions to use the common irq_get/put
function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c19c0b5..3ea6aec 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -25,22 +25,18 @@
  */
 int irq_set_chip(unsigned int irq, struct irq_chip *chip)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
 
-	if (!desc) {
-		WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+	if (!desc)
 		return -EINVAL;
-	}
 
 	if (!chip)
 		chip = &no_irq_chip;
 
-	raw_spin_lock_irqsave(&desc->lock, flags);
 	irq_chip_set_defaults(chip);
 	desc->irq_data.chip = chip;
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
 EXPORT_SYMBOL(irq_set_chip);
@@ -52,24 +48,17 @@ EXPORT_SYMBOL(irq_set_chip);
  */
 int irq_set_irq_type(unsigned int irq, unsigned int type)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
-	int ret = -ENXIO;
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	int ret = 0;
 
-	if (!desc) {
-		printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
-		return -ENODEV;
-	}
+	if (!desc)
+		return -EINVAL;
 
 	type &= IRQ_TYPE_SENSE_MASK;
-	if (type == IRQ_TYPE_NONE)
-		return 0;
-
-	chip_bus_lock(desc);
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	ret = __irq_set_trigger(desc, irq, type);
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	chip_bus_sync_unlock(desc);
+	if (type != IRQ_TYPE_NONE)
+		ret = __irq_set_trigger(desc, irq, type);
+	irq_put_desc_busunlock(desc, flags);
 	return ret;
 }
 EXPORT_SYMBOL(irq_set_irq_type);
@@ -83,18 +72,13 @@ EXPORT_SYMBOL(irq_set_irq_type);
  */
 int irq_set_handler_data(unsigned int irq, void *data)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
 
-	if (!desc) {
-		printk(KERN_ERR
-		       "Trying to install controller data for IRQ%d\n", irq);
+	if (!desc)
 		return -EINVAL;
-	}
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
 	desc->irq_data.handler_data = data;
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
 EXPORT_SYMBOL(irq_set_handler_data);
@@ -108,20 +92,15 @@ EXPORT_SYMBOL(irq_set_handler_data);
  */
 int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
 
-	if (!desc) {
-		printk(KERN_ERR
-		       "Trying to install msi data for IRQ%d\n", irq);
+	if (!desc)
 		return -EINVAL;
-	}
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
 	desc->irq_data.msi_desc = entry;
 	if (entry)
 		entry->irq = irq;
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
 
@@ -134,24 +113,13 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
  */
 int irq_set_chip_data(unsigned int irq, void *data)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
 
-	if (!desc) {
-		printk(KERN_ERR
-		       "Trying to install chip data for IRQ%d\n", irq);
-		return -EINVAL;
-	}
-
-	if (!desc->irq_data.chip) {
-		printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
+	if (!desc)
 		return -EINVAL;
-	}
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
 	desc->irq_data.chip_data = data;
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
 EXPORT_SYMBOL(irq_set_chip_data);
@@ -635,25 +603,19 @@ void
 __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		  const char *name)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
 
-	if (!desc) {
-		printk(KERN_ERR
-		       "Trying to install type control for IRQ%d\n", irq);
+	if (!desc)
 		return;
-	}
 
 	if (!handle) {
 		handle = handle_bad_irq;
 	} else {
 		if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
-			return;
+			goto out;
 	}
 
-	chip_bus_lock(desc);
-	raw_spin_lock_irqsave(&desc->lock, flags);
-
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
 		if (desc->irq_data.chip != &no_irq_chip)
@@ -670,8 +632,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		irq_settings_set_norequest(desc);
 		irq_startup(desc);
 	}
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	chip_bus_sync_unlock(desc);
+out:
+	irq_put_desc_busunlock(desc, flags);
 }
 EXPORT_SYMBOL_GPL(__set_irq_handler);
 
@@ -693,14 +655,11 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
 
 void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
 
 	if (!desc)
 		return;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
-
 	irq_settings_clr_and_set(desc, clr, set);
 
 	irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
@@ -714,5 +673,5 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 
 	irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
 
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	irq_put_desc_unlock(desc, flags);
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 99395a2..6cca195 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -172,16 +172,13 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
 
 	if (!desc)
 		return -EINVAL;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
 	desc->affinity_hint = m;
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
@@ -336,6 +333,18 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
 		irq_disable(desc);
 }
 
+static int __disable_irq_nosync(unsigned int irq)
+{
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+
+	if (!desc)
+		return -EINVAL;
+	__disable_irq(desc, irq, false);
+	irq_put_desc_busunlock(desc, flags);
+	return 0;
+}
+
 /**
  *	disable_irq_nosync - disable an irq without waiting
  *	@irq: Interrupt to disable
@@ -349,17 +358,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
  */
 void disable_irq_nosync(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-	unsigned long flags;
-
-	if (!desc)
-		return;
-
-	chip_bus_lock(desc);
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	__disable_irq(desc, irq, false);
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	chip_bus_sync_unlock(desc);
+	__disable_irq_nosync(irq);
 }
 EXPORT_SYMBOL(disable_irq_nosync);
 
@@ -377,13 +376,7 @@ EXPORT_SYMBOL(disable_irq_nosync);
  */
 void disable_irq(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-
-	if (!desc)
-		return;
-
-	disable_irq_nosync(irq);
-	if (desc->action)
+	if (!__disable_irq_nosync(irq))
 		synchronize_irq(irq);
 }
 EXPORT_SYMBOL(disable_irq);
@@ -434,21 +427,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
  */
 void enable_irq(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
 
 	if (!desc)
 		return;
-
 	if (WARN(!desc->irq_data.chip,
 		 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
-		return;
+		goto out;
 
-	chip_bus_lock(desc);
-	raw_spin_lock_irqsave(&desc->lock, flags);
 	__enable_irq(desc, irq, false);
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	chip_bus_sync_unlock(desc);
+out:
+	irq_put_desc_busunlock(desc, flags);
 }
 EXPORT_SYMBOL(enable_irq);
 
@@ -477,15 +467,13 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
  */
 int irq_set_irq_wake(unsigned int irq, unsigned int on)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
 	int ret = 0;
 
 	/* wakeup-capable irqs can be shared between drivers that
 	 * don't need to have the same sleep mode behaviors.
 	 */
-	chip_bus_lock(desc);
-	raw_spin_lock_irqsave(&desc->lock, flags);
 	if (on) {
 		if (desc->wake_depth++ == 0) {
 			ret = set_irq_wake_real(irq, on);
@@ -505,9 +493,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
 				irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
 		}
 	}
-
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	chip_bus_sync_unlock(desc);
+	irq_put_desc_busunlock(desc, flags);
 	return ret;
 }
 EXPORT_SYMBOL(irq_set_irq_wake);
@@ -519,25 +505,20 @@ EXPORT_SYMBOL(irq_set_irq_wake);
  */
 int can_request_irq(unsigned int irq, unsigned long irqflags)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-	struct irqaction *action;
 	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	int canrequest = 0;
 
 	if (!desc)
 		return 0;
 
-	if (!irq_settings_can_request(desc))
-		return 0;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	action = desc->action;
-	if (action)
-		if (irqflags & action->flags & IRQF_SHARED)
-			action = NULL;
-
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-
-	return !action;
+	if (irq_settings_can_request(desc)) {
+		if (desc->action)
+			if (irqflags & desc->action->flags & IRQF_SHARED)
+				canrequest =1;
+	}
+	irq_put_desc_unlock(desc, flags);
+	return canrequest;
 }
 
 int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
-- 
cgit v0.10.2


From 3836ca08aad4575c120ccf328652f3873eea9063 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 14 Feb 2011 20:09:19 +0100
Subject: genirq: Consolidate set_chip_handler functions

No need to have separate functions if we have one plus inline wrappers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c101ad4..3e29e2f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -398,23 +398,23 @@ extern struct irq_chip no_irq_chip;
 extern struct irq_chip dummy_irq_chip;
 
 extern void
-set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
-			 irq_flow_handler_t handle);
-extern void
-set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
 			      irq_flow_handler_t handle, const char *name);
 
+static inline void irq_set_chip_and_handler(unsigned int irq, struct irq_chip *chip,
+					    irq_flow_handler_t handle)
+{
+	irq_set_chip_and_handler_name(irq, chip, handle, NULL);
+}
+
 extern void
-__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		  const char *name);
 
-/*
- * Set a highlevel flow handler for a given IRQ:
- */
 static inline void
-set_irq_handler(unsigned int irq, irq_flow_handler_t handle)
+irq_set_handler(unsigned int irq, irq_flow_handler_t handle)
 {
-	__set_irq_handler(irq, handle, 0, NULL);
+	__irq_set_handler(irq, handle, 0, NULL);
 }
 
 /*
@@ -423,9 +423,9 @@ set_irq_handler(unsigned int irq, irq_flow_handler_t handle)
  *  IRQ_NOREQUEST and IRQ_NOPROBE)
  */
 static inline void
-set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle)
+irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle)
 {
-	__set_irq_handler(irq, handle, 1, NULL);
+	__irq_set_handler(irq, handle, 1, NULL);
 }
 
 void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set);
@@ -579,6 +579,33 @@ static inline void set_irq_nested_thread(unsigned int irq, int nest)
 {
 	irq_set_nested_thread(irq, nest);
 }
+static inline void
+set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+			      irq_flow_handler_t handle, const char *name)
+{
+	irq_set_chip_and_handler_name(irq, chip, handle, name);
+}
+static inline void
+set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
+			 irq_flow_handler_t handle)
+{
+	irq_set_chip_and_handler(irq, chip, handle);
+}
+static inline void
+__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+		  const char *name)
+{
+	__irq_set_handler(irq, handle, is_chained, name);
+}
+static inline void set_irq_handler(unsigned int irq, irq_flow_handler_t handle)
+{
+	irq_set_handler(irq, handle);
+}
+static inline void
+set_irq_chained_handler(unsigned int irq, irq_flow_handler_t handle)
+{
+	irq_set_chained_handler(irq, handle);
+}
 #endif
 
 int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3ea6aec..2a36038 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -600,7 +600,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 }
 
 void
-__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		  const char *name)
 {
 	unsigned long flags;
@@ -635,22 +635,14 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 out:
 	irq_put_desc_busunlock(desc, flags);
 }
-EXPORT_SYMBOL_GPL(__set_irq_handler);
+EXPORT_SYMBOL_GPL(__irq_set_handler);
 
 void
-set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
-			 irq_flow_handler_t handle)
-{
-	irq_set_chip(irq, chip);
-	__set_irq_handler(irq, handle, 0, NULL);
-}
-
-void
-set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
 			      irq_flow_handler_t handle, const char *name)
 {
 	irq_set_chip(irq, chip);
-	__set_irq_handler(irq, handle, 0, name);
+	__irq_set_handler(irq, handle, 0, name);
 }
 
 void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
-- 
cgit v0.10.2


From 781295762defc709a609efc01d8bb065276cd9a2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 15:14:20 +0100
Subject: genirq: Add preflow handler support

sparc64 needs to call a preflow handler on certain interrupts befor
calling the action chain. Integrate it into handle_fasteoi_irq. Must
be enabled via CONFIG_IRQ_FASTEOI_PREFLOW. No impact when disabled.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S. Miller <davem@davemloft.net>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3e29e2f..3639097 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -29,9 +29,10 @@
 #include <asm/irq_regs.h>
 
 struct irq_desc;
+struct irq_data;
 typedef	void (*irq_flow_handler_t)(unsigned int irq,
 					    struct irq_desc *desc);
-
+typedef	void (*irq_preflow_handler_t)(struct irq_data *data);
 
 /*
  * IRQ line status.
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 36c95f0..2f87d64 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -63,6 +63,9 @@ struct irq_desc {
 	struct timer_rand_state *timer_rand_state;
 	unsigned int __percpu	*kstat_irqs;
 	irq_flow_handler_t	handle_irq;
+#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
+	irq_preflow_handler_t	preflow_handler;
+#endif
 	struct irqaction	*action;	/* IRQ action list */
 #ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
 	unsigned int		status_use_accessors;
@@ -187,6 +190,17 @@ static inline void __set_irq_handler_unlocked(int irq,
 	desc = irq_to_desc(irq);
 	desc->handle_irq = handler;
 }
+
+#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
+static inline void
+__irq_set_preflow_handler(unsigned int irq, irq_preflow_handler_t handler)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	desc->preflow_handler = handler;
+}
+#endif
 #endif
 
 #endif
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 48ad25f..9149be7 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -35,6 +35,9 @@ config AUTO_IRQ_AFFINITY
 config HARDIRQS_SW_RESEND
        def_bool n
 
+config IRQ_PREFLOW_FASTEOI
+       def_bool n
+
 config SPARSE_IRQ
 	bool "Support sparse irq numbering"
 	depends on HAVE_SPARSE_IRQ
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2a36038..08be5d1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -471,6 +471,16 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(handle_level_irq);
 
+#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
+static inline void preflow_handler(struct irq_desc *desc)
+{
+	if (desc->preflow_handler)
+		desc->preflow_handler(&desc->irq_data);
+}
+#else
+static inline void preflow_handler(struct irq_desc *desc) { }
+#endif
+
 /**
  *	handle_fasteoi_irq - irq handler for transparent controllers
  *	@irq:	the interrupt number
@@ -503,6 +513,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 		mask_irq(desc);
 		goto out;
 	}
+	preflow_handler(desc);
 	handle_irq_event(desc);
 out:
 	desc->irq_data.chip->irq_eoi(&desc->irq_data);
-- 
cgit v0.10.2


From 77694b408abb8f92195ad5ed6ce5492f1d794c77 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 15 Feb 2011 10:33:57 +0100
Subject: genirq; Add fasteoi irq_chip quirk

Some chips want irq_eoi() only called when an interrupt is actually
handled. So they have checks for INPROGRESS and DISABLED in their
irq_eoi callbacks. Add a chip flag, which allows to handle that in the
generic code. No impact on the fastpath.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3639097..ea2970c 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -328,10 +328,12 @@ struct irq_chip {
 /*
  * irq_chip specific flags
  *
- * IRQCHIP_SET_TYPE_MASKED:		Mask before calling chip.irq_set_type()
+ * IRQCHIP_SET_TYPE_MASKED:	Mask before calling chip.irq_set_type()
+ * IRQCHIP_EOI_IF_HANDLED:	Only issue irq_eoi() when irq was handled
  */
 enum {
 	IRQCHIP_SET_TYPE_MASKED		= (1 <<  0),
+	IRQCHIP_EOI_IF_HANDLED		= (1 <<  1),
 };
 
 /* This include will go away once we isolated irq_desc usage to core code */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 08be5d1..1d3e25e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -515,9 +515,16 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 	}
 	preflow_handler(desc);
 	handle_irq_event(desc);
-out:
+
+out_eoi:
 	desc->irq_data.chip->irq_eoi(&desc->irq_data);
+out_unlock:
 	raw_spin_unlock(&desc->lock);
+	return;
+out:
+	if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
+		goto out_eoi;
+	goto out_unlock;
 }
 
 /**
-- 
cgit v0.10.2


From a439520f8b18917b322f576be04c54aba84bb044 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 4 Feb 2011 18:46:16 +0100
Subject: genirq: Implement irq_data based move_*_irq() versions

No need to lookup the irq descriptor when calling from a chip callback
function which has irq_data already handy.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index ea2970c..ff62d01 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -363,9 +363,13 @@ extern void remove_irq(unsigned int irq, struct irqaction *act);
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
 void move_native_irq(int irq);
 void move_masked_irq(int irq);
+void irq_move_irq(struct irq_data *data);
+void irq_move_masked_irq(struct irq_data *data);
 #else
 static inline void move_native_irq(int irq) { }
 static inline void move_masked_irq(int irq) { }
+static inline void irq_move_irq(struct irq_data *data) { }
+static inline void irq_move_masked_irq(struct irq_data *data) { }
 #endif
 
 extern int no_irq_affinity;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 7a93c6b..ec4806d 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,10 +4,10 @@
 
 #include "internals.h"
 
-void move_masked_irq(int irq)
+void irq_move_masked_irq(struct irq_data *idata)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-	struct irq_chip *chip = desc->irq_data.chip;
+	struct irq_desc *desc = irq_data_to_desc(idata);
+	struct irq_chip *chip = idata->chip;
 
 	if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
 		return;
@@ -53,12 +53,17 @@ void move_masked_irq(int irq)
 	cpumask_clear(desc->pending_mask);
 }
 
-void move_native_irq(int irq)
+void move_masked_irq(int irq)
+{
+	irq_move_masked_irq(irq_get_irq_data(irq));
+}
+
+void irq_move_irq(struct irq_data *idata)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc = irq_data_to_desc(idata);
 	bool masked;
 
-	if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
+	if (likely(!irqd_is_setaffinity_pending(idata)))
 		return;
 
 	if (unlikely(desc->istate & IRQS_DISABLED))
@@ -71,8 +76,13 @@ void move_native_irq(int irq)
 	 */
 	masked = desc->istate & IRQS_MASKED;
 	if (!masked)
-		desc->irq_data.chip->irq_mask(&desc->irq_data);
-	move_masked_irq(irq);
+		idata->chip->irq_mask(idata);
+	irq_move_masked_irq(idata);
 	if (!masked)
-		desc->irq_data.chip->irq_unmask(&desc->irq_data);
+		idata->chip->irq_unmask(idata);
+}
+
+void move_native_irq(int irq)
+{
+	irq_move_irq(irq_get_irq_data(irq));
 }
-- 
cgit v0.10.2


From 2b15cd96e5e93f9aa4f7041c91c1e7c344a62cbb Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Fri, 18 Feb 2011 16:47:36 +0100
Subject: x86, system.h: Drop unused __SAVE/__RESTORE macros

Those are unused since at least the beginning of git history.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1298044056-31104-1-git-send-email-bp@amd64.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 33ecc3e..12569e6 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -98,8 +98,6 @@ do {									\
  */
 #define HAVE_DISABLE_HLT
 #else
-#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
-#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
 
 /* frame pointer must be last for get_wchan */
 #define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-- 
cgit v0.10.2


From 1396fa9cd2e34669253b7ca8c75f12103481f71c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Fri, 18 Feb 2011 12:17:16 +0300
Subject: x86, microcode, AMD: Fix signedness bug in generic_load_microcode()

install_equiv_cpu_table() returns type int.  It uses negative
error codes so using an unsigned type breaks the error handling.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Cc: open list:AMD MICROCODE UPD... <amd64-microcode@amd64.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
LKML-Reference: <20110218091716.GA4384@bicker>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 9fb8405..c561038 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -246,7 +246,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	struct microcode_header_amd *mc_hdr = NULL;
 	unsigned int mc_size, leftover;
-	unsigned long offset;
+	int offset;
 	const u8 *ucode_ptr = data;
 	void *new_mc = NULL;
 	unsigned int new_rev = uci->cpu_sig.rev;
-- 
cgit v0.10.2


From 69efcc6d90d234a3a076afb2c635c1609536faa4 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 21 Feb 2011 10:58:13 +0100
Subject: x86-64, NUMA: Do not scan two times for setup_node_bootmem()

By the time setup_node_bootmem() is called, all the memblocks are
already registered.  As node_data is allocated from these memblocks,
calling it more than once doesn't make any difference.  Drop the loop.

tj: Dropped comment referencing to the old behavior as suggested by
    David and rephrased the description.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index f6d85e3..6e4ee96 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -480,7 +480,7 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
 
 static int __init numa_register_memblks(struct numa_meminfo *mi)
 {
-	int i, j, nid;
+	int i, nid;
 
 	/* Account for nodes with cpus and no memory */
 	node_possible_map = numa_nodes_parsed;
@@ -506,28 +506,20 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 
 	init_memory_mapping_high();
 
-	/*
-	 * Finally register nodes.  Do it twice in case setup_node_bootmem
-	 * missed one due to missing bootmem.
-	 */
-	for (i = 0; i < 2; i++) {
-		for_each_node_mask(nid, node_possible_map) {
-			u64 start = (u64)max_pfn << PAGE_SHIFT;
-			u64 end = 0;
+	/* Finally register nodes. */
+	for_each_node_mask(nid, node_possible_map) {
+		u64 start = (u64)max_pfn << PAGE_SHIFT;
+		u64 end = 0;
 
-			if (node_online(nid))
+		for (i = 0; i < mi->nr_blks; i++) {
+			if (nid != mi->blk[i].nid)
 				continue;
-
-			for (j = 0; j < mi->nr_blks; j++) {
-				if (nid != mi->blk[j].nid)
-					continue;
-				start = min(mi->blk[j].start, start);
-				end = max(mi->blk[j].end, end);
-			}
-
-			if (start < end)
-				setup_node_bootmem(nid, start, end);
+			start = min(mi->blk[i].start, start);
+			end = max(mi->blk[i].end, end);
 		}
+
+		if (start < end)
+			setup_node_bootmem(nid, start, end);
 	}
 
 	return 0;
-- 
cgit v0.10.2


From a61d825808a0ce9935afebc225dcd602d5339e14 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 21 Feb 2011 12:54:34 +0100
Subject: genirq: Fix misplaced status update in irq_disable()

We lazy disable interrupt lines, so only mark the line masked, when
the chip provides an irq_disable callback.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 1d3e25e..b514565 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -199,8 +199,8 @@ void irq_disable(struct irq_desc *desc)
 	irq_state_set_disabled(desc);
 	if (desc->irq_data.chip->irq_disable) {
 		desc->irq_data.chip->irq_disable(&desc->irq_data);
+		irq_state_set_masked(desc);
 	}
-	irq_state_set_masked(desc);
 }
 
 #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-- 
cgit v0.10.2


From ed4dea6e0e33a3e58d8b77b775a8f0e433e7a005 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 19 Feb 2011 11:07:37 -0800
Subject: genirq: Use IRQ_BITMAP_BITS as search size in irq_alloc_descs()

The runtime expansion of nr_irqs does not take into account that
bitmap_find_next_zero_area() returns "start" + size in case the search
for an matching zero area fails. That results in a start value which
can be completely off and is not covered by the following
expand_nr_irqs() and possibly outside of the absolute limit. But we
use it without further checking.

Use IRQ_BITMAP_BITS as the limit for the bitmap search and expand
nr_irqs when the start bit is beyond nr_irqs. So start is always
pointing to the correct area in the bitmap. nr_irqs is just the limit
for irq enumerations, not the real limit for the irq space.

[ tglx: Let irq_expand_nr_irqs() take the new upper end so we do not
  	expand nr_irqs more than necessary. Made changelog readable ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D6014F9.8040605@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 394ab6a..dbccc79 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -207,11 +207,11 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
 	return NULL;
 }
 
-static int irq_expand_nr_irqs(unsigned int cnt)
+static int irq_expand_nr_irqs(unsigned int nr)
 {
-	if (nr_irqs + cnt > IRQ_BITMAP_BITS)
+	if (nr > IRQ_BITMAP_BITS)
 		return -ENOMEM;
-	nr_irqs += cnt;
+	nr_irqs = nr;
 	return 0;
 }
 
@@ -298,7 +298,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
 	return start;
 }
 
-static int irq_expand_nr_irqs(unsigned int cnt)
+static int irq_expand_nr_irqs(unsigned int nr)
 {
 	return -ENOMEM;
 }
@@ -346,13 +346,14 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
 
 	mutex_lock(&sparse_irq_lock);
 
-	start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+	start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
+					   from, cnt, 0);
 	ret = -EEXIST;
 	if (irq >=0 && start != irq)
 		goto err;
 
-	if (start >= nr_irqs) {
-		ret = irq_expand_nr_irqs(cnt);
+	if (start + cnt > nr_irqs) {
+		ret = irq_expand_nr_irqs(start + cnt);
 		if (ret)
 			goto err;
 	}
-- 
cgit v0.10.2


From 8fff39e06987492da3d4a0b9ec7cdbd245b6762b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 21 Feb 2011 14:19:42 +0100
Subject: genirq: Add missing break in __irq_set_trigger()

The switch case in __irq_set_trigger() lacks a break, which emits a
pr_err unconditionally on success.

Reported-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6cca195..01f8a95 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -567,6 +567,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		if (chip != desc->irq_data.chip)
 			irq_chip_set_defaults(desc->irq_data.chip);
 		ret = 0;
+		break;
 	default:
 		pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
 		       flags, irq, chip->irq_set_type);
-- 
cgit v0.10.2


From e06383db9ec591696a06654257474b85bac1f8cb Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 14 Dec 2010 19:37:07 -0800
Subject: hrtimers: extend hrtimer base code to handle more then 2 clockids
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hrtimer code is written mainly with CLOCK_REALTIME and CLOCK_MONOTONIC
in mind. These are clockids 0 and 1 resepctively. However, if we are
to introduce any new hrtimer bases, using new clockids, we have to skip
the cputimers (clockids 2,3) as well as other clockids that may not impelement
timers.

This patch adds a little bit of indirection between the clockid and
the base, so that we can extend the base by one when we add
a new clockid at number 7 or so.

CC: Jamie Lokier <jamie@shareable.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alexander Shishkin <virtuoso@slind.org>
CC: Arve Hjønnevåg <arve@android.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f376ddc..20b8e66 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -148,7 +148,11 @@ struct hrtimer_clock_base {
 #endif
 };
 
-#define HRTIMER_MAX_CLOCK_BASES 2
+enum  hrtimer_base_type {
+	HRTIMER_BASE_REALTIME,
+	HRTIMER_BASE_MONOTONIC,
+	HRTIMER_MAX_CLOCK_BASES,
+};
 
 /*
  * struct hrtimer_cpu_base - the per cpu clock bases
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 57c4d33..ca99e24 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
 /*
  * The timer bases:
  *
- * Note: If we want to add new timer bases, we have to skip the two
- * clock ids captured by the cpu-timers. We do this by holding empty
- * entries rather than doing math adjustment of the clock ids.
- * This ensures that we capture erroneous accesses to these clock ids
- * rather than moving them into the range of valid clock id's.
+ * There are more clockids then hrtimer bases. Thus, we index
+ * into the timer bases by the hrtimer_base_type enum. When trying
+ * to reach a base using a clockid, hrtimer_clockid_to_base()
+ * is used to convert from clockid to the proper hrtimer_base_type.
  */
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
@@ -77,6 +76,14 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 	}
 };
 
+static int hrtimer_clock_to_base_table[MAX_CLOCKS];
+
+static inline int hrtimer_clockid_to_base(clockid_t clock_id)
+{
+	return hrtimer_clock_to_base_table[clock_id];
+}
+
+
 /*
  * Get the coarse grained time at the softirq based on xtime and
  * wall_to_monotonic.
@@ -90,8 +97,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 
 	xtim = timespec_to_ktime(xts);
 	tomono = timespec_to_ktime(tom);
-	base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
-	base->clock_base[CLOCK_MONOTONIC].softirq_time =
+	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
+	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time =
 		ktime_add(xtim, tomono);
 }
 
@@ -179,10 +186,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 	struct hrtimer_cpu_base *new_cpu_base;
 	int this_cpu = smp_processor_id();
 	int cpu = hrtimer_get_target(this_cpu, pinned);
+	int basenum = hrtimer_clockid_to_base(base->index);
 
 again:
 	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
-	new_base = &new_cpu_base->clock_base[base->index];
+	new_base = &new_cpu_base->clock_base[basenum];
 
 	if (base != new_base) {
 		/*
@@ -618,7 +626,7 @@ static void retrigger_next_event(void *arg)
 
 	/* Adjust CLOCK_REALTIME offset */
 	raw_spin_lock(&base->lock);
-	base->clock_base[CLOCK_REALTIME].offset =
+	base->clock_base[HRTIMER_BASE_REALTIME].offset =
 		timespec_to_ktime(realtime_offset);
 
 	hrtimer_force_reprogram(base, 0);
@@ -716,8 +724,8 @@ static int hrtimer_switch_to_hres(void)
 		return 0;
 	}
 	base->hres_active = 1;
-	base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
-	base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
+	base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
+	base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
 
 	tick_setup_sched_timer();
 
@@ -1112,6 +1120,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 			   enum hrtimer_mode mode)
 {
 	struct hrtimer_cpu_base *cpu_base;
+	int base;
 
 	memset(timer, 0, sizeof(struct hrtimer));
 
@@ -1120,7 +1129,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 	if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
 		clock_id = CLOCK_MONOTONIC;
 
-	timer->base = &cpu_base->clock_base[clock_id];
+	base = hrtimer_clockid_to_base(clock_id);
+	timer->base = &cpu_base->clock_base[base];
 	hrtimer_init_timer_hres(timer);
 	timerqueue_init(&timer->node);
 
@@ -1156,9 +1166,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
 int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 {
 	struct hrtimer_cpu_base *cpu_base;
+	int base = hrtimer_clockid_to_base(which_clock);
 
 	cpu_base = &__raw_get_cpu_var(hrtimer_bases);
-	*tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
+	*tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
 
 	return 0;
 }
@@ -1705,6 +1716,9 @@ static struct notifier_block __cpuinitdata hrtimers_nb = {
 
 void __init hrtimers_init(void)
 {
+	hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME;
+	hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC;
+
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
-- 
cgit v0.10.2


From abb3a4ea2e0ea7114a4475745da2f32bd9ad5b73 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Mon, 14 Feb 2011 17:52:09 -0800
Subject: time: Introduce get_monotonic_boottime and ktime_get_boottime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds new functions that return the monotonic time since boot
(in other words, CLOCK_MONOTONIC + suspend time).

CC: Jamie Lokier <jamie@shareable.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alexander Shishkin <virtuoso@slind.org>
CC: Arve Hjønnevåg <arve@android.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 20b8e66..7a9e7ee 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -312,6 +312,7 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 
 extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_real(void);
+extern ktime_t ktime_get_boottime(void);
 
 
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
diff --git a/include/linux/time.h b/include/linux/time.h
index 379b903..fa39150 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -161,6 +161,7 @@ extern void getnstime_raw_and_real(struct timespec *ts_raw,
 		struct timespec *ts_real);
 extern void getboottime(struct timespec *ts);
 extern void monotonic_to_bootbased(struct timespec *ts);
+extern void get_monotonic_boottime(struct timespec *ts);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 extern int timekeeping_valid_for_hres(void);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6262c1d..5fbd9aa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -907,7 +907,7 @@ static void update_wall_time(void)
  * getboottime - Return the real time of system boot.
  * @ts:		pointer to the timespec to be set
  *
- * Returns the time of day in a timespec.
+ * Returns the wall-time of boot in a timespec.
  *
  * This is based on the wall_to_monotonic offset and the total suspend
  * time. Calls to settimeofday will affect the value returned (which
@@ -925,6 +925,55 @@ void getboottime(struct timespec *ts)
 }
 EXPORT_SYMBOL_GPL(getboottime);
 
+
+/**
+ * get_monotonic_boottime - Returns monotonic time since boot
+ * @ts:		pointer to the timespec to be set
+ *
+ * Returns the monotonic time since boot in a timespec.
+ *
+ * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
+ * includes the time spent in suspend.
+ */
+void get_monotonic_boottime(struct timespec *ts)
+{
+	struct timespec tomono, sleep;
+	unsigned int seq;
+	s64 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		*ts = xtime;
+		tomono = wall_to_monotonic;
+		sleep = total_sleep_time;
+		nsecs = timekeeping_get_ns();
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
+			ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
+}
+EXPORT_SYMBOL_GPL(get_monotonic_boottime);
+
+/**
+ * ktime_get_boottime - Returns monotonic time since boot in a ktime
+ *
+ * Returns the monotonic time since boot in a ktime
+ *
+ * This is similar to CLOCK_MONTONIC/ktime_get, but also
+ * includes the time spent in suspend.
+ */
+ktime_t ktime_get_boottime(void)
+{
+	struct timespec ts;
+
+	get_monotonic_boottime(&ts);
+	return timespec_to_ktime(ts);
+}
+EXPORT_SYMBOL_GPL(ktime_get_boottime);
+
 /**
  * monotonic_to_bootbased - Convert the monotonic time to boot based.
  * @ts:		pointer to the timespec to be converted
-- 
cgit v0.10.2


From 314ac37150011ebb398f522db528d2dbcc611189 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Mon, 14 Feb 2011 18:43:08 -0800
Subject: time: Extend get_xtime_and_monotonic_offset() to also return sleep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend get_xtime_and_monotonic_offset to
get_xtime_and_monotonic_and_sleep_offset().

CC: Jamie Lokier <jamie@shareable.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alexander Shishkin <virtuoso@slind.org>
CC: Arve Hjønnevåg <arve@android.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/include/linux/time.h b/include/linux/time.h
index fa39150..02d48fb 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -124,7 +124,8 @@ unsigned long get_seconds(void);
 struct timespec current_kernel_time(void);
 struct timespec __current_kernel_time(void); /* does not take xtime_lock */
 struct timespec get_monotonic_coarse(void);
-void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom);
+void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
+				struct timespec *wtom, struct timespec *sleep);
 
 #define CURRENT_TIME		(current_kernel_time())
 #define CURRENT_TIME_SEC	((struct timespec) { get_seconds(), 0 })
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ca99e24..e8bf3ad 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -91,9 +91,9 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 {
 	ktime_t xtim, tomono;
-	struct timespec xts, tom;
+	struct timespec xts, tom, slp;
 
-	get_xtime_and_monotonic_offset(&xts, &tom);
+	get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
 
 	xtim = timespec_to_ktime(xts);
 	tomono = timespec_to_ktime(tom);
@@ -614,12 +614,13 @@ static int hrtimer_reprogram(struct hrtimer *timer,
 static void retrigger_next_event(void *arg)
 {
 	struct hrtimer_cpu_base *base;
-	struct timespec realtime_offset, wtm;
+	struct timespec realtime_offset, wtm, sleep;
 
 	if (!hrtimer_hres_active())
 		return;
 
-	get_xtime_and_monotonic_offset(&realtime_offset, &wtm);
+	get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
+							&sleep);
 	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
 
 	base = &__get_cpu_var(hrtimer_bases);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5fbd9aa..3bd7e3d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1040,11 +1040,14 @@ void do_timer(unsigned long ticks)
 }
 
 /**
- * get_xtime_and_monotonic_offset() - get xtime and wall_to_monotonic
+ * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
+ *    and sleep offsets.
  * @xtim:	pointer to timespec to be set with xtime
  * @wtom:	pointer to timespec to be set with wall_to_monotonic
+ * @sleep:	pointer to timespec to be set with time in suspend
  */
-void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom)
+void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
+				struct timespec *wtom, struct timespec *sleep)
 {
 	unsigned long seq;
 
@@ -1052,6 +1055,7 @@ void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom
 		seq = read_seqbegin(&xtime_lock);
 		*xtim = xtime;
 		*wtom = wall_to_monotonic;
+		*sleep = total_sleep_time;
 	} while (read_seqretry(&xtime_lock, seq));
 }
 
-- 
cgit v0.10.2


From 70a08cca1227dc31c784ec930099a4417a06e7d0 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 15 Feb 2011 10:45:16 -0800
Subject: timers: Add CLOCK_BOOTTIME hrtimer base
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CLOCK_MONOTONIC stops while the system is in suspend. This is because
to applications system suspend is invisible. However, there is a
growing set of applications that are wanting to be suspend-aware,
but do not want to deal with the complications of CLOCK_REALTIME
(which might jump around if settimeofday is called).

For these applications, I propose a new clockid: CLOCK_BOOTTIME.
CLOCK_BOOTTIME is idential to CLOCK_MONOTONIC, except it also
includes any time spent in suspend.

This patch add hrtimer base for CLOCK_BOOTTIME, using
get_monotonic_boottime/ktime_get_boottime, to allow
in kernel users to set timers against.

CC: Jamie Lokier <jamie@shareable.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alexander Shishkin <virtuoso@slind.org>
CC: Arve Hjønnevåg <arve@android.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 7a9e7ee..6bc1804 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -151,6 +151,7 @@ struct hrtimer_clock_base {
 enum  hrtimer_base_type {
 	HRTIMER_BASE_REALTIME,
 	HRTIMER_BASE_MONOTONIC,
+	HRTIMER_BASE_BOOTTIME,
 	HRTIMER_MAX_CLOCK_BASES,
 };
 
diff --git a/include/linux/time.h b/include/linux/time.h
index 02d48fb..454a262 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -293,6 +293,7 @@ struct itimerval {
 #define CLOCK_MONOTONIC_RAW		4
 #define CLOCK_REALTIME_COARSE		5
 #define CLOCK_MONOTONIC_COARSE		6
+#define CLOCK_BOOTTIME			7
 
 /*
  * The IDs of various hardware clocks:
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e8bf3ad..4c53af1 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -73,6 +73,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.get_time = &ktime_get,
 			.resolution = KTIME_LOW_RES,
 		},
+		{
+			.index = CLOCK_BOOTTIME,
+			.get_time = &ktime_get_boottime,
+			.resolution = KTIME_LOW_RES,
+		},
 	}
 };
 
@@ -90,16 +95,17 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
  */
 static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 {
-	ktime_t xtim, tomono;
+	ktime_t xtim, mono, boot;
 	struct timespec xts, tom, slp;
 
 	get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
 
 	xtim = timespec_to_ktime(xts);
-	tomono = timespec_to_ktime(tom);
+	mono = ktime_add(xtim, timespec_to_ktime(tom));
+	boot = ktime_add(mono, timespec_to_ktime(slp));
 	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
-	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time =
-		ktime_add(xtim, tomono);
+	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
+	base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
 }
 
 /*
@@ -727,6 +733,7 @@ static int hrtimer_switch_to_hres(void)
 	base->hres_active = 1;
 	base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
 	base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
+	base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
 
 	tick_setup_sched_timer();
 
@@ -1719,6 +1726,7 @@ void __init hrtimers_init(void)
 {
 	hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME;
 	hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC;
+	hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME;
 
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
-- 
cgit v0.10.2


From 7fdd7f89006dd5a4c702fa0ce0c272345fa44ae0 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 15 Feb 2011 10:52:57 -0800
Subject: timers: Export CLOCK_BOOTTIME via the posix timers interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch exports CLOCK_BOOTTIME through the posix timers interface

CC: Jamie Lokier <jamie@shareable.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alexander Shishkin <virtuoso@slind.org>
CC: Arve Hjønnevåg <arve@android.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 44fcff1..4c01249 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -187,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
 }
 
 /*
- * Get monotonic time for posix timers
+ * Get monotonic-raw time for posix timers
  */
 static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
 {
@@ -214,6 +214,14 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
 	*tp = ktime_to_timespec(KTIME_LOW_RES);
 	return 0;
 }
+
+static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
+{
+	get_monotonic_boottime(tp);
+	return 0;
+}
+
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -253,12 +261,23 @@ static __init int init_posix_timers(void)
 		.clock_getres	= posix_get_coarse_res,
 		.clock_get	= posix_get_monotonic_coarse,
 	};
+	struct k_clock clock_boottime = {
+		.clock_getres	= hrtimer_get_res,
+		.clock_get	= posix_get_boottime,
+		.nsleep		= common_nsleep,
+		.nsleep_restart	= hrtimer_nanosleep_restart,
+		.timer_create	= common_timer_create,
+		.timer_set	= common_timer_set,
+		.timer_get	= common_timer_get,
+		.timer_del	= common_timer_del,
+	};
 
 	posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
 	posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
 	posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
 	posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
 	posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
+	posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, SLAB_PANIC,
-- 
cgit v0.10.2


From fbee632d0ca9f4073a3fefb9a843eac8af036b0f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 21 Feb 2011 13:23:57 -0300
Subject: perf probe: Fix error propagation leading to segfault

There are two hunks in this patch that stops probe processing as soon as one
error is found, breaking out of loops, the other fix an error propagation that
should return a negative error number but instead was returning the result of
"ret < 0", which is 1 and thus made several error checks fail because they test
agains < 0.

The problem could be triggered by asking for a variable that was optimized out,
fact that should stop the whole probe processing but instead was segfaulting
while installing broken probes:

[root@emilia ~]# probe perf_mmap:55 user_lock_limit
Failed to find the location of user_lock_limit at this address.
 Perhaps, it has been optimized out.
Failed to find 'user_lock_limit' in this function.
Add new events:
  probe:perf_mmap      (on perf_mmap:55 with user_lock_limit)
  probe:perf_mmap_1    (on perf_mmap:55 with user_lock_limit)
Segmentation fault (core dumped)
[root@emilia ~]# perf probe -l
  probe:perf_mmap      (on perf_mmap:55@git/linux/kernel/perf_event.c with user_lock_limit)
  probe:perf_mmap_1    (on perf_mmap:55@git/linux/kernel/perf_event.c with user_lock_limit)
[root@emilia ~]#

After the fix:

[root@emilia ~]# probe perf_mmap:55 user_lock_limit
Failed to find the location of user_lock_limit at this address.
 Perhaps, it has been optimized out.
Failed to find 'user_lock_limit' in this function.
  Error: Failed to add events. (-2)
[root@emilia ~]#

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 0e3ea13..369ddc6 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1832,9 +1832,12 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
 	}
 
 	/* Loop 2: add all events */
-	for (i = 0; i < npevs && ret >= 0; i++)
+	for (i = 0; i < npevs && ret >= 0; i++) {
 		ret = __add_probe_trace_events(pkgs[i].pev, pkgs[i].tevs,
 						pkgs[i].ntevs, force_add);
+		if (ret < 0)
+			break;
+	}
 end:
 	/* Loop 3: cleanup and free trace events  */
 	for (i = 0; i < npevs; i++) {
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index fe461f6..eecbdca 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1262,7 +1262,7 @@ static int probe_point_line_walker(const char *fname, int lineno,
 	ret = call_probe_finder(NULL, pf);
 
 	/* Continue if no error, because the line will be in inline function */
-	return ret < 0 ?: 0;
+	return ret < 0 ? ret : 0;
 }
 
 /* Find probe point from its line number */
@@ -1484,6 +1484,8 @@ static int find_probes(int fd, struct probe_finder *pf)
 				pf->lno = pp->line;
 				ret = find_probe_point_by_line(pf);
 			}
+			if (ret != DWARF_CB_OK)
+				break;
 		}
 		off = noff;
 	}
-- 
cgit v0.10.2


From e603dc15072c7fec0ae263597e6dabc3bb4c5c5b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 21 Feb 2011 16:05:50 -0300
Subject: perf evsel: Fix inverted test for fixing up attr.inherit flag

The kernel refuses mmapping an event with the inherit flag set for
something that is systemwide (cpu == -1), and the evsel layer got this
reversed at some point, fix it.

The symtom was that the --pid and --tid parameters for 'perf record' and
'perf top' returned with -EINVAL, like:

 # /tmp/build-perf/perf record -v -fo/tmp/perf.data -p 1042
   Warning:  ... trying to fall back to cpu-clock-ticks

   Fatal: failed to mmap with 22 (Invalid argument)

Reported-by: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 63cadaf..8083d51 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -179,8 +179,19 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 
 	for (cpu = 0; cpu < cpus->nr; cpu++) {
 		int group_fd = -1;
-
-		evsel->attr.inherit = (cpus->map[cpu] < 0) && inherit;
+		/*
+		 * Don't allow mmap() of inherited per-task counters. This
+		 * would create a performance issue due to all children writing
+		 * to the same buffer.
+		 *
+		 * FIXME:
+		 * Proper fix is not to pass 'inherit' to perf_evsel__open*,
+		 * but a 'flags' parameter, with 'group' folded there as well,
+		 * then introduce a PERF_O_{MMAP,GROUP,INHERIT} enum, and if
+		 * O_MMAP is set, emit a warning if cpu < 0 and O_INHERIT is
+		 * set. Lets go for the minimal fix first tho.
+		 */
+		evsel->attr.inherit = (cpus->map[cpu] >= 0) && inherit;
 
 		for (thread = 0; thread < threads->nr; thread++) {
 
-- 
cgit v0.10.2


From 8635bf6ea3402154eec64763e6ed14972013c1c1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 22 Feb 2011 06:56:18 -0300
Subject: perf probe: Remove redundant checks

While fixing an error propagating problem in f809b25 I added two
redundant checks.

I did that because I didn't expect the checks to be on the while and for
loop condition expression, where they are tested before we run the loop,
where the 'ret' variable is set.

So remove it from there and leave it just after it is actually set,
eliminating unneded tests.

Reported-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 369ddc6..5ddee66 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1832,7 +1832,7 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
 	}
 
 	/* Loop 2: add all events */
-	for (i = 0; i < npevs && ret >= 0; i++) {
+	for (i = 0; i < npevs; i++) {
 		ret = __add_probe_trace_events(pkgs[i].pev, pkgs[i].tevs,
 						pkgs[i].ntevs, force_add);
 		if (ret < 0)
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index eecbdca..17f9c4a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1462,8 +1462,7 @@ static int find_probes(int fd, struct probe_finder *pf)
 	off = 0;
 	line_list__init(&pf->lcache);
 	/* Loop on CUs (Compilation Unit) */
-	while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL) &&
-	       ret >= 0) {
+	while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) {
 		/* Get the DIE(Debugging Information Entry) of this CU */
 		diep = dwarf_offdie(dbg, off + cuhl, &pf->cu_die);
 		if (!diep)
@@ -1484,7 +1483,7 @@ static int find_probes(int fd, struct probe_finder *pf)
 				pf->lno = pp->line;
 				ret = find_probe_point_by_line(pf);
 			}
-			if (ret != DWARF_CB_OK)
+			if (ret < 0)
 				break;
 		}
 		off = noff;
-- 
cgit v0.10.2


From fbe99959d1db85222829a64d869dcab704ac7ec8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 22 Feb 2011 11:10:08 +0100
Subject: x86-64, NUMA: Prepare numa_emulation() for moving NUMA emulation into
 a separate file

Update numa_emulation() such that, it

- takes @numa_meminfo and @numa_dist_cnt instead of directly
  referencing the global variables.

- copies the distance table by iterating each distance with
  node_distance() instead of memcpy'ing the distance table.

- tests emu_cmdline to determine whether emulation is requested and
  fills emu_nid_to_phys[] with identity mapping if emulation is not
  used.  This allows the caller to call numa_emulation()
  unconditionally and makes return value unncessary.

- defines dummy version if CONFIG_NUMA_EMU is disabled.

This patch doesn't introduce any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 6e4ee96..980d514 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -789,17 +789,20 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static bool __init numa_emulation(void)
+static void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+				  int numa_dist_cnt)
 {
 	static struct numa_meminfo ei __initdata;
 	static struct numa_meminfo pi __initdata;
 	const u64 max_addr = max_pfn << PAGE_SHIFT;
-	int phys_dist_cnt = numa_distance_cnt;
 	u8 *phys_dist = NULL;
 	int i, j, ret;
 
+	if (!emu_cmdline)
+		goto no_emu;
+
 	memset(&ei, 0, sizeof(ei));
-	pi = numa_meminfo;
+	pi = *numa_meminfo;
 
 	for (i = 0; i < MAX_NUMNODES; i++)
 		emu_nid_to_phys[i] = NUMA_NO_NODE;
@@ -822,19 +825,19 @@ static bool __init numa_emulation(void)
 	}
 
 	if (ret < 0)
-		return false;
+		goto no_emu;
 
 	if (numa_cleanup_meminfo(&ei) < 0) {
 		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
-		return false;
+		goto no_emu;
 	}
 
 	/*
 	 * Copy the original distance table.  It's temporary so no need to
 	 * reserve it.
 	 */
-	if (phys_dist_cnt) {
-		size_t size = phys_dist_cnt * sizeof(numa_distance[0]);
+	if (numa_dist_cnt) {
+		size_t size = numa_dist_cnt * sizeof(phys_dist[0]);
 		u64 phys;
 
 		phys = memblock_find_in_range(0,
@@ -842,14 +845,18 @@ static bool __init numa_emulation(void)
 					      size, PAGE_SIZE);
 		if (phys == MEMBLOCK_ERROR) {
 			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
-			return false;
+			goto no_emu;
 		}
 		phys_dist = __va(phys);
-		memcpy(phys_dist, numa_distance, size);
+
+		for (i = 0; i < numa_dist_cnt; i++)
+			for (j = 0; j < numa_dist_cnt; j++)
+				phys_dist[i * numa_dist_cnt + j] =
+					node_distance(i, j);
 	}
 
 	/* commit */
-	numa_meminfo = ei;
+	*numa_meminfo = ei;
 
 	/*
 	 * Transform __apicid_to_node table to use emulated nids by
@@ -878,18 +885,27 @@ static bool __init numa_emulation(void)
 			int physj = emu_nid_to_phys[j];
 			int dist;
 
-			if (physi >= phys_dist_cnt || physj >= phys_dist_cnt)
+			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
 				dist = physi == physj ?
 					LOCAL_DISTANCE : REMOTE_DISTANCE;
 			else
-				dist = phys_dist[physi * phys_dist_cnt + physj];
+				dist = phys_dist[physi * numa_dist_cnt + physj];
 
 			numa_set_distance(i, j, dist);
 		}
 	}
-	return true;
+	return;
+
+no_emu:
+	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		emu_nid_to_phys[i] = i;
 }
-#endif /* CONFIG_NUMA_EMU */
+#else	/* CONFIG_NUMA_EMU */
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+				  int numa_dist_cnt)
+{ }
+#endif	/* CONFIG_NUMA_EMU */
 
 static int __init dummy_numa_init(void)
 {
@@ -937,15 +953,9 @@ void __init initmem_init(void)
 
 		if (numa_cleanup_meminfo(&numa_meminfo) < 0)
 			continue;
-#ifdef CONFIG_NUMA_EMU
-		/*
-		 * If requested, try emulation.  If emulation is not used,
-		 * build identity emu_nid_to_phys[] for numa_add_cpu()
-		 */
-		if (!emu_cmdline || !numa_emulation())
-			for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
-				emu_nid_to_phys[j] = j;
-#endif
+
+		numa_emulation(&numa_meminfo, numa_distance_cnt);
+
 		if (numa_register_memblks(&numa_meminfo) < 0)
 			continue;
 
-- 
cgit v0.10.2


From b8ef9172b2aad7eeb1fcd37a9e632c7b24da1f64 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 22 Feb 2011 11:10:08 +0100
Subject: x86-64, NUMA: Move NUMA emulation into numa_emulation.c

Create numa_emulation.c and move all NUMA emulation code there.  The
definitions of struct numa_memblk and numa_meminfo are moved to
numa_64.h.  Also, numa_remove_memblk_from(), numa_cleanup_meminfo(),
numa_reset_distance() along with numa_emulation() are made global.

- v2: Internal declarations moved to numa_internal.h as suggested by
      Yinghai.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>

diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 09df2f9..3e608ed 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 obj-$(CONFIG_NUMA)		+= numa.o numa_$(BITS).o
 obj-$(CONFIG_AMD_NUMA)		+= amdtopology_64.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
+obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
 obj-$(CONFIG_HAVE_MEMBLOCK)		+= memblock.o
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 980d514..45a361b 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -18,20 +18,10 @@
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/dma.h>
-#include <asm/numa.h>
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
 
-struct numa_memblk {
-	u64			start;
-	u64			end;
-	int			nid;
-};
-
-struct numa_meminfo {
-	int			nr_blks;
-	struct numa_memblk	blk[NR_NODE_MEMBLKS];
-};
+#include "numa_internal.h"
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -215,7 +205,7 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
 	return 0;
 }
 
-static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
 {
 	mi->nr_blks--;
 	memmove(&mi->blk[idx], &mi->blk[idx + 1],
@@ -273,7 +263,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	node_set_online(nodeid);
 }
 
-static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 {
 	const u64 low = 0;
 	const u64 high = (u64)max_pfn << PAGE_SHIFT;
@@ -367,7 +357,7 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
  * Reset distance table.  The current table is freed.  The next
  * numa_set_distance() call will create a new one.
  */
-static void __init numa_reset_distance(void)
+void __init numa_reset_distance(void)
 {
 	size_t size;
 
@@ -525,388 +515,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 	return 0;
 }
 
-#ifdef CONFIG_NUMA_EMU
-/* Numa emulation */
-static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
-static char *emu_cmdline __initdata;
-
-void __init numa_emu_cmdline(char *str)
-{
-	emu_cmdline = str;
-}
-
-static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
-{
-	int i;
-
-	for (i = 0; i < mi->nr_blks; i++)
-		if (mi->blk[i].nid == nid)
-			return i;
-	return -ENOENT;
-}
-
-/*
- * Sets up nid to range from @start to @end.  The return value is -errno if
- * something went wrong, 0 otherwise.
- */
-static int __init emu_setup_memblk(struct numa_meminfo *ei,
-				   struct numa_meminfo *pi,
-				   int nid, int phys_blk, u64 size)
-{
-	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
-	struct numa_memblk *pb = &pi->blk[phys_blk];
-
-	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
-		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
-		return -EINVAL;
-	}
-
-	ei->nr_blks++;
-	eb->start = pb->start;
-	eb->end = pb->start + size;
-	eb->nid = nid;
-
-	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
-		emu_nid_to_phys[nid] = pb->nid;
-
-	pb->start += size;
-	if (pb->start >= pb->end) {
-		WARN_ON_ONCE(pb->start > pb->end);
-		numa_remove_memblk_from(phys_blk, pi);
-	}
-
-	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-	       eb->start, eb->end, (eb->end - eb->start) >> 20);
-	return 0;
-}
-
-/*
- * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr.  The return value is the number of nodes allocated.
- */
-static int __init split_nodes_interleave(struct numa_meminfo *ei,
-					 struct numa_meminfo *pi,
-					 u64 addr, u64 max_addr, int nr_nodes)
-{
-	nodemask_t physnode_mask = NODE_MASK_NONE;
-	u64 size;
-	int big;
-	int nid = 0;
-	int i, ret;
-
-	if (nr_nodes <= 0)
-		return -1;
-	if (nr_nodes > MAX_NUMNODES) {
-		pr_info("numa=fake=%d too large, reducing to %d\n",
-			nr_nodes, MAX_NUMNODES);
-		nr_nodes = MAX_NUMNODES;
-	}
-
-	size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
-	/*
-	 * Calculate the number of big nodes that can be allocated as a result
-	 * of consolidating the remainder.
-	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
-		FAKE_NODE_MIN_SIZE;
-
-	size &= FAKE_NODE_MIN_HASH_MASK;
-	if (!size) {
-		pr_err("Not enough memory for each node.  "
-			"NUMA emulation disabled.\n");
-		return -1;
-	}
-
-	for (i = 0; i < pi->nr_blks; i++)
-		node_set(pi->blk[i].nid, physnode_mask);
-
-	/*
-	 * Continue to fill physical nodes with fake nodes until there is no
-	 * memory left on any of them.
-	 */
-	while (nodes_weight(physnode_mask)) {
-		for_each_node_mask(i, physnode_mask) {
-			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
-			u64 start, limit, end;
-			int phys_blk;
-
-			phys_blk = emu_find_memblk_by_nid(i, pi);
-			if (phys_blk < 0) {
-				node_clear(i, physnode_mask);
-				continue;
-			}
-			start = pi->blk[phys_blk].start;
-			limit = pi->blk[phys_blk].end;
-			end = start + size;
-
-			if (nid < big)
-				end += FAKE_NODE_MIN_SIZE;
-
-			/*
-			 * Continue to add memory to this fake node if its
-			 * non-reserved memory is less than the per-node size.
-			 */
-			while (end - start -
-			       memblock_x86_hole_size(start, end) < size) {
-				end += FAKE_NODE_MIN_SIZE;
-				if (end > limit) {
-					end = limit;
-					break;
-				}
-			}
-
-			/*
-			 * If there won't be at least FAKE_NODE_MIN_SIZE of
-			 * non-reserved memory in ZONE_DMA32 for the next node,
-			 * this one must extend to the boundary.
-			 */
-			if (end < dma32_end && dma32_end - end -
-			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-				end = dma32_end;
-
-			/*
-			 * If there won't be enough non-reserved memory for the
-			 * next node, this one must extend to the end of the
-			 * physical node.
-			 */
-			if (limit - end -
-			    memblock_x86_hole_size(end, limit) < size)
-				end = limit;
-
-			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
-					       phys_blk,
-					       min(end, limit) - start);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
-/*
- * Returns the end address of a node so that there is at least `size' amount of
- * non-reserved memory or `max_addr' is reached.
- */
-static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
-{
-	u64 end = start + size;
-
-	while (end - start - memblock_x86_hole_size(start, end) < size) {
-		end += FAKE_NODE_MIN_SIZE;
-		if (end > max_addr) {
-			end = max_addr;
-			break;
-		}
-	}
-	return end;
-}
-
-/*
- * Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'.  The return value is the number of nodes allocated.
- */
-static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
-					      struct numa_meminfo *pi,
-					      u64 addr, u64 max_addr, u64 size)
-{
-	nodemask_t physnode_mask = NODE_MASK_NONE;
-	u64 min_size;
-	int nid = 0;
-	int i, ret;
-
-	if (!size)
-		return -1;
-	/*
-	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
-	 * increased accordingly if the requested size is too small.  This
-	 * creates a uniform distribution of node sizes across the entire
-	 * machine (but not necessarily over physical nodes).
-	 */
-	min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
-						MAX_NUMNODES;
-	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
-	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
-		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
-						FAKE_NODE_MIN_HASH_MASK;
-	if (size < min_size) {
-		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
-			size >> 20, min_size >> 20);
-		size = min_size;
-	}
-	size &= FAKE_NODE_MIN_HASH_MASK;
-
-	for (i = 0; i < pi->nr_blks; i++)
-		node_set(pi->blk[i].nid, physnode_mask);
-
-	/*
-	 * Fill physical nodes with fake nodes of size until there is no memory
-	 * left on any of them.
-	 */
-	while (nodes_weight(physnode_mask)) {
-		for_each_node_mask(i, physnode_mask) {
-			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
-			u64 start, limit, end;
-			int phys_blk;
-
-			phys_blk = emu_find_memblk_by_nid(i, pi);
-			if (phys_blk < 0) {
-				node_clear(i, physnode_mask);
-				continue;
-			}
-			start = pi->blk[phys_blk].start;
-			limit = pi->blk[phys_blk].end;
-
-			end = find_end_of_node(start, limit, size);
-			/*
-			 * If there won't be at least FAKE_NODE_MIN_SIZE of
-			 * non-reserved memory in ZONE_DMA32 for the next node,
-			 * this one must extend to the boundary.
-			 */
-			if (end < dma32_end && dma32_end - end -
-			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-				end = dma32_end;
-
-			/*
-			 * If there won't be enough non-reserved memory for the
-			 * next node, this one must extend to the end of the
-			 * physical node.
-			 */
-			if (limit - end -
-			    memblock_x86_hole_size(end, limit) < size)
-				end = limit;
-
-			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
-					       phys_blk,
-					       min(end, limit) - start);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
-/*
- * Sets up the system RAM area from start_pfn to last_pfn according to the
- * numa=fake command-line option.
- */
-static void __init numa_emulation(struct numa_meminfo *numa_meminfo,
-				  int numa_dist_cnt)
-{
-	static struct numa_meminfo ei __initdata;
-	static struct numa_meminfo pi __initdata;
-	const u64 max_addr = max_pfn << PAGE_SHIFT;
-	u8 *phys_dist = NULL;
-	int i, j, ret;
-
-	if (!emu_cmdline)
-		goto no_emu;
-
-	memset(&ei, 0, sizeof(ei));
-	pi = *numa_meminfo;
-
-	for (i = 0; i < MAX_NUMNODES; i++)
-		emu_nid_to_phys[i] = NUMA_NO_NODE;
-
-	/*
-	 * If the numa=fake command-line contains a 'M' or 'G', it represents
-	 * the fixed node size.  Otherwise, if it is just a single number N,
-	 * split the system RAM into N fake nodes.
-	 */
-	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
-		u64 size;
-
-		size = memparse(emu_cmdline, &emu_cmdline);
-		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
-	} else {
-		unsigned long n;
-
-		n = simple_strtoul(emu_cmdline, NULL, 0);
-		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
-	}
-
-	if (ret < 0)
-		goto no_emu;
-
-	if (numa_cleanup_meminfo(&ei) < 0) {
-		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
-		goto no_emu;
-	}
-
-	/*
-	 * Copy the original distance table.  It's temporary so no need to
-	 * reserve it.
-	 */
-	if (numa_dist_cnt) {
-		size_t size = numa_dist_cnt * sizeof(phys_dist[0]);
-		u64 phys;
-
-		phys = memblock_find_in_range(0,
-					      (u64)max_pfn_mapped << PAGE_SHIFT,
-					      size, PAGE_SIZE);
-		if (phys == MEMBLOCK_ERROR) {
-			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
-			goto no_emu;
-		}
-		phys_dist = __va(phys);
-
-		for (i = 0; i < numa_dist_cnt; i++)
-			for (j = 0; j < numa_dist_cnt; j++)
-				phys_dist[i * numa_dist_cnt + j] =
-					node_distance(i, j);
-	}
-
-	/* commit */
-	*numa_meminfo = ei;
-
-	/*
-	 * Transform __apicid_to_node table to use emulated nids by
-	 * reverse-mapping phys_nid.  The maps should always exist but fall
-	 * back to zero just in case.
-	 */
-	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
-		if (__apicid_to_node[i] == NUMA_NO_NODE)
-			continue;
-		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
-			if (__apicid_to_node[i] == emu_nid_to_phys[j])
-				break;
-		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
-	}
-
-	/* make sure all emulated nodes are mapped to a physical node */
-	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
-		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
-			emu_nid_to_phys[i] = 0;
-
-	/* transform distance table */
-	numa_reset_distance();
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		for (j = 0; j < MAX_NUMNODES; j++) {
-			int physi = emu_nid_to_phys[i];
-			int physj = emu_nid_to_phys[j];
-			int dist;
-
-			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
-				dist = physi == physj ?
-					LOCAL_DISTANCE : REMOTE_DISTANCE;
-			else
-				dist = phys_dist[physi * numa_dist_cnt + physj];
-
-			numa_set_distance(i, j, dist);
-		}
-	}
-	return;
-
-no_emu:
-	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
-	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
-		emu_nid_to_phys[i] = i;
-}
-#else	/* CONFIG_NUMA_EMU */
-static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
-				  int numa_dist_cnt)
-{ }
-#endif	/* CONFIG_NUMA_EMU */
-
 static int __init dummy_numa_init(void)
 {
 	printk(KERN_INFO "%s\n",
@@ -994,83 +602,3 @@ int __cpuinit numa_cpu_node(int cpu)
 		return __apicid_to_node[apicid];
 	return NUMA_NO_NODE;
 }
-
-/*
- * UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use
- * of 64bit specific data structures.  The distinction is artificial and
- * should be removed.  numa_{add|remove}_cpu() are implemented in numa.c
- * for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when
- * enabled.
- *
- * NUMA emulation is planned to be made generic and the following and other
- * related code should be moved to numa.c.
- */
-#ifdef CONFIG_NUMA_EMU
-# ifndef CONFIG_DEBUG_PER_CPU_MAPS
-void __cpuinit numa_add_cpu(int cpu)
-{
-	int physnid, nid;
-
-	nid = numa_cpu_node(cpu);
-	if (nid == NUMA_NO_NODE)
-		nid = early_cpu_to_node(cpu);
-	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
-
-	physnid = emu_nid_to_phys[nid];
-
-	/*
-	 * Map the cpu to each emulated node that is allocated on the physical
-	 * node of the cpu's apic id.
-	 */
-	for_each_online_node(nid)
-		if (emu_nid_to_phys[nid] == physnid)
-			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
-	int i;
-
-	for_each_online_node(i)
-		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
-}
-# else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
-{
-	struct cpumask *mask;
-	int nid, physnid, i;
-
-	nid = early_cpu_to_node(cpu);
-	if (nid == NUMA_NO_NODE) {
-		/* early_cpu_to_node() already emits a warning and trace */
-		return;
-	}
-
-	physnid = emu_nid_to_phys[nid];
-
-	for_each_online_node(i) {
-		if (emu_nid_to_phys[nid] != physnid)
-			continue;
-
-		mask = debug_cpumask_set_cpu(cpu, enable);
-		if (!mask)
-			return;
-
-		if (enable)
-			cpumask_set_cpu(cpu, mask);
-		else
-			cpumask_clear_cpu(cpu, mask);
-	}
-}
-
-void __cpuinit numa_add_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, 1);
-}
-
-void __cpuinit numa_remove_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, 0);
-}
-# endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
-#endif	/* CONFIG_NUMA_EMU */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 0000000..23fa2d0
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,452 @@
+/*
+ * NUMA emulation
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/topology.h>
+#include <linux/memblock.h>
+#include <asm/dma.h>
+
+#include "numa_internal.h"
+
+static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
+static char *emu_cmdline __initdata;
+
+void __init numa_emu_cmdline(char *str)
+{
+	emu_cmdline = str;
+}
+
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
+{
+	int i;
+
+	for (i = 0; i < mi->nr_blks; i++)
+		if (mi->blk[i].nid == nid)
+			return i;
+	return -ENOENT;
+}
+
+/*
+ * Sets up nid to range from @start to @end.  The return value is -errno if
+ * something went wrong, 0 otherwise.
+ */
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+				   struct numa_meminfo *pi,
+				   int nid, int phys_blk, u64 size)
+{
+	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+	struct numa_memblk *pb = &pi->blk[phys_blk];
+
+	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+		return -EINVAL;
+	}
+
+	ei->nr_blks++;
+	eb->start = pb->start;
+	eb->end = pb->start + size;
+	eb->nid = nid;
+
+	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
+		emu_nid_to_phys[nid] = pb->nid;
+
+	pb->start += size;
+	if (pb->start >= pb->end) {
+		WARN_ON_ONCE(pb->start > pb->end);
+		numa_remove_memblk_from(phys_blk, pi);
+	}
+
+	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+	       eb->start, eb->end, (eb->end - eb->start) >> 20);
+	return 0;
+}
+
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+					 struct numa_meminfo *pi,
+					 u64 addr, u64 max_addr, int nr_nodes)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 size;
+	int big;
+	int nid = 0;
+	int i, ret;
+
+	if (nr_nodes <= 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node.  "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	for (i = 0; i < pi->nr_blks; i++)
+		node_set(pi->blk[i].nid, physnode_mask);
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+			u64 start, limit, end;
+			int phys_blk;
+
+			phys_blk = emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+			start = pi->blk[phys_blk].start;
+			limit = pi->blk[phys_blk].end;
+			end = start + size;
+
+			if (nid < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - start -
+			       memblock_x86_hole_size(start, end) < size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > limit) {
+					end = limit;
+					break;
+				}
+			}
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (limit - end -
+			    memblock_x86_hole_size(end, limit) < size)
+				end = limit;
+
+			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+					       phys_blk,
+					       min(end, limit) - start);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
+ */
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
+{
+	u64 end = start + size;
+
+	while (end - start - memblock_x86_hole_size(start, end) < size) {
+		end += FAKE_NODE_MIN_SIZE;
+		if (end > max_addr) {
+			end = max_addr;
+			break;
+		}
+	}
+	return end;
+}
+
+/*
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+					      struct numa_meminfo *pi,
+					      u64 addr, u64 max_addr, u64 size)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 min_size;
+	int nid = 0;
+	int i, ret;
+
+	if (!size)
+		return -1;
+	/*
+	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+	 * increased accordingly if the requested size is too small.  This
+	 * creates a uniform distribution of node sizes across the entire
+	 * machine (but not necessarily over physical nodes).
+	 */
+	min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
+						MAX_NUMNODES;
+	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+						FAKE_NODE_MIN_HASH_MASK;
+	if (size < min_size) {
+		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+			size >> 20, min_size >> 20);
+		size = min_size;
+	}
+	size &= FAKE_NODE_MIN_HASH_MASK;
+
+	for (i = 0; i < pi->nr_blks; i++)
+		node_set(pi->blk[i].nid, physnode_mask);
+
+	/*
+	 * Fill physical nodes with fake nodes of size until there is no memory
+	 * left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
+			u64 start, limit, end;
+			int phys_blk;
+
+			phys_blk = emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+			start = pi->blk[phys_blk].start;
+			limit = pi->blk[phys_blk].end;
+
+			end = find_end_of_node(start, limit, size);
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (limit - end -
+			    memblock_x86_hole_size(end, limit) < size)
+				end = limit;
+
+			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+					       phys_blk,
+					       min(end, limit) - start);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Sets up the system RAM area from start_pfn to last_pfn according to the
+ * numa=fake command-line option.
+ */
+void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
+{
+	static struct numa_meminfo ei __initdata;
+	static struct numa_meminfo pi __initdata;
+	const u64 max_addr = max_pfn << PAGE_SHIFT;
+	u8 *phys_dist = NULL;
+	int i, j, ret;
+
+	if (!emu_cmdline)
+		goto no_emu;
+
+	memset(&ei, 0, sizeof(ei));
+	pi = *numa_meminfo;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		emu_nid_to_phys[i] = NUMA_NO_NODE;
+
+	/*
+	 * If the numa=fake command-line contains a 'M' or 'G', it represents
+	 * the fixed node size.  Otherwise, if it is just a single number N,
+	 * split the system RAM into N fake nodes.
+	 */
+	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+		u64 size;
+
+		size = memparse(emu_cmdline, &emu_cmdline);
+		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
+	} else {
+		unsigned long n;
+
+		n = simple_strtoul(emu_cmdline, NULL, 0);
+		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+	}
+
+	if (ret < 0)
+		goto no_emu;
+
+	if (numa_cleanup_meminfo(&ei) < 0) {
+		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+		goto no_emu;
+	}
+
+	/*
+	 * Copy the original distance table.  It's temporary so no need to
+	 * reserve it.
+	 */
+	if (numa_dist_cnt) {
+		size_t size = numa_dist_cnt * sizeof(phys_dist[0]);
+		u64 phys;
+
+		phys = memblock_find_in_range(0,
+					      (u64)max_pfn_mapped << PAGE_SHIFT,
+					      size, PAGE_SIZE);
+		if (phys == MEMBLOCK_ERROR) {
+			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+			goto no_emu;
+		}
+		phys_dist = __va(phys);
+
+		for (i = 0; i < numa_dist_cnt; i++)
+			for (j = 0; j < numa_dist_cnt; j++)
+				phys_dist[i * numa_dist_cnt + j] =
+					node_distance(i, j);
+	}
+
+	/* commit */
+	*numa_meminfo = ei;
+
+	/*
+	 * Transform __apicid_to_node table to use emulated nids by
+	 * reverse-mapping phys_nid.  The maps should always exist but fall
+	 * back to zero just in case.
+	 */
+	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+		if (__apicid_to_node[i] == NUMA_NO_NODE)
+			continue;
+		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+			if (__apicid_to_node[i] == emu_nid_to_phys[j])
+				break;
+		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+	}
+
+	/* make sure all emulated nodes are mapped to a physical node */
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
+			emu_nid_to_phys[i] = 0;
+
+	/* transform distance table */
+	numa_reset_distance();
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			int physi = emu_nid_to_phys[i];
+			int physj = emu_nid_to_phys[j];
+			int dist;
+
+			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+				dist = physi == physj ?
+					LOCAL_DISTANCE : REMOTE_DISTANCE;
+			else
+				dist = phys_dist[physi * numa_dist_cnt + physj];
+
+			numa_set_distance(i, j, dist);
+		}
+	}
+	return;
+
+no_emu:
+	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		emu_nid_to_phys[i] = i;
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+void __cpuinit numa_add_cpu(int cpu)
+{
+	int physnid, nid;
+
+	nid = numa_cpu_node(cpu);
+	if (nid == NUMA_NO_NODE)
+		nid = early_cpu_to_node(cpu);
+	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+	physnid = emu_nid_to_phys[nid];
+
+	/*
+	 * Map the cpu to each emulated node that is allocated on the physical
+	 * node of the cpu's apic id.
+	 */
+	for_each_online_node(nid)
+		if (emu_nid_to_phys[nid] == physnid)
+			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	int i;
+
+	for_each_online_node(i)
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	struct cpumask *mask;
+	int nid, physnid, i;
+
+	nid = early_cpu_to_node(cpu);
+	if (nid == NUMA_NO_NODE) {
+		/* early_cpu_to_node() already emits a warning and trace */
+		return;
+	}
+
+	physnid = emu_nid_to_phys[nid];
+
+	for_each_online_node(i) {
+		if (emu_nid_to_phys[nid] != physnid)
+			continue;
+
+		mask = debug_cpumask_set_cpu(cpu, enable);
+		if (!mask)
+			return;
+
+		if (enable)
+			cpumask_set_cpu(cpu, mask);
+		else
+			cpumask_clear_cpu(cpu, mask);
+	}
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 0000000..ef2d973
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,31 @@
+#ifndef __X86_MM_NUMA_INTERNAL_H
+#define __X86_MM_NUMA_INTERNAL_H
+
+#include <linux/types.h>
+#include <asm/numa.h>
+
+struct numa_memblk {
+	u64			start;
+	u64			end;
+	int			nid;
+};
+
+struct numa_meminfo {
+	int			nr_blks;
+	struct numa_memblk	blk[NR_NODE_MEMBLKS];
+};
+
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+void __init numa_reset_distance(void);
+
+#ifdef CONFIG_NUMA_EMU
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+			   int numa_dist_cnt);
+#else
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+				  int numa_dist_cnt)
+{ }
+#endif
+
+#endif	/* __X86_MM_NUMA_INTERNAL_H */
-- 
cgit v0.10.2


From 90e6b677b47ff8c5ba1637941af6b9f92723b003 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 22 Feb 2011 11:10:08 +0100
Subject: x86-64, NUMA: Add proper function comments to global functions

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 45a361b..848381b 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -205,6 +205,14 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
 	return 0;
 }
 
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
 void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
 {
 	mi->nr_blks--;
@@ -212,6 +220,17 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
 		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
 }
 
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
 int __init numa_add_memblk(int nid, u64 start, u64 end)
 {
 	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
@@ -263,6 +282,16 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	node_set_online(nodeid);
 }
 
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unncessary memblks.  Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
 int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 {
 	const u64 low = 0;
@@ -353,9 +382,11 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
 			node_set(mi->blk[i].nid, *nodemask);
 }
 
-/*
- * Reset distance table.  The current table is freed.  The next
- * numa_set_distance() call will create a new one.
+/**
+ * numa_reset_distance - Reset NUMA distance table
+ *
+ * The current table is freed.  The next numa_set_distance() call will
+ * create a new one.
  */
 void __init numa_reset_distance(void)
 {
@@ -370,10 +401,15 @@ void __init numa_reset_distance(void)
 	numa_distance = NULL;
 }
 
-/*
- * Set the distance between node @from to @to to @distance.  If distance
- * table doesn't exist, one which is large enough to accomodate all the
- * currently known nodes will be created.
+/**
+ * numa_set_distance - Set NUMA distance from one NUMA to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to'  node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance.  If distance table
+ * doesn't exist, one which is large enough to accomodate all the currently
+ * known nodes will be created.
  */
 void __init numa_set_distance(int from, int to, int distance)
 {
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 23fa2d0..607a2e8 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -267,9 +267,32 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 	return 0;
 }
 
-/*
- * Sets up the system RAM area from start_pfn to last_pfn according to the
- * numa=fake command-line option.
+/**
+ * numa_emulation - Emulate NUMA nodes
+ * @numa_meminfo: NUMA configuration to massage
+ * @numa_dist_cnt: The size of the physical NUMA distance table
+ *
+ * Emulate NUMA nodes according to the numa=fake kernel parameter.
+ * @numa_meminfo contains the physical memory configuration and is modified
+ * to reflect the emulated configuration on success.  @numa_dist_cnt is
+ * used to determine the size of the physical distance table.
+ *
+ * On success, the following modifications are made.
+ *
+ * - @numa_meminfo is updated to reflect the emulated nodes.
+ *
+ * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
+ *   emulated nodes.
+ *
+ * - NUMA distance table is rebuilt to represent distances between emulated
+ *   nodes.  The distances are determined considering how emulated nodes
+ *   are mapped to physical nodes and match the actual distances.
+ *
+ * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
+ *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
+ *
+ * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
+ * identity mapping and no other modification is made.
  */
 void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 {
-- 
cgit v0.10.2


From 2bf50555b0920be7e29d3823f6bbd20ee5920489 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 22 Feb 2011 11:18:49 +0100
Subject: x86-64, NUMA: Seperate out numa_alloc_distance() from
 numa_set_distance()

Alloc code is much bigger the distance setting.  Separate it out into
numa_alloc_distance() for readability.

-v2: Let alloc_numa_distance to return -ENOMEM on failing path,
     requested by tj.

-tj: Description update.  Minor tweaks including function name,
     location and return value check.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 848381b..cccc01d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -401,6 +401,44 @@ void __init numa_reset_distance(void)
 	numa_distance = NULL;
 }
 
+static int __init numa_alloc_distance(void)
+{
+	nodemask_t nodes_parsed;
+	size_t size;
+	int i, j, cnt = 0;
+	u64 phys;
+
+	/* size the new table and allocate it */
+	nodes_parsed = numa_nodes_parsed;
+	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
+
+	for_each_node_mask(i, nodes_parsed)
+		cnt = i;
+	size = ++cnt * sizeof(numa_distance[0]);
+
+	phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
+				      size, PAGE_SIZE);
+	if (phys == MEMBLOCK_ERROR) {
+		pr_warning("NUMA: Warning: can't allocate distance table!\n");
+		/* don't retry until explicitly reset */
+		numa_distance = (void *)1LU;
+		return -ENOMEM;
+	}
+	memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
+
+	numa_distance = __va(phys);
+	numa_distance_cnt = cnt;
+
+	/* fill with the default distances */
+	for (i = 0; i < cnt; i++)
+		for (j = 0; j < cnt; j++)
+			numa_distance[i * cnt + j] = i == j ?
+				LOCAL_DISTANCE : REMOTE_DISTANCE;
+	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+
+	return 0;
+}
+
 /**
  * numa_set_distance - Set NUMA distance from one NUMA to another
  * @from: the 'from' node to set distance
@@ -413,41 +451,8 @@ void __init numa_reset_distance(void)
  */
 void __init numa_set_distance(int from, int to, int distance)
 {
-	if (!numa_distance) {
-		nodemask_t nodes_parsed;
-		size_t size;
-		int i, j, cnt = 0;
-		u64 phys;
-
-		/* size the new table and allocate it */
-		nodes_parsed = numa_nodes_parsed;
-		numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
-
-		for_each_node_mask(i, nodes_parsed)
-			cnt = i;
-		size = ++cnt * sizeof(numa_distance[0]);
-
-		phys = memblock_find_in_range(0,
-					      (u64)max_pfn_mapped << PAGE_SHIFT,
-					      size, PAGE_SIZE);
-		if (phys == MEMBLOCK_ERROR) {
-			pr_warning("NUMA: Warning: can't allocate distance table!\n");
-			/* don't retry until explicitly reset */
-			numa_distance = (void *)1LU;
-			return;
-		}
-		memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
-
-		numa_distance = __va(phys);
-		numa_distance_cnt = cnt;
-
-		/* fill with the default distances */
-		for (i = 0; i < cnt; i++)
-			for (j = 0; j < cnt; j++)
-				numa_distance[i * cnt + j] = i == j ?
-					LOCAL_DISTANCE : REMOTE_DISTANCE;
-		printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
-	}
+	if (!numa_distance && numa_alloc_distance() < 0)
+		return;
 
 	if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
 		printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
-- 
cgit v0.10.2


From 70433c01613c2a44756c7b25f7bdd6c1c77b119f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 22 Feb 2011 12:50:12 +0100
Subject: genirq: Use the correct variable for note_interrupt

note_interrupt wants to be called with the combined result of all
handlers called, not with the last one. If it's a shared interrupt
then the last handler might return IRQ_NONE often enough to trigger
the spurious dectector which turns off a perfectly fine working
interrupt line. Bug was introduced in commit 1277a532(genirq: Simplify
handle_irq_event()).

Yes, I really messed up there. First the variable ret should not have
been named differently to avoid similarity with retval. Second it
should have been declared in the do {} loop.

Rename it to res and move it into the do {} loop and vanish under a
huge brown paperbag.

Reported-bisected-tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index cb62e2d..e099e9e 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,24 +54,26 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
 irqreturn_t
 handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 {
-	irqreturn_t ret, retval = IRQ_NONE;
+	irqreturn_t retval = IRQ_NONE;
 	unsigned int random = 0, irq = desc->irq_data.irq;
 
 	do {
+		irqreturn_t res;
+
 		trace_irq_handler_entry(irq, action);
-		ret = action->handler(irq, action->dev_id);
-		trace_irq_handler_exit(irq, action, ret);
+		res = action->handler(irq, action->dev_id);
+		trace_irq_handler_exit(irq, action, res);
 
 		if (WARN_ON_ONCE(!irqs_disabled()))
 			local_irq_disable();
 
-		switch (ret) {
+		switch (res) {
 		case IRQ_WAKE_THREAD:
 			/*
 			 * Set result to handled so the spurious check
 			 * does not trigger.
 			 */
-			ret = IRQ_HANDLED;
+			res = IRQ_HANDLED;
 
 			/*
 			 * Catch drivers which return WAKE_THREAD but
@@ -105,7 +107,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 			break;
 		}
 
-		retval |= ret;
+		retval |= res;
 		action = action->next;
 	} while (action);
 
@@ -113,7 +115,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 		add_interrupt_randomness(irq);
 
 	if (!noirqdebug)
-		note_interrupt(irq, desc, ret);
+		note_interrupt(irq, desc, retval);
 	return retval;
 }
 
-- 
cgit v0.10.2


From c97cf42219b7b6037d2f96c27a5f114f2383f828 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 22 Feb 2011 12:02:07 -0300
Subject: perf top: Live TUI Annotation

Now one has just to press the right key, 'a' or Enter on the main 'perf
top --tui' screen to live annotate the symbol under the cursor.

The annotate window starts centered on the hottest line (the one with
most samples so far) then TAB and shift+TAB can be used to go to the
prev/next hot line.

Pressing 'H' at any point will center again the screen on the hottest
line.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index c9fd66d..f88a263 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -92,7 +92,6 @@ static bool			dump_symtab                     =  false;
 static struct winsize		winsize;
 
 static const char		*sym_filter			=   NULL;
-struct sym_entry		*sym_filter_entry		=   NULL;
 struct sym_entry		*sym_filter_entry_sched		=   NULL;
 static int			sym_pcnt_filter			=      5;
 
@@ -168,18 +167,19 @@ static int parse_source(struct sym_entry *syme)
 	pthread_mutex_lock(&notes->lock);
 
 	if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) {
+		pthread_mutex_unlock(&notes->lock);
 		pr_err("Not enough memory for annotating '%s' symbol!\n",
 		       sym->name);
 		sleep(1);
-		goto out_unlock;
+		return err;
 	}
 
 	err = symbol__annotate(sym, syme->map, 0);
 	if (err == 0) {
 out_assign:
-	sym_filter_entry = syme;
+		top.sym_filter_entry = syme;
 	}
-out_unlock:
+
 	pthread_mutex_unlock(&notes->lock);
 	return err;
 }
@@ -195,7 +195,7 @@ static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip)
 	struct annotation *notes;
 	struct symbol *sym;
 
-	if (syme != sym_filter_entry)
+	if (syme != top.sym_filter_entry)
 		return;
 
 	sym = sym_entry__symbol(syme);
@@ -275,8 +275,8 @@ static void print_sym_table(struct perf_session *session)
 		       session->hists.stats.total_lost);
 	}
 
-	if (sym_filter_entry) {
-		show_details(sym_filter_entry);
+	if (top.sym_filter_entry) {
+		show_details(top.sym_filter_entry);
 		return;
 	}
 
@@ -417,8 +417,8 @@ static void print_mapped_keys(void)
 {
 	char *name = NULL;
 
-	if (sym_filter_entry) {
-		struct symbol *sym = sym_entry__symbol(sym_filter_entry);
+	if (top.sym_filter_entry) {
+		struct symbol *sym = sym_entry__symbol(top.sym_filter_entry);
 		name = sym->name;
 	}
 
@@ -549,15 +549,15 @@ static void handle_keypress(struct perf_session *session, int c)
 				perf_session__fprintf_dsos(session, stderr);
 			exit(0);
 		case 's':
-			prompt_symbol(&sym_filter_entry, "Enter details symbol");
+			prompt_symbol(&top.sym_filter_entry, "Enter details symbol");
 			break;
 		case 'S':
-			if (!sym_filter_entry)
+			if (!top.sym_filter_entry)
 				break;
 			else {
-				struct sym_entry *syme = sym_filter_entry;
+				struct sym_entry *syme = top.sym_filter_entry;
 
-				sym_filter_entry = NULL;
+				top.sym_filter_entry = NULL;
 				__zero_source_counters(syme);
 			}
 			break;
@@ -656,7 +656,7 @@ static int symbol_filter(struct map *map, struct symbol *sym)
 	syme->map = map;
 	symbol__annotate_init(map, sym);
 
-	if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) {
+	if (!top.sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) {
 		/* schedule initial sym_filter_entry setup */
 		sym_filter_entry_sched = syme;
 		sym_filter = NULL;
@@ -750,13 +750,13 @@ static void perf_event__process_sample(const union perf_event *event,
 
 	/* let's see, whether we need to install initial sym_filter_entry */
 	if (sym_filter_entry_sched) {
-		sym_filter_entry = sym_filter_entry_sched;
+		top.sym_filter_entry = sym_filter_entry_sched;
 		sym_filter_entry_sched = NULL;
-		if (parse_source(sym_filter_entry) < 0) {
-			struct symbol *sym = sym_entry__symbol(sym_filter_entry);
+		if (parse_source(top.sym_filter_entry) < 0) {
+			struct symbol *sym = sym_entry__symbol(top.sym_filter_entry);
 
 			pr_err("Can't annotate %s", sym->name);
-			if (sym_filter_entry->map->dso->origin == DSO__ORIG_KERNEL) {
+			if (top.sym_filter_entry->map->dso->origin == DSO__ORIG_KERNEL) {
 				pr_err(": No vmlinux file was found in the path:\n");
 				machine__fprintf_vmlinux_path(machine, stderr);
 			} else
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index e848803..c2c2868 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -90,12 +90,14 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map, int evidx,
 
 #ifdef NO_NEWT_SUPPORT
 static inline int symbol__tui_annotate(struct symbol *sym __used,
-				       struct map *map __used, int evidx __used)
+				       struct map *map __used,
+				       int evidx __used, int refresh __used)
 {
 	return 0;
 }
 #else
-int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx);
+int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
+			 int refresh);
 #endif
 
 #endif	/* __PERF_ANNOTATE_H */
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index 4f769f4..e8d28e2 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -44,6 +44,7 @@ struct perf_top {
 	pid_t		   target_pid, target_tid;
 	bool		   hide_kernel_symbols, hide_user_symbols, zero;
 	const char	   *cpu_list;
+	struct sym_entry   *sym_filter_entry;
 	struct perf_evsel  *sym_evsel;
 };
 
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c
index cfb5a27..8c17a87 100644
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -6,6 +6,7 @@
 #include "../../sort.h"
 #include "../../symbol.h"
 #include "../../annotate.h"
+#include <pthread.h>
 
 static void ui__error_window(const char *fmt, ...)
 {
@@ -138,46 +139,108 @@ static void annotate_browser__set_top(struct annotate_browser *self,
 	self->curr_hot = nd;
 }
 
-static int annotate_browser__run(struct annotate_browser *self)
+static void annotate_browser__calc_percent(struct annotate_browser *browser,
+					   int evidx)
 {
-	struct rb_node *nd;
+	struct symbol *sym = browser->b.priv;
+	struct annotation *notes = symbol__annotation(sym);
+	struct objdump_line *pos;
+
+	browser->entries = RB_ROOT;
+
+	pthread_mutex_lock(&notes->lock);
+
+	list_for_each_entry(pos, &notes->src->source, node) {
+		struct objdump_line_rb_node *rbpos = objdump_line__rb(pos);
+		rbpos->percent = objdump_line__calc_percent(pos, sym, evidx);
+		if (rbpos->percent < 0.01) {
+			RB_CLEAR_NODE(&rbpos->rb_node);
+			continue;
+		}
+		objdump__insert_line(&browser->entries, rbpos);
+	}
+	pthread_mutex_unlock(&notes->lock);
+
+	browser->curr_hot = rb_last(&browser->entries);
+}
+
+static int annotate_browser__run(struct annotate_browser *self, int evidx,
+				 int refresh)
+{
+	struct rb_node *nd = NULL;
 	struct symbol *sym = self->b.priv;
+	/*
+	 * RIGHT To allow builtin-annotate to cycle thru multiple symbols by
+	 * examining the exit key for this function.
+	 */
+	int exit_keys[] = { 'H', NEWT_KEY_TAB, NEWT_KEY_UNTAB,
+			    NEWT_KEY_RIGHT, 0 };
 	int key;
 
 	if (ui_browser__show(&self->b, sym->name,
-			     "<-, -> or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0)
+			     "<-, -> or ESC: exit, TAB/shift+TAB: "
+			     "cycle hottest lines, H: Hottest") < 0)
 		return -1;
-	/*
-	 * To allow builtin-annotate to cycle thru multiple symbols by
-	 * examining the exit key for this function.
-	 */
-	ui_browser__add_exit_key(&self->b, NEWT_KEY_RIGHT);
+
+	ui_browser__add_exit_keys(&self->b, exit_keys);
+	annotate_browser__calc_percent(self, evidx);
+
+	if (self->curr_hot)
+		annotate_browser__set_top(self, self->curr_hot);
 
 	nd = self->curr_hot;
-	if (nd) {
-		int tabs[] = { NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0 };
-		ui_browser__add_exit_keys(&self->b, tabs);
-	}
+
+	if (refresh != 0)
+		newtFormSetTimer(self->b.form, refresh);
 
 	while (1) {
 		key = ui_browser__run(&self->b);
 
+		if (refresh != 0) {
+			annotate_browser__calc_percent(self, evidx);
+			/*
+			 * Current line focus got out of the list of most active
+			 * lines, NULL it so that if TAB|UNTAB is pressed, we
+			 * move to curr_hot (current hottest line).
+			 */
+			if (nd != NULL && RB_EMPTY_NODE(nd))
+				nd = NULL;
+		}
+
 		switch (key) {
+		case -1:
+			/*
+ 			 * FIXME we need to check if it was
+ 			 * es.reason == NEWT_EXIT_TIMER
+ 			 */
+			if (refresh != 0)
+				symbol__annotate_decay_histogram(sym, evidx);
+			continue;
 		case NEWT_KEY_TAB:
-			nd = rb_prev(nd);
-			if (nd == NULL)
-				nd = rb_last(&self->entries);
-			annotate_browser__set_top(self, nd);
+			if (nd != NULL) {
+				nd = rb_prev(nd);
+				if (nd == NULL)
+					nd = rb_last(&self->entries);
+			} else
+				nd = self->curr_hot;
 			break;
 		case NEWT_KEY_UNTAB:
-			nd = rb_next(nd);
-			if (nd == NULL)
-				nd = rb_first(&self->entries);
-			annotate_browser__set_top(self, nd);
+			if (nd != NULL)
+				nd = rb_next(nd);
+				if (nd == NULL)
+					nd = rb_first(&self->entries);
+			else
+				nd = self->curr_hot;
+			break;
+		case 'H':
+			nd = self->curr_hot;
 			break;
 		default:
 			goto out;
 		}
+
+		if (nd != NULL)
+			annotate_browser__set_top(self, nd);
 	}
 out:
 	ui_browser__hide(&self->b);
@@ -186,13 +249,13 @@ out:
 
 int hist_entry__tui_annotate(struct hist_entry *he, int evidx)
 {
-	return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx);
+	return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, 0);
 }
 
-int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx)
+int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
+			 int refresh)
 {
 	struct objdump_line *pos, *n;
-	struct objdump_line_rb_node *rbpos;
 	struct annotation *notes = symbol__annotation(sym);
 	struct annotate_browser browser = {
 		.b = {
@@ -211,7 +274,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx)
 	if (map->dso->annotate_warned)
 		return -1;
 
-	if (symbol__annotate(sym, map, sizeof(*rbpos)) < 0) {
+	if (symbol__annotate(sym, map, sizeof(struct objdump_line_rb_node)) < 0) {
 		ui__error_window(ui_helpline__last_msg);
 		return -1;
 	}
@@ -219,26 +282,17 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx)
 	ui_helpline__push("Press <- or ESC to exit");
 
 	list_for_each_entry(pos, &notes->src->source, node) {
+		struct objdump_line_rb_node *rbpos;
 		size_t line_len = strlen(pos->line);
+
 		if (browser.b.width < line_len)
 			browser.b.width = line_len;
 		rbpos = objdump_line__rb(pos);
 		rbpos->idx = browser.b.nr_entries++;
-		rbpos->percent = objdump_line__calc_percent(pos, sym, evidx);
-		if (rbpos->percent < 0.01)
-			continue;
-		objdump__insert_line(&browser.entries, rbpos);
 	}
 
-	/*
-	 * Position the browser at the hottest line.
-	 */
-	browser.curr_hot = rb_last(&browser.entries);
-	if (browser.curr_hot)
-		annotate_browser__set_top(&browser, browser.curr_hot);
-
 	browser.b.width += 18; /* Percentage */
-	ret = annotate_browser__run(&browser);
+	ret = annotate_browser__run(&browser, evidx, refresh);
 	list_for_each_entry_safe(pos, n, &notes->src->source, node) {
 		list_del(&pos->node);
 		objdump_line__free(pos);
diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c
index ca60624..377ff58 100644
--- a/tools/perf/util/ui/browsers/top.c
+++ b/tools/perf/util/ui/browsers/top.c
@@ -7,6 +7,7 @@
  * Released under the GPL v2. (and only v2, not any later version)
  */
 #include "../browser.h"
+#include "../../annotate.h"
 #include "../helpline.h"
 #include "../libslang.h"
 #include "../../evlist.h"
@@ -18,6 +19,7 @@
 struct perf_top_browser {
 	struct ui_browser b;
 	struct rb_root	  root;
+	struct sym_entry  *selection;
 	float		  sum_ksamples;
 	int		  dso_width;
 	int		  dso_short_width;
@@ -60,6 +62,9 @@ static void perf_top_browser__write(struct ui_browser *browser, void *entry, int
 	slsmg_write_nstring(width >= syme->map->dso->long_name_len ?
 				syme->map->dso->long_name :
 				syme->map->dso->short_name, width);
+
+	if (current_entry)
+		top_browser->selection = syme;
 }
 
 static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser)
@@ -80,21 +85,52 @@ static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser)
 	browser->b.nr_entries = top->rb_entries;
 }
 
+static void perf_top_browser__annotate(struct perf_top_browser *browser)
+{
+	struct sym_entry *syme = browser->selection;
+	struct symbol *sym = sym_entry__symbol(syme);
+	struct annotation *notes = symbol__annotation(sym);
+	struct perf_top *top = browser->b.priv;
+
+	if (notes->src != NULL)
+		goto do_annotation;
+
+	pthread_mutex_lock(&notes->lock);
+
+	top->sym_filter_entry = NULL;
+
+	if (symbol__alloc_hist(sym, top->evlist->nr_entries) < 0) {
+		pr_err("Not enough memory for annotating '%s' symbol!\n",
+		       sym->name);
+		pthread_mutex_unlock(&notes->lock);
+		return;
+	}
+
+	top->sym_filter_entry = syme;
+
+	pthread_mutex_unlock(&notes->lock);
+do_annotation:
+	symbol__tui_annotate(sym, syme->map, 0, top->delay_secs * 1000);
+}
+
 static int perf_top_browser__run(struct perf_top_browser *browser)
 {
 	int key;
 	char title[160];
 	struct perf_top *top = browser->b.priv;
 	int delay_msecs = top->delay_secs * 1000;
+	int exit_keys[] = { 'a', NEWT_KEY_ENTER, NEWT_KEY_RIGHT, 0, };
 
 	perf_top_browser__update_rb_tree(browser);
         perf_top__header_snprintf(top, title, sizeof(title));
         perf_top__reset_sample_counters(top);
 
-	if (ui_browser__show(&browser->b, title, "ESC: exit") < 0)
+	if (ui_browser__show(&browser->b, title,
+			     "ESC: exit, ENTER|->|a: Live Annotate") < 0)
 		return -1;
 
 	newtFormSetTimer(browser->b.form, delay_msecs);
+	ui_browser__add_exit_keys(&browser->b, exit_keys);
 
 	while (1) {
 		key = ui_browser__run(&browser->b);
@@ -109,7 +145,12 @@ static int perf_top_browser__run(struct perf_top_browser *browser)
 			SLsmg_gotorc(0, 0);
 			slsmg_write_nstring(title, browser->b.width);
 			break;
-		case NEWT_KEY_TAB:
+		case 'a':
+		case NEWT_KEY_RIGHT:
+		case NEWT_KEY_ENTER:
+			if (browser->selection)
+				perf_top_browser__annotate(browser);
+			break;
 		default:
 			goto out;
 		}
-- 
cgit v0.10.2


From 2f14ddc3a7146ea4cd5a3d1ecd993f85f2e4f948 Mon Sep 17 00:00:00 2001
From: "Zhang, Fengzhe" <fengzhe.zhang@intel.com>
Date: Wed, 16 Feb 2011 22:26:20 +0800
Subject: xen/setup: Inhibit resource API from using System RAM E820 gaps as
 PCI mem gaps.

With the hypervisor argument of dom0_mem=X we iterate over the physical
(only for the initial domain) E820 and subtract the the size from each
E820_RAM region the delta so that the cumulative size of all E820_RAM regions
is equal to 'X'. This sometimes ends up with E820_RAM regions with zero size
(which are removed by e820_sanitize) and E820_RAM that are smaller
than physically.

Later on the PCI API looks at the E820 and attempts to set up an
resource region for the "PCI mem". The E820 (assume dom0_mem=1GB is
set) compared to the physical looks as so:

 [    0.000000] BIOS-provided physical RAM map:
 [    0.000000]  Xen: 0000000000000000 - 0000000000097c00 (usable)
 [    0.000000]  Xen: 0000000000097c00 - 0000000000100000 (reserved)
-[    0.000000]  Xen: 0000000000100000 - 00000000defafe00 (usable)
+[    0.000000]  Xen: 0000000000100000 - 0000000040000000 (usable)
 [    0.000000]  Xen: 00000000defafe00 - 00000000defb1ea0 (ACPI NVS)
 [    0.000000]  Xen: 00000000defb1ea0 - 00000000e0000000 (reserved)
 [    0.000000]  Xen: 00000000f4000000 - 00000000f8000000 (reserved)
..
And we get
[    0.000000] Allocating PCI resources starting at 40000000 (gap: 40000000:9efafe00)

while it should have started at e0000000 (a nice big gap up to
f4000000 exists). The "Allocating PCI" is part of the resource API.

The users that end up using those PCI I/O regions usually supply their
own BARs when calling the resource API (request_resource, or allocate_resource),
but there are exceptions which provide an empty 'struct resource' and
expect the API to provide the 'struct resource' to be populated with valid values.
The one that triggered this bug was the intel AGP driver that requested
a region for the flush page (intel_i9xx_setup_flush).

Before this patch, when running under Xen hypervisor, the 'struct resource'
returned could have (depending on the dom0_mem size) physical ranges of a 'System RAM'
instead of 'I/O' regions. This ended up with the Hypervisor failing a request
to populate PTE's with those PFNs as the domain did not have access to those
'System RAM' regions (rightly so).

After this patch, the left-over E820_RAM region from the truncation, will be
labeled as E820_UNUSABLE. The E820 will look as so:

 [    0.000000] BIOS-provided physical RAM map:
 [    0.000000]  Xen: 0000000000000000 - 0000000000097c00 (usable)
 [    0.000000]  Xen: 0000000000097c00 - 0000000000100000 (reserved)
-[    0.000000]  Xen: 0000000000100000 - 00000000defafe00 (usable)
+[    0.000000]  Xen: 0000000000100000 - 0000000040000000 (usable)
+[    0.000000]  Xen: 0000000040000000 - 00000000defafe00 (unusable)
 [    0.000000]  Xen: 00000000defafe00 - 00000000defb1ea0 (ACPI NVS)
 [    0.000000]  Xen: 00000000defb1ea0 - 00000000e0000000 (reserved)
 [    0.000000]  Xen: 00000000f4000000 - 00000000f8000000 (reserved)

For more information:
http://mid.gmane.org/1A42CE6F5F474C41B63392A5F80372B2335E978C@shsmsx501.ccr.corp.intel.com

BugLink: http://bugzilla.xensource.com/bugzilla/show_bug.cgi?id=1726

Signed-off-by: Fengzhe Zhang <fengzhe.zhang@intel.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a8a66a5..2a4add9 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -194,6 +194,14 @@ char * __init xen_memory_setup(void)
 			end -= delta;
 
 			extra_pages += PFN_DOWN(delta);
+			/*
+			 * Set RAM below 4GB that is not for us to be unusable.
+			 * This prevents "System RAM" address space from being
+			 * used as potential resource for I/O address (happens
+			 * when 'allocate_resource' is called).
+			 */
+			if (delta && end < 0x100000000UL)
+				e820_add_region(end, delta, E820_UNUSABLE);
 		}
 
 		if (map[i].size > 0 && end > xen_extra_mem_start)
-- 
cgit v0.10.2


From dbebbfbb1605f0179e7c0d900d941cc9c45de569 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 22 Feb 2011 21:46:25 +0100
Subject: rtmutex: tester: Remove the remaining BKL leftovers

We just leave the numbers assinged as commemoration and in case that
someone was crazy enough to reimplement the test stuff out of tree.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index d5b54350..5c9ccd3 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -44,9 +44,8 @@ enum test_opcodes {
 	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
 	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
 	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
-	RTTEST_LOCKBKL,		/* 9 Was: Lock BKL */
-	RTTEST_UNLOCKBKL,	/* 10 Was: Unlock BKL */
-	RTTEST_SIGNAL,		/* 11 Signal other test thread, data = thread id */
+	/* 9, 10 - reserved for BKL commemoration */
+	RTTEST_SIGNAL = 11,	/* 11 Signal other test thread, data = thread id */
 	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
 	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
 };
diff --git a/scripts/rt-tester/rt-tester.py b/scripts/rt-tester/rt-tester.py
index 44423b4..8c81d76 100644
--- a/scripts/rt-tester/rt-tester.py
+++ b/scripts/rt-tester/rt-tester.py
@@ -33,8 +33,6 @@ cmd_opcodes = {
     "lockintnowait" : "6",
     "lockcont"      : "7",
     "unlock"        : "8",
-    "lockbkl"       : "9",
-    "unlockbkl"     : "10",
     "signal"        : "11",
     "resetevent"    : "98",
     "reset"         : "99",
diff --git a/scripts/rt-tester/t2-l1-2rt-sameprio.tst b/scripts/rt-tester/t2-l1-2rt-sameprio.tst
index 8821f27..3710c8b 100644
--- a/scripts/rt-tester/t2-l1-2rt-sameprio.tst
+++ b/scripts/rt-tester/t2-l1-2rt-sameprio.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	0
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t2-l1-pi.tst b/scripts/rt-tester/t2-l1-pi.tst
index cde1f18..b4cc959 100644
--- a/scripts/rt-tester/t2-l1-pi.tst
+++ b/scripts/rt-tester/t2-l1-pi.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	0
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t2-l1-signal.tst b/scripts/rt-tester/t2-l1-signal.tst
index 3ab0bfc..1b57376 100644
--- a/scripts/rt-tester/t2-l1-signal.tst
+++ b/scripts/rt-tester/t2-l1-signal.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	0
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t2-l2-2rt-deadlock.tst b/scripts/rt-tester/t2-l2-2rt-deadlock.tst
index f4b5d5d..68b1062 100644
--- a/scripts/rt-tester/t2-l2-2rt-deadlock.tst
+++ b/scripts/rt-tester/t2-l2-2rt-deadlock.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	0
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t3-l1-pi-1rt.tst b/scripts/rt-tester/t3-l1-pi-1rt.tst
index 63440ca..8e6c8b1 100644
--- a/scripts/rt-tester/t3-l1-pi-1rt.tst
+++ b/scripts/rt-tester/t3-l1-pi-1rt.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	thread to signal (0-7)
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t3-l1-pi-2rt.tst b/scripts/rt-tester/t3-l1-pi-2rt.tst
index e5816fe..69c2212 100644
--- a/scripts/rt-tester/t3-l1-pi-2rt.tst
+++ b/scripts/rt-tester/t3-l1-pi-2rt.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	thread to signal (0-7)
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t3-l1-pi-3rt.tst b/scripts/rt-tester/t3-l1-pi-3rt.tst
index 718b82b..9b0f1eb 100644
--- a/scripts/rt-tester/t3-l1-pi-3rt.tst
+++ b/scripts/rt-tester/t3-l1-pi-3rt.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	thread to signal (0-7)
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t3-l1-pi-signal.tst b/scripts/rt-tester/t3-l1-pi-signal.tst
index c6e2135..39ec74a 100644
--- a/scripts/rt-tester/t3-l1-pi-signal.tst
+++ b/scripts/rt-tester/t3-l1-pi-signal.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	thread to signal (0-7)
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t3-l1-pi-steal.tst b/scripts/rt-tester/t3-l1-pi-steal.tst
index f53749d..e03db7e 100644
--- a/scripts/rt-tester/t3-l1-pi-steal.tst
+++ b/scripts/rt-tester/t3-l1-pi-steal.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	thread to signal (0-7)
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t3-l2-pi.tst b/scripts/rt-tester/t3-l2-pi.tst
index cdc3e4f..7b59100 100644
--- a/scripts/rt-tester/t3-l2-pi.tst
+++ b/scripts/rt-tester/t3-l2-pi.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	thread to signal (0-7)
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t4-l2-pi-deboost.tst b/scripts/rt-tester/t4-l2-pi-deboost.tst
index baa1413..2f0e049 100644
--- a/scripts/rt-tester/t4-l2-pi-deboost.tst
+++ b/scripts/rt-tester/t4-l2-pi-deboost.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	thread to signal (0-7)
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
index e6ec0c8..04f4034 100644
--- a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
+++ b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	thread to signal (0-7)
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
index ca64f8b..a48a6ee 100644
--- a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
+++ b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
@@ -19,8 +19,6 @@
 # lockintnowait	lock nr (0-7)
 # lockcont	lock nr (0-7)
 # unlock	lock nr (0-7)
-# lockbkl	lock nr (0-7)
-# unlockbkl	lock nr (0-7)
 # signal	thread to signal (0-7)
 # reset		0
 # resetevent	0
@@ -39,9 +37,6 @@
 # blocked	lock nr (0-7)
 # blockedwake	lock nr (0-7)
 # unlocked	lock nr (0-7)
-# lockedbkl	dont care
-# blockedbkl	dont care
-# unlockedbkl	dont care
 # opcodeeq	command opcode or number
 # opcodelt	number
 # opcodegt	number
-- 
cgit v0.10.2


From fd4afaf33313d94f548cb09129ecba3dbab62931 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Thu, 17 Feb 2011 13:39:05 +0000
Subject: genirq: Streamline kernel/irq/Kconfig

"def_bool n" without prompt is pointless, these should be just "bool".

[ tglx: Adapted to latest changes ]

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4D5D3309020000780003264A@vpn.id2.novell.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 9149be7..355b8c7 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,5 @@
 config HAVE_GENERIC_HARDIRQS
-	def_bool n
+	bool
 
 if HAVE_GENERIC_HARDIRQS
 menu "IRQ subsystem"
@@ -11,32 +11,32 @@ config GENERIC_HARDIRQS
 
 # Select this to disable the deprecated stuff
 config GENERIC_HARDIRQS_NO_DEPRECATED
-       def_bool n
+       bool
 
 config GENERIC_HARDIRQS_NO_COMPAT
-       def_bool n
+       bool
 
 # Options selectable by the architecture code
 config HAVE_SPARSE_IRQ
-       def_bool n
+       bool
 
 config GENERIC_IRQ_PROBE
-	def_bool n
+	bool
 
 config GENERIC_IRQ_SHOW
-       def_bool n
+       bool
 
 config GENERIC_PENDING_IRQ
-	def_bool n
+	bool
 
 config AUTO_IRQ_AFFINITY
-       def_bool n
+       bool
 
 config HARDIRQS_SW_RESEND
-       def_bool n
+       bool
 
 config IRQ_PREFLOW_FASTEOI
-       def_bool n
+       bool
 
 config SPARSE_IRQ
 	bool "Support sparse irq numbering"
-- 
cgit v0.10.2


From 540089798d2ae115fc9bff7ed3823c8c32249607 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 09:50:15 +0100
Subject: x86: OLPC: Remove redundant !X64_64 config dependency

OLPC is under if X86_32 already.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andres Salomon <dilinger@queued.net>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d..e327f96 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2068,7 +2068,7 @@ config OLPC
 	bool "One Laptop Per Child support"
 	select GPIOLIB
 	select OLPC_OPENFIRMWARE
-	depends on !X86_64 && !X86_PAE
+	depends on !X86_PAE
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
@@ -2081,7 +2081,7 @@ config OLPC_XO1
 
 config OLPC_OPENFIRMWARE
 	bool "Support for OLPC's Open Firmware"
-	depends on !X86_64 && !X86_PAE
+	depends on !X86_PAE
 	default n
 	select OF
 	help
-- 
cgit v0.10.2


From fe239545a1eed57a60c5d4063f0b56f6cd1811ff Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 10:05:53 +0100
Subject: x86: OLPC: Hide OLPC_OPENFIRMWARE config switch

OLPC selects OLPC_OPENFIRMWARE unconditionally. If OLPC=n then
the OLPC_OPENFIRMWARE functionality is pointless.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andres Salomon <dilinger@queued.net>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e327f96..dbc10fc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2080,18 +2080,12 @@ config OLPC_XO1
 	  Add support for non-essential features of the OLPC XO-1 laptop.
 
 config OLPC_OPENFIRMWARE
-	bool "Support for OLPC's Open Firmware"
-	depends on !X86_PAE
-	default n
+	bool
 	select OF
-	help
-	  This option adds support for the implementation of Open Firmware
-	  that is used on the OLPC XO-1 Children's Machine.
-	  If unsure, say N here.
+	select OLPC_OPENFIRMWARE_DT if PROC_DEVICETREE
 
 config OLPC_OPENFIRMWARE_DT
 	bool
-	default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
 	select OF_PROMTREE
 
 endif # X86_32
-- 
cgit v0.10.2


From dc3119e700216a70e82fe07a79f1618852058354 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 10:08:31 +0100
Subject: x86: OLPC: Cleanup config maze completely

Neither CONFIG_OLPC_OPENFIRMWARE nor CONFIG_OLPC_OPENFIRMWARE_DT are
really necessary.

OLPC selects OLPC_OPENFIRMWARE unconditionally, so move the "select
OF" part under OLPC config option and fixup the dependencies in
Makefiles and code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andres Salomon <dilinger@queued.net>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dbc10fc..677501d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2066,9 +2066,10 @@ config SCx200HR_TIMER
 
 config OLPC
 	bool "One Laptop Per Child support"
-	select GPIOLIB
-	select OLPC_OPENFIRMWARE
 	depends on !X86_PAE
+	select GPIOLIB
+	select OF
+	select OLPC_OPENFIRMWARE_DT if PROC_DEVICETREE
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
@@ -2079,11 +2080,6 @@ config OLPC_XO1
 	---help---
 	  Add support for non-essential features of the OLPC XO-1 laptop.
 
-config OLPC_OPENFIRMWARE
-	bool
-	select OF
-	select OLPC_OPENFIRMWARE_DT if PROC_DEVICETREE
-
 config OLPC_OPENFIRMWARE_DT
 	bool
 	select OF_PROMTREE
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 641988e..1fff2de 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -6,8 +6,6 @@
 
 #define OLPC_OFW_SIG 0x2057464F	/* aka "OFW " */
 
-#ifdef CONFIG_OLPC_OPENFIRMWARE
-
 extern bool olpc_ofw_is_installed(void);
 
 /* run an OFW command by calling into the firmware */
@@ -26,15 +24,6 @@ extern void setup_olpc_ofw_pgd(void);
 /* check if OFW was detected during boot */
 extern bool olpc_ofw_present(void);
 
-#else /* !CONFIG_OLPC_OPENFIRMWARE */
-
-static inline bool olpc_ofw_is_installed(void) { return false; }
-static inline void olpc_ofw_detect(void) { }
-static inline void setup_olpc_ofw_pgd(void) { }
-static inline bool olpc_ofw_present(void) { return false; }
-
-#endif /* !CONFIG_OLPC_OPENFIRMWARE */
-
 #ifdef CONFIG_OLPC_OPENFIRMWARE_DT
 extern void olpc_dt_build_devicetree(void);
 #else
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 767d6c4..d8cc18a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -137,7 +137,7 @@ ENTRY(startup_32)
 	movsl
 1:
 
-#ifdef CONFIG_OLPC_OPENFIRMWARE
+#ifdef CONFIG_OLPC
 	/* save OFW's pgdir table for later use when calling into OFW */
 	movl %cr3, %eax
 	movl %eax, pa(olpc_ofw_pgd)
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index e797428..e18e641 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_OLPC)		+= olpc.o
 obj-$(CONFIG_OLPC_XO1)		+= olpc-xo1.o
-obj-$(CONFIG_OLPC_OPENFIRMWARE)	+= olpc_ofw.o
+obj-$(CONFIG_OLPC)		+= olpc_ofw.o
 obj-$(CONFIG_OLPC_OPENFIRMWARE_DT)	+= olpc_dt.o
-- 
cgit v0.10.2


From c2a941fadb57d157d59eec424674bd0c3a28788c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 10:32:42 +0100
Subject: x86: OLPC: Remove extra OLPC_OPENFIRMWARE_DT indirection

OLPC_OPENFIRMWARE_DT is just there to be selected by OLPC and selects
OF_PROMTREE. So let OLPC select OF_PROMTREE and remove that extra
config indirection. Fixup code and Makefile and use CONFIG_OF_PROMTREE
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andres Salomon <dilinger@queued.net>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 677501d..4f3d3d4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2069,7 +2069,7 @@ config OLPC
 	depends on !X86_PAE
 	select GPIOLIB
 	select OF
-	select OLPC_OPENFIRMWARE_DT if PROC_DEVICETREE
+	select OF_PROMTREE if PROC_DEVICETREE
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
@@ -2080,10 +2080,6 @@ config OLPC_XO1
 	---help---
 	  Add support for non-essential features of the OLPC XO-1 laptop.
 
-config OLPC_OPENFIRMWARE_DT
-	bool
-	select OF_PROMTREE
-
 endif # X86_32
 
 config AMD_NB
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 1fff2de..28a7fec 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -24,10 +24,10 @@ extern void setup_olpc_ofw_pgd(void);
 /* check if OFW was detected during boot */
 extern bool olpc_ofw_present(void);
 
-#ifdef CONFIG_OLPC_OPENFIRMWARE_DT
+#ifdef CONFIG_OF_PROMTREE
 extern void olpc_dt_build_devicetree(void);
 #else
 static inline void olpc_dt_build_devicetree(void) { }
-#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */
+#endif
 
 #endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index e18e641..c2a8cab 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_OLPC)		+= olpc.o
 obj-$(CONFIG_OLPC_XO1)		+= olpc-xo1.o
 obj-$(CONFIG_OLPC)		+= olpc_ofw.o
-obj-$(CONFIG_OLPC_OPENFIRMWARE_DT)	+= olpc_dt.o
+obj-$(CONFIG_OF_PROMTREE)	+= olpc_dt.o
-- 
cgit v0.10.2


From 6435a5e39d3e01a1a73a925ed53ee18619b0a368 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 23 Feb 2011 07:25:02 -0300
Subject: perf top browser: Adjust the browser indexes when refreshing

This is not a problem when we're not at the bottom of the active symbols
list, so was not noticed, but at the end of the screen it falls apart.

Fix it by adjusting the ui_browser indexes according to the new number
of entries in the rb_tree and by seeking from the start of the rb_tree
to find the new symbol at the top of the screen.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c
index 377ff58..2f47224 100644
--- a/tools/perf/util/ui/browsers/top.c
+++ b/tools/perf/util/ui/browsers/top.c
@@ -70,6 +70,7 @@ static void perf_top_browser__write(struct ui_browser *browser, void *entry, int
 static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser)
 {
 	struct perf_top *top = browser->b.priv;
+	u64 top_idx = browser->b.top_idx;
 
 	browser->root = RB_ROOT;
 	browser->b.top = NULL;
@@ -82,7 +83,29 @@ static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser)
 		if (browser->sym_width + browser->dso_width > browser->b.width - 29)
 			browser->sym_width = browser->b.width - browser->dso_width - 29;
 	}
+
+	/*
+	 * Adjust the ui_browser indexes since the entries in the browser->root
+	 * rb_tree may have changed, then seek it from start, so that we get a
+	 * possible new top of the screen.
+ 	 */
 	browser->b.nr_entries = top->rb_entries;
+
+	if (top_idx >= browser->b.nr_entries) {
+		if (browser->b.height >= browser->b.nr_entries)
+			top_idx = browser->b.nr_entries - browser->b.height;
+		else
+			top_idx = 0;
+	}
+
+	if (browser->b.index >= top_idx + browser->b.height)
+		browser->b.index = top_idx + browser->b.index - browser->b.top_idx;
+
+	if (browser->b.index >= browser->b.nr_entries)
+		browser->b.index = browser->b.nr_entries - 1;
+
+	browser->b.top_idx = top_idx;
+	browser->b.seek(&browser->b, top_idx, SEEK_SET);
 }
 
 static void perf_top_browser__annotate(struct perf_top_browser *browser)
-- 
cgit v0.10.2


From 9826e8329bc160e4cc58b83019f3f056965e42d0 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Tue, 22 Feb 2011 21:53:12 +0100
Subject: perf lock: Document valid sort keys

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110222205312.GA18474@joi.lan>
Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index 921de25..4a26a2f 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -24,8 +24,8 @@ and statistics with this 'perf lock' command.
 
   'perf lock report' reports statistical data.
 
-OPTIONS
--------
+COMMON OPTIONS
+--------------
 
 -i::
 --input=<file>::
@@ -39,6 +39,14 @@ OPTIONS
 --dump-raw-trace::
         Dump raw trace in ASCII.
 
+REPORT OPTIONS
+--------------
+
+-k::
+--key=<value>::
+        Sorting key. Possible values: acquired (default), contended,
+        wait_total, wait_max, wait_min.
+
 SEE ALSO
 --------
 linkperf:perf[1]
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index e00d938..2e93f99 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -893,7 +893,7 @@ static const char * const report_usage[] = {
 
 static const struct option report_options[] = {
 	OPT_STRING('k', "key", &sort_key, "acquired",
-		    "key for sorting"),
+		    "key for sorting (acquired / contended / wait_total / wait_max / wait_min)"),
 	/* TODO: type */
 	OPT_END()
 };
-- 
cgit v0.10.2


From c186fafe9aba87c1a93df8c7120a6ae01fe435ad Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 21 Feb 2011 18:52:53 +0100
Subject: sched: Clean up remnants of sd_idle

With the wholesale removal of the sd_idle SMT logic we can clean up
some more.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikhil Rao <ncrao@google.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d384e73..cd18600 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3150,25 +3150,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
-	/*
-	 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
-	 * And to check for busy balance use !idle_cpu instead of
-	 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
-	 * even when they are idle.
-	 */
-	if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
-		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-			goto out_balanced;
-	} else {
+	if (idle == CPU_IDLE) {
 		/*
 		 * This cpu is idle. If the busiest group load doesn't
 		 * have more tasks than the number of available cpu's and
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
-		if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
 		    sds.busiest_nr_running <= sds.busiest_group_weight)
 			goto out_balanced;
+	} else {
+		/*
+		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+		 * imbalance_pct to be conservative.
+		 */
+		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+			goto out_balanced;
 	}
 
 force_balance:
@@ -3862,8 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 			if (load_balance(cpu, rq, sd, idle, &balance)) {
 				/*
 				 * We've pulled tasks over so either we're no
-				 * longer idle, or one of our SMT siblings is
-				 * not idle.
+				 * longer idle.
 				 */
 				idle = CPU_NOT_IDLE;
 			}
-- 
cgit v0.10.2


From cc57aa8f4b3bece8c26c7929728edcc5fa6b5aed Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 21 Feb 2011 18:55:32 +0100
Subject: sched: Clean up some f_b_g() comments

The existing comment tends to grow state (as it already has), split it
up and place it near the actual tests.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikhil Rao <ncrao@google.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cd18600..03496eb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3113,19 +3113,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 */
 	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
 
-	/* Cases where imbalance does not exist from POV of this_cpu */
-	/* 1) this_cpu is not the appropriate cpu to perform load balancing
-	 *    at this level.
-	 * 2) There is no busy sibling group to pull from.
-	 * 3) This group is the busiest group.
-	 * 4) This group is more busy than the avg busieness at this
-	 *    sched_domain.
-	 * 5) The imbalance is within the specified limit.
-	 *
-	 * Note: when doing newidle balance, if the local group has excess
-	 * capacity (i.e. nr_running < group_capacity) and the busiest group
-	 * does not have any capacity, we force a load balance to pull tasks
-	 * to the local group. In this case, we skip past checks 3, 4 and 5.
+	/*
+	 * this_cpu is not the appropriate cpu to perform load balancing at
+	 * this level.
 	 */
 	if (!(*balance))
 		goto ret;
@@ -3134,19 +3124,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	    check_asym_packing(sd, &sds, this_cpu, imbalance))
 		return sds.busiest;
 
+	/* There is no busy sibling group to pull tasks from */
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
-	/*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
 	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
 			!sds.busiest_has_capacity)
 		goto force_balance;
 
+	/*
+	 * If the local group is more busy than the selected busiest group
+	 * don't try and pull any tasks.
+	 */
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
+	/*
+	 * Don't pull any tasks if this group is already above the domain
+	 * average load.
+	 */
 	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
-- 
cgit v0.10.2


From 866ab43efd325fae8889ea77a744d03f2b957e38 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 21 Feb 2011 18:56:47 +0100
Subject: sched: Fix the group_imb logic

On a 2*6*2 machine something like:

 taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done'

_should_ result in 9 busy CPUs, each running 1 task.

However it didn't quite work reliably, most of the time one cpu of the
second socket (6-11) would be idle and one cpu of the first socket
(0-5) would have two tasks on it.

The group_imb logic is supposed to deal with this and detect when a
particular group is imbalanced (like in our case, 0-2 are idle but 3-5
will have 4 tasks on it).

The detection phase needed a bit of a tweak as it was too weak and
required more than 2 avg weight tasks difference between idle and busy
cpus in the group which won't trigger for our test-case. So cure that
to be one or more avg task weight difference between cpus.

Once the detection phase worked, it was then defeated by the f_b_g()
tests trying to avoid ping-pongs. In particular, this_load >= max_load
triggered because the pulling cpu (the (first) idle cpu in on the
second socket, say 6) would find this_load to be 5 and max_load to be
4 (there'd be 5 tasks running on our socket and only 4 on the other
socket).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikhil Rao <ncrao@google.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 03496eb..3a88dee 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2743,7 +2743,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of two tasks.
+	 * than the average weight of a task.
 	 *
 	 * APZ: with cgroup the avg task weight can vary wildly and
 	 *      might not be a suitable number - should we keep a
@@ -2753,7 +2753,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (sgs->sum_nr_running)
 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
+	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -3128,6 +3128,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	/*
+	 * If the busiest group is imbalanced the below checks don't
+	 * work because they assumes all things are equal, which typically
+	 * isn't true due to cpus_allowed constraints and the like.
+	 */
+	if (sds.group_imb)
+		goto force_balance;
+
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
 	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
 			!sds.busiest_has_capacity)
-- 
cgit v0.10.2


From 1747b21fecbfb63fbf6b9624e8b92707960d5a97 Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Sun, 20 Feb 2011 15:08:12 +0800
Subject: sched, autogroup, sysctl: Use proc_dointvec_minmax() instead

sched_autogroup_enabled has min/max value, proc_dointvec_minmax() is
be used for this case.

Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1298185696-4403-2-git-send-email-yong.zhang0@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cb7c830..7b5eead 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -367,7 +367,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_sched_autogroup_enabled,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-- 
cgit v0.10.2


From 800d4d30c8f20bd728e5741a3b77c4859a613f7c Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Sun, 20 Feb 2011 15:08:14 +0800
Subject: sched, autogroup: Stop going ahead if autogroup is disabled

when autogroup is disable from the beginning,
sched_autogroup_create_attach()
  autogroup_move_group()                    <== 1
    sched_move_task()                       <== 2
      task_move_group_fair()
        set_task_rq()
          task_group()
            autogroup_task_group()

We go the whole path without doing anything useful.

Then stop going further if autogroup is disabled.

But there will be a race window between 1 and 2, in which
sysctl_sched_autogroup_enabled is enabled. This issue
will be toke by following patch.

Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1298185696-4403-4-git-send-email-yong.zhang0@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 9fb6562..137a096 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -161,11 +161,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 
 	p->signal->autogroup = autogroup_kref_get(ag);
 
+	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+		goto out;
+
 	t = p;
 	do {
 		sched_move_task(t);
 	} while_each_thread(p, t);
 
+out:
 	unlock_task_sighand(p, &flags);
 	autogroup_kref_put(prev);
 }
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 7b859ff..0557705 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@
 #ifdef CONFIG_SCHED_AUTOGROUP
 
 struct autogroup {
+	/*
+	 * reference doesn't mean how many thread attach to this
+	 * autogroup now. It just stands for the number of task
+	 * could use this autogroup.
+	 */
 	struct kref		kref;
 	struct task_group	*tg;
 	struct rw_semaphore	lock;
-- 
cgit v0.10.2


From 511f67a5997c4967c69a3961e2fc9f04d8d244ac Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 22 Feb 2011 15:02:00 +0100
Subject: sched, autogroup: Stop claiming ownership of the root task group

Disown it, and only display autogroup association if one exists.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Reviewed-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1298383320.8036.5.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 137a096..5946ac5 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr;
 static void __init autogroup_init(struct task_struct *init_task)
 {
 	autogroup_default.tg = &root_task_group;
-	root_task_group.autogroup = &autogroup_default;
 	kref_init(&autogroup_default.kref);
 	init_rwsem(&autogroup_default.lock);
 	init_task->signal->autogroup = &autogroup_default;
@@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 
 static inline bool task_group_is_autogroup(struct task_group *tg)
 {
-	return tg != &root_task_group && tg->autogroup;
+	return !!tg->autogroup;
 }
 
 static inline struct task_group *
@@ -251,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
 {
 	struct autogroup *ag = autogroup_task_get(p);
 
+	if (!task_group_is_autogroup(ag->tg))
+		goto out;
+
 	down_read(&ag->lock);
 	seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
 	up_read(&ag->lock);
 
+out:
 	autogroup_kref_put(ag);
 }
 #endif /* CONFIG_PROC_FS */
@@ -262,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
 #ifdef CONFIG_SCHED_DEBUG
 static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
-	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
-
-	if (!enabled || !tg->autogroup)
+	if (!task_group_is_autogroup(tg))
 		return 0;
 
 	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
-- 
cgit v0.10.2


From 3f7cce3c18188a067d463749168bdda5abc5b0f7 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Fri, 18 Feb 2011 14:40:01 +0200
Subject: perf_events: Fix rcu and locking issues with cgroup support

This patches ensures that we do not end up calling
perf_cgroup_from_task() when there is no cgroup event.
This avoids potential RCU and locking issues.

The change in perf_cgroup_set_timestamp() ensures we
check against ctx->nr_cgroups. It also avoids calling
perf_clock() tiwce in a row. It also ensures we do need
to grab ctx->lock before calling the function.

We drop update_cgrp_time() from task_clock_event_read()
because it is not needed. This also avoids having to
deal with perf_cgroup_from_task().

Thanks to Peter Zijlstra for his help on this.

Signed-off-by: Stephane Eranian <eranian@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d5e76b8.815bdf0a.7ac3.774f@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a0a6987..dadeaea 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -201,6 +201,11 @@ __get_cpu_context(struct perf_event_context *ctx)
 
 #ifdef CONFIG_CGROUP_PERF
 
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ */
 static inline struct perf_cgroup *
 perf_cgroup_from_task(struct task_struct *task)
 {
@@ -268,28 +273,41 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 
 static inline void update_cgrp_time_from_event(struct perf_event *event)
 {
-	struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
+	struct perf_cgroup *cgrp;
+
 	/*
-	 * do not update time when cgroup is not active
+	 * ensure we access cgroup data only when needed and
+	 * when we know the cgroup is pinned (css_get)
 	 */
-	if (!event->cgrp || cgrp != event->cgrp)
+	if (!is_cgroup_event(event))
 		return;
 
-	__update_cgrp_time(event->cgrp);
+	cgrp = perf_cgroup_from_task(current);
+	/*
+	 * Do not update time when cgroup is not active
+	 */
+	if (cgrp == event->cgrp)
+		__update_cgrp_time(event->cgrp);
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+perf_cgroup_set_timestamp(struct task_struct *task,
+			  struct perf_event_context *ctx)
 {
 	struct perf_cgroup *cgrp;
 	struct perf_cgroup_info *info;
 
-	if (!task)
+	/*
+	 * ctx->lock held by caller
+	 * ensure we do not access cgroup data
+	 * unless we have the cgroup pinned (css_get)
+	 */
+	if (!task || !ctx->nr_cgroups)
 		return;
 
 	cgrp = perf_cgroup_from_task(task);
 	info = this_cpu_ptr(cgrp->info);
-	info->timestamp = now;
+	info->timestamp = ctx->timestamp;
 }
 
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
@@ -494,7 +512,8 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+perf_cgroup_set_timestamp(struct task_struct *task,
+			  struct perf_event_context *ctx)
 {
 }
 
@@ -1613,7 +1632,7 @@ static int __perf_event_enable(void *info)
 	/*
 	 * set current task's cgroup time reference point
 	 */
-	perf_cgroup_set_timestamp(current, perf_clock());
+	perf_cgroup_set_timestamp(current, ctx);
 
 	__perf_event_mark_enabled(event, ctx);
 
@@ -2048,7 +2067,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 
 	now = perf_clock();
 	ctx->timestamp = now;
-	perf_cgroup_set_timestamp(task, now);
+	perf_cgroup_set_timestamp(task, ctx);
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
@@ -5795,7 +5814,6 @@ static void task_clock_event_read(struct perf_event *event)
 
 	if (!in_nmi()) {
 		update_context_time(event->ctx);
-		update_cgrp_time_from_event(event);
 		time = event->ctx->time;
 	} else {
 		u64 now = perf_clock();
-- 
cgit v0.10.2


From 768a06e2ca49cdf72389208cfc056a36cf8bc5e3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 22 Feb 2011 16:52:24 +0100
Subject: perf: Simplify task_clock_event_read()

There is no point in us having different code paths for nmi and !nmi
here, so remove the !nmi one.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index dadeaea..64a018e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5810,16 +5810,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
 
 static void task_clock_event_read(struct perf_event *event)
 {
-	u64 time;
-
-	if (!in_nmi()) {
-		update_context_time(event->ctx);
-		time = event->ctx->time;
-	} else {
-		u64 now = perf_clock();
-		u64 delta = now - event->ctx->timestamp;
-		time = event->ctx->time + delta;
-	}
+	u64 now = perf_clock();
+	u64 delta = now - event->ctx->timestamp;
+	u64 time = event->ctx->time + delta;
 
 	task_clock_event_update(event, time);
 }
-- 
cgit v0.10.2


From 4e034b245133adfd006ade5d7a809c9cac4beef9 Mon Sep 17 00:00:00 2001
From: Henrik Kretzschmar <henne@nachtwindheim.de>
Date: Tue, 22 Feb 2011 15:38:03 +0100
Subject: x86: Move ioapic_irq_destination_types to apicdef.h

This enum is used by non IOAPIC code, so apicdef.h is
the best place for it.

Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
LKML-Reference: <1298385487-4708-1-git-send-email-henne@nachtwindheim.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 47a30ff..d87988b 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -426,4 +426,16 @@ struct local_apic {
 #else
  #define BAD_APICID 0xFFFFu
 #endif
+
+enum ioapic_irq_destination_types {
+	dest_Fixed		= 0,
+	dest_LowestPrio		= 1,
+	dest_SMI		= 2,
+	dest__reserved_1	= 3,
+	dest_NMI		= 4,
+	dest_INIT		= 5,
+	dest__reserved_2	= 6,
+	dest_ExtINT		= 7
+};
+
 #endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index f327d38..e1a9b0e 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -63,17 +63,6 @@ union IO_APIC_reg_03 {
 	} __attribute__ ((packed)) bits;
 };
 
-enum ioapic_irq_destination_types {
-	dest_Fixed = 0,
-	dest_LowestPrio = 1,
-	dest_SMI = 2,
-	dest__reserved_1 = 3,
-	dest_NMI = 4,
-	dest_INIT = 5,
-	dest__reserved_2 = 6,
-	dest_ExtINT = 7
-};
-
 struct IO_APIC_route_entry {
 	__u32	vector		:  8,
 		delivery_mode	:  3,	/* 000: FIXED
-- 
cgit v0.10.2


From b6a1432da81fa387d76215108dc9f6ea6d343aed Mon Sep 17 00:00:00 2001
From: Henrik Kretzschmar <henne@nachtwindheim.de>
Date: Tue, 22 Feb 2011 15:38:04 +0100
Subject: x86: Add dummy mp_save_irq()

This is a dummy function, used when no IOAPIC is compiled in.

Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
LKML-Reference: <1298385487-4708-2-git-send-email-henne@nachtwindheim.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index e1a9b0e..b24915f 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -188,6 +188,7 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; }
 struct io_apic_irq_attr;
 static inline int io_apic_set_pci_routing(struct device *dev, int irq,
 		 struct io_apic_irq_attr *irq_attr) { return 0; }
+static inline void mp_save_irq(struct mpc_intsrc *m) { };
 #endif
 
 #endif /* _ASM_X86_IO_APIC_H */
-- 
cgit v0.10.2


From 7167d08e780a722fa79ea414fc4e72bc00751392 Mon Sep 17 00:00:00 2001
From: Henrik Kretzschmar <henne@nachtwindheim.de>
Date: Tue, 22 Feb 2011 15:38:05 +0100
Subject: x86: Rework arch_disable_smp_support() for x86

Currently arch_disable_smp_support() on x86 disables only the
support for the IOAPIC and is also compiled in if SMP-support is
not.

Therefore this function is renamed to disable_ioapic_support(),
which meets its purpose and is only compiled in the kernel
when IOAPIC support is also.

A new arch_disable_smp_support() is created in smpboot.c,
which calls disable_ioapic_support() and gets only compiled
in the kernel when SMP support is also.

Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
LKML-Reference: <1298385487-4708-3-git-send-email-henne@nachtwindheim.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index b24915f..0be2f27 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -175,6 +175,8 @@ extern void __init pre_init_apic_IRQ0(void);
 
 extern void mp_save_irq(struct mpc_intsrc *m);
 
+extern void disable_ioapic_support(void);
+
 #else  /* !CONFIG_X86_IO_APIC */
 
 #define io_apic_assign_pci_irqs 0
@@ -189,6 +191,7 @@ struct io_apic_irq_attr;
 static inline int io_apic_set_pci_routing(struct device *dev, int irq,
 		 struct io_apic_irq_attr *irq_attr) { return 0; }
 static inline void mp_save_irq(struct mpc_intsrc *m) { };
+static inline void disable_ioapic_support(void) { }
 #endif
 
 #endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 76b96d7..96e6809 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -43,6 +43,7 @@
 #include <asm/i8259.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
+#include <asm/io_apic.h>
 #include <asm/desc.h>
 #include <asm/hpet.h>
 #include <asm/idle.h>
@@ -1209,7 +1210,7 @@ void __cpuinit setup_local_APIC(void)
 		rdtscll(tsc);
 
 	if (disable_apic) {
-		arch_disable_smp_support();
+		disable_ioapic_support();
 		return;
 	}
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ca9e2a35..a2f2bf8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -108,7 +108,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
 int skip_ioapic_setup;
 
-void arch_disable_smp_support(void)
+/**
+ * disable_ioapic_support() - disables ioapic support at runtime
+ */
+void disable_ioapic_support(void)
 {
 #ifdef CONFIG_PCI
 	noioapicquirk = 1;
@@ -120,7 +123,7 @@ void arch_disable_smp_support(void)
 static int __init parse_noapic(char *str)
 {
 	/* disable IO-APIC */
-	arch_disable_smp_support();
+	disable_ioapic_support();
 	return 0;
 }
 early_param("noapic", parse_noapic);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 08776a9..09d0172 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -64,6 +64,7 @@
 #include <asm/mtrr.h>
 #include <asm/mwait.h>
 #include <asm/apic.h>
+#include <asm/io_apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
 #include <linux/mc146818rtc.h>
@@ -945,6 +946,14 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 	return 0;
 }
 
+/**
+ * arch_disable_smp_support() - disables SMP support for x86 at runtime
+ */
+void arch_disable_smp_support(void)
+{
+	disable_ioapic_support();
+}
+
 /*
  * Fall back to non SMP mode after errors.
  *
@@ -1045,7 +1054,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
 				"(tell your hw vendor)\n");
 		}
 		smpboot_clear_io_apic();
-		arch_disable_smp_support();
+		disable_ioapic_support();
 		return -1;
 	}
 
-- 
cgit v0.10.2


From 7d0f1926131cf79aa5998d463bf1582156e7b41e Mon Sep 17 00:00:00 2001
From: Henrik Kretzschmar <henne@nachtwindheim.de>
Date: Tue, 22 Feb 2011 15:38:06 +0100
Subject: x86: Add dummy functions for compiling without IOAPIC

This patch adds IOAPIC dummy functions for compilation
with local APIC, but without IOAPIC.

The local variable ioapic_entries in enable_IR_x2apic()
does not need initialization anymore, since the dummy
returns NULL.

Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
LKML-Reference: <1298385487-4708-4-git-send-email-henne@nachtwindheim.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 0be2f27..56dcf08 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -190,6 +190,24 @@ static inline int mp_find_ioapic(u32 gsi) { return 0; }
 struct io_apic_irq_attr;
 static inline int io_apic_set_pci_routing(struct device *dev, int irq,
 		 struct io_apic_irq_attr *irq_attr) { return 0; }
+
+static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void)
+{
+	return NULL;
+}
+
+static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { }
+static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent)
+{
+	return -ENOMEM;
+}
+
+static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { }
+static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent)
+{
+	return -ENOMEM;
+}
+
 static inline void mp_save_irq(struct mpc_intsrc *m) { };
 static inline void disable_ioapic_support(void) { }
 #endif
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 96e6809..f0e0798 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1449,7 +1449,7 @@ int __init enable_IR(void)
 void __init enable_IR_x2apic(void)
 {
 	unsigned long flags;
-	struct IO_APIC_route_entry **ioapic_entries = NULL;
+	struct IO_APIC_route_entry **ioapic_entries;
 	int ret, x2apic_enabled = 0;
 	int dmar_table_init_ret;
 
-- 
cgit v0.10.2


From 1444e0c9daf0d3472677efc15588b192fc2db761 Mon Sep 17 00:00:00 2001
From: Henrik Kretzschmar <henne@nachtwindheim.de>
Date: Tue, 22 Feb 2011 15:38:07 +0100
Subject: x86: Fix deps of X86_UP_IOAPIC

Since commit 7cd92366a593246650cc7d6198e2c7d3af8c1d8a
lAPIC enabled accidently the IOAPIC, which now gets fixed.

Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
LKML-Reference: <1298385487-4708-5-git-send-email-henne@nachtwindheim.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d..867a8cf 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -811,7 +811,7 @@ config X86_LOCAL_APIC
 
 config X86_IO_APIC
 	def_bool y
-	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
 
 config X86_VISWS_APIC
 	def_bool y
-- 
cgit v0.10.2


From 939d578ecc62b07efeb186576ab190fe0b766501 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 11:46:01 +0100
Subject: x86: OLPC: Make OLPC=n build again

Stupid me missed the functions called from setup.c. Add the stubs back
for OLPC=n

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 28a7fec..c5d3a5a 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -6,6 +6,8 @@
 
 #define OLPC_OFW_SIG 0x2057464F	/* aka "OFW " */
 
+#ifdef CONFIG_OLPC
+
 extern bool olpc_ofw_is_installed(void);
 
 /* run an OFW command by calling into the firmware */
@@ -24,6 +26,11 @@ extern void setup_olpc_ofw_pgd(void);
 /* check if OFW was detected during boot */
 extern bool olpc_ofw_present(void);
 
+#else /* !CONFIG_OLPC */
+static inline void olpc_ofw_detect(void) { }
+static inline void setup_olpc_ofw_pgd(void) { }
+#endif /* !CONFIG_OLPC */
+
 #ifdef CONFIG_OF_PROMTREE
 extern void olpc_dt_build_devicetree(void);
 #else
-- 
cgit v0.10.2


From 170ae6bc24e1d7f9bd921a484ec9ea2825497970 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 23 Feb 2011 11:08:59 -0300
Subject: perf annotate: Show better message when no vmlinux is found

In both --tui and --stdio, in 'annotate', 'top', 'report' when trying to
annotate a kernel symbol having just access to a kallsyms file, that
doesn't have the DWARF info needed for annotation.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 70ec422..0d0830c 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -295,12 +295,23 @@ fallback:
 	}
 
 	if (dso->origin == DSO__ORIG_KERNEL) {
+		char bf[BUILD_ID_SIZE * 2 + 16] = " with build id ";
+		char *build_id_msg = NULL;
+
 		if (dso->annotate_warned)
 			goto out_free_filename;
+
+		if (dso->has_build_id) {
+			build_id__sprintf(dso->build_id,
+					  sizeof(dso->build_id), bf + 15);
+			build_id_msg = bf;
+		}
 		err = -ENOENT;
 		dso->annotate_warned = 1;
-		pr_err("Can't annotate %s: No vmlinux file was found in the "
-		       "path\n", sym->name);
+		pr_err("Can't annotate %s: No vmlinux file%s was found in the "
+		       "path.\nPlease use 'perf buildid-cache -av vmlinux' or "
+		       "--vmlinux vmlinux.\n",
+		       sym->name, build_id_msg ?: "");
 		goto out_free_filename;
 	}
 
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index 294b495..497b3c4 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -924,14 +924,6 @@ int hists__browse(struct hists *self, const char *helpline,
 		if (choice == annotate) {
 			struct hist_entry *he;
 do_annotate:
-			if (browser->selection->map->dso->origin == DSO__ORIG_KERNEL) {
-				browser->selection->map->dso->annotate_warned = 1;
-				ui_helpline__puts("No vmlinux file found, can't "
-						 "annotate with just a "
-						 "kallsyms file");
-				continue;
-			}
-
 			he = hist_browser__selected_entry(browser);
 			if (he == NULL)
 				continue;
-- 
cgit v0.10.2


From c8d6b8fe72216ca47e399204b58c8be0448d4083 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 14:29:34 +0100
Subject: x86: ioapic: Remove silly debug bloat in setup_IOAPIC_irqs()

This is debug code and it does not matter at all whether we print each
not connected pin in an extra line or try to be extra clever.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a2f2bf8..e33ccb4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1388,9 +1388,19 @@ static struct {
 	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
 } mp_ioapic_routing[MAX_IO_APICS];
 
+static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
+{
+	if (idx != -1)
+		return false;
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
+		    mp_ioapics[apic_id].apicid, pin);
+	return true;
+}
+
 static void __init setup_IO_APIC_irqs(void)
 {
-	int apic_id, pin, idx, irq, notcon = 0;
+	int apic_id, pin, idx, irq;
 	int node = cpu_to_node(0);
 	struct irq_cfg *cfg;
 
@@ -1399,22 +1409,8 @@ static void __init setup_IO_APIC_irqs(void)
 	for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
 	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
 		idx = find_irq_entry(apic_id, pin, mp_INT);
-		if (idx == -1) {
-			if (!notcon) {
-				notcon = 1;
-				apic_printk(APIC_VERBOSE,
-					KERN_DEBUG " %d-%d",
-					mp_ioapics[apic_id].apicid, pin);
-			} else
-				apic_printk(APIC_VERBOSE, " %d-%d",
-					mp_ioapics[apic_id].apicid, pin);
+		if (io_apic_pin_not_connected(idx, apic_id, pin))
 			continue;
-		}
-		if (notcon) {
-			apic_printk(APIC_VERBOSE,
-				" (apicid-pin) not connected\n");
-			notcon = 0;
-		}
 
 		irq = pin_2_irq(idx, apic_id, pin);
 
@@ -1441,10 +1437,6 @@ static void __init setup_IO_APIC_irqs(void)
 		setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
 				  irq_polarity(idx));
 	}
-
-	if (notcon)
-		apic_printk(APIC_VERBOSE,
-			" (apicid-pin) not connected\n");
 }
 
 /*
-- 
cgit v0.10.2


From ed972ccf434a9881a5881915ae04602af2776bad Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 14:31:36 +0100
Subject: x86: ioapic: Split out the nested loop in setup_IO_APIC_irqs()

Two consecutive

    for(...)
    for(...)

lines to avoid an extra indentation are just horrible to read. I had
to look more than once to figure out what the code is doing.

Split out the inner loop into a separate function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e33ccb4..f751a82 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1398,15 +1398,12 @@ static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
 	return true;
 }
 
-static void __init setup_IO_APIC_irqs(void)
+static void __init __io_apic_setup_irqs(unsigned int apic_id)
 {
-	int apic_id, pin, idx, irq;
-	int node = cpu_to_node(0);
+	int idx, node = cpu_to_node(0);
+	unsigned int pin, irq;
 	struct irq_cfg *cfg;
 
-	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
-	for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
 	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
 		idx = find_irq_entry(apic_id, pin, mp_INT);
 		if (io_apic_pin_not_connected(idx, apic_id, pin))
@@ -1439,6 +1436,16 @@ static void __init setup_IO_APIC_irqs(void)
 	}
 }
 
+static void __init setup_IO_APIC_irqs(void)
+{
+	unsigned int apic_id;
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+	for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
+		__io_apic_setup_irqs(apic_id);
+}
+
 /*
  * for the gsit that is not in first ioapic
  * but could not use acpi_register_gsi()
-- 
cgit v0.10.2


From ff973d041e5ab9ada9e49f4e93ef3a699c511463 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 13:00:56 +0100
Subject: x86: ioapic: Add io_apic_setup_irq_pin()

There are about four places in the ioapic code which do exactly the
same setup sequence. Also the OF based ioapic setup needs that
function to avoid putting the OF specific code into ioapic.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 56dcf08..37dbb3f 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -151,6 +151,8 @@ void setup_IO_APIC_irq_extra(u32 gsi);
 extern void ioapic_and_gsi_init(void);
 extern void ioapic_insert_resources(void);
 
+int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr);
+
 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
 extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
 extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f751a82..6deb3ca 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3607,6 +3607,21 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 }
 #endif /* CONFIG_HT_IRQ */
 
+int
+io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
+{
+	struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
+	int ret;
+
+	if (!cfg)
+		return -EINVAL;
+	ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
+	if (!ret)
+		setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
+				 attr->trigger, attr->polarity);
+	return ret;
+}
+
 int __init io_apic_get_redir_entries (int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
-- 
cgit v0.10.2


From f880ec78fabebc58180778d223600e9be7b48502 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 13:07:54 +0100
Subject: x86: ioapic: Use new setup function in pre_init_apic_IRQ0()

Remove the duplicated code and call the function. It does not matter
whether we allocated the cfg before calling setup_local_APIC() and we
can set the irq chip and handler after that as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6deb3ca..51c8bd1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4103,20 +4103,15 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 /* Enable IOAPIC early just for system timer */
 void __init pre_init_apic_IRQ0(void)
 {
-	struct irq_cfg *cfg;
+	struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
 
 	printk(KERN_INFO "Early APIC setup for system timer0\n");
 #ifndef CONFIG_SMP
 	physid_set_mask_of_physid(boot_cpu_physical_apicid,
 					 &phys_cpu_present_map);
 #endif
-	/* Make sure the irq descriptor is set up */
-	cfg = alloc_irq_and_cfg_at(0, 0);
-
 	setup_local_APIC();
 
-	add_pin_to_irq_node(cfg, 0, 0, 0);
+	io_apic_setup_irq_pin(0, 0, &attr);
 	set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
-
-	setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
 }
-- 
cgit v0.10.2


From e0799c04b2080e0832538a911361f962c93fb744 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 14:10:54 +0100
Subject: x86: ioapic: Use setup function in __io_apic_set_pci_routing()

The only difference here is that we did not call
__add_pin_to_irq_node() for the legacy irqs, but that's not worth 30
lines of extra code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 51c8bd1..a655bd8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3679,45 +3679,17 @@ int __init arch_probe_nr_irqs(void)
 static int __io_apic_set_pci_routing(struct device *dev, int irq,
 				struct io_apic_irq_attr *irq_attr)
 {
-	struct irq_cfg *cfg;
 	int node;
-	int ioapic, pin;
-	int trigger, polarity;
 
-	ioapic = irq_attr->ioapic;
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-			ioapic);
+			    irq_attr->ioapic);
 		return -EINVAL;
 	}
 
-	if (dev)
-		node = dev_to_node(dev);
-	else
-		node = cpu_to_node(0);
-
-	cfg = alloc_irq_and_cfg_at(irq, node);
-	if (!cfg)
-		return 0;
-
-	pin = irq_attr->ioapic_pin;
-	trigger = irq_attr->trigger;
-	polarity = irq_attr->polarity;
-
-	/*
-	 * IRQs < 16 are already in the irq_2_pin[] map
-	 */
-	if (irq >= legacy_pic->nr_legacy_irqs) {
-		if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
-			printk(KERN_INFO "can not add pin %d for irq %d\n",
-				pin, irq);
-			return 0;
-		}
-	}
-
-	setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
+	node = dev ? dev_to_node(dev) : cpu_to_node(0);
 
-	return 0;
+	return io_apic_setup_irq_pin(irq, node, irq_attr);
 }
 
 int io_apic_set_pci_routing(struct device *dev, int irq,
-- 
cgit v0.10.2


From 2d57e37dbf648fd6547752b8954f4104a85f4b15 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 14:40:35 +0100
Subject: x86: ioapic: Use setup function in __io_apic_setup_irqs()

Replace the duplicated code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a655bd8..63140d8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1401,8 +1401,8 @@ static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
 static void __init __io_apic_setup_irqs(unsigned int apic_id)
 {
 	int idx, node = cpu_to_node(0);
+	struct io_apic_irq_attr attr;
 	unsigned int pin, irq;
-	struct irq_cfg *cfg;
 
 	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
 		idx = find_irq_entry(apic_id, pin, mp_INT);
@@ -1419,20 +1419,13 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id)
 		 * installed and if it returns 1:
 		 */
 		if (apic->multi_timer_check &&
-				apic->multi_timer_check(apic_id, irq))
+		    apic->multi_timer_check(apic_id, irq))
 			continue;
 
-		cfg = alloc_irq_and_cfg_at(irq, node);
-		if (!cfg)
-			continue;
+		set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
+				     irq_polarity(idx));
 
-		add_pin_to_irq_node(cfg, node, apic_id, pin);
-		/*
-		 * don't mark it in pin_programmed, so later acpi could
-		 * set it correctly when irq < 16
-		 */
-		setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
-				  irq_polarity(idx));
+		io_apic_setup_irq_pin(irq, node, &attr);
 	}
 }
 
-- 
cgit v0.10.2


From da1ad9d7b2477594e8ff43706644ba8a375ad62a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 14:52:16 +0100
Subject: x86: ioapic: Use setup function in setup_IO_APIC_irq_extra()

Another version of the same thing. Only set the pin programmed, when
the setup function succeeds.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 63140d8..cfd9611 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1447,7 +1447,7 @@ static void __init setup_IO_APIC_irqs(void)
 void setup_IO_APIC_irq_extra(u32 gsi)
 {
 	int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
-	struct irq_cfg *cfg;
+	struct io_apic_irq_attr attr;
 
 	/*
 	 * Convert 'gsi' to 'ioapic.pin'.
@@ -1467,21 +1467,17 @@ void setup_IO_APIC_irq_extra(u32 gsi)
 	if (apic_id == 0 || irq < NR_IRQS_LEGACY)
 		return;
 
-	cfg = alloc_irq_and_cfg_at(irq, node);
-	if (!cfg)
-		return;
-
-	add_pin_to_irq_node(cfg, node, apic_id, pin);
-
 	if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
 		pr_debug("Pin %d-%d already programmed\n",
 			 mp_ioapics[apic_id].apicid, pin);
 		return;
 	}
-	set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
 
-	setup_ioapic_irq(apic_id, pin, irq, cfg,
-			irq_trigger(idx), irq_polarity(idx));
+	set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
+			     irq_polarity(idx));
+
+	if (!io_apic_setup_irq_pin(irq, node, &attr))
+		set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
 }
 
 /*
-- 
cgit v0.10.2


From 41098ffe050c4befe5fc21a5cedd42ebbd6f7469 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 16:08:03 +0100
Subject: x86: ioapic: Make a few functions static

No users outside of io_apic.c. Mark bad_ioapic() __init while at it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 37dbb3f..be61e22 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -139,11 +139,6 @@ extern int timer_through_8259;
 #define io_apic_assign_pci_irqs \
 	(mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
 
-extern u8 io_apic_unique_id(u8 id);
-extern int io_apic_get_unique_id(int ioapic, int apic_id);
-extern int io_apic_get_version(int ioapic);
-extern int io_apic_get_redir_entries(int ioapic);
-
 struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
 		 struct io_apic_irq_attr *irq_attr);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index cfd9611..7344b42 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3611,7 +3611,7 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
 	return ret;
 }
 
-int __init io_apic_get_redir_entries (int ioapic)
+static int __init io_apic_get_redir_entries(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
@@ -3702,31 +3702,8 @@ int io_apic_set_pci_routing(struct device *dev, int irq,
 	return __io_apic_set_pci_routing(dev, irq, irq_attr);
 }
 
-u8 __init io_apic_unique_id(u8 id)
-{
-#ifdef CONFIG_X86_32
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
-	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-		return io_apic_get_unique_id(nr_ioapics, id);
-	else
-		return id;
-#else
-	int i;
-	DECLARE_BITMAP(used, 256);
-
-	bitmap_zero(used, 256);
-	for (i = 0; i < nr_ioapics; i++) {
-		struct mpc_ioapic *ia = &mp_ioapics[i];
-		__set_bit(ia->apicid, used);
-	}
-	if (!test_bit(id, used))
-		return id;
-	return find_first_zero_bit(used, 256);
-#endif
-}
-
 #ifdef CONFIG_X86_32
-int __init io_apic_get_unique_id(int ioapic, int apic_id)
+static int __init io_apic_get_unique_id(int ioapic, int apic_id)
 {
 	union IO_APIC_reg_00 reg_00;
 	static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -3799,9 +3776,33 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
 
 	return apic_id;
 }
+
+static u8 __init io_apic_unique_id(u8 id)
+{
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return io_apic_get_unique_id(nr_ioapics, id);
+	else
+		return id;
+}
+#else
+static u8 __init io_apic_unique_id(u8 id)
+{
+	int i;
+	DECLARE_BITMAP(used, 256);
+
+	bitmap_zero(used, 256);
+	for (i = 0; i < nr_ioapics; i++) {
+		struct mpc_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->apicid, used);
+	}
+	if (!test_bit(id, used))
+		return id;
+	return find_first_zero_bit(used, 256);
+}
 #endif
 
-int __init io_apic_get_version(int ioapic)
+static int __init io_apic_get_version(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
@@ -4004,7 +4005,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
 	return gsi - mp_gsi_routing[ioapic].gsi_base;
 }
 
-static int bad_ioapic(unsigned long address)
+static __init int bad_ioapic(unsigned long address)
 {
 	if (nr_ioapics >= MAX_IO_APICS) {
 		printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded "
-- 
cgit v0.10.2


From b77cf6a8609a8450786c572bc8af6ad068022dbe Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 17:33:53 +0100
Subject: x86: ioapic: Remove useless inlines

There is no point to have irq_trigger() and irq_polarity() as wrappers
around the MPBIOS_* camel case functions. Get rid of both the inlines
and the ugly camel case.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7344b42..2d49e4b4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -821,7 +821,7 @@ static int EISA_ELCR(unsigned int irq)
 #define default_MCA_trigger(idx)	(1)
 #define default_MCA_polarity(idx)	default_ISA_polarity(idx)
 
-static int MPBIOS_polarity(int idx)
+static int irq_polarity(int idx)
 {
 	int bus = mp_irqs[idx].srcbus;
 	int polarity;
@@ -863,7 +863,7 @@ static int MPBIOS_polarity(int idx)
 	return polarity;
 }
 
-static int MPBIOS_trigger(int idx)
+static int irq_trigger(int idx)
 {
 	int bus = mp_irqs[idx].srcbus;
 	int trigger;
@@ -935,16 +935,6 @@ static int MPBIOS_trigger(int idx)
 	return trigger;
 }
 
-static inline int irq_polarity(int idx)
-{
-	return MPBIOS_polarity(idx);
-}
-
-static inline int irq_trigger(int idx)
-{
-	return MPBIOS_trigger(idx);
-}
-
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq;
-- 
cgit v0.10.2


From 710dcda64369e3f3704a0eee502ce27dbf9fedc1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 17:47:41 +0100
Subject: x86: ioapic: Implement and use io_apic_setup_irq_pin_once()

io_apic_set_pci_routing() and mp_save_irq() check the pin_programmed
bit before calling io_apic_setup_irq_pin() and set the bit when the
pin was setup.

Move that duplicated code into a separate function and use it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 2d49e4b4..46913ef 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -128,6 +128,9 @@ static int __init parse_noapic(char *str)
 }
 early_param("noapic", parse_noapic);
 
+static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
+				      struct io_apic_irq_attr *attr);
+
 /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
 void mp_save_irq(struct mpc_intsrc *m)
 {
@@ -1457,17 +1460,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
 	if (apic_id == 0 || irq < NR_IRQS_LEGACY)
 		return;
 
-	if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
-		pr_debug("Pin %d-%d already programmed\n",
-			 mp_ioapics[apic_id].apicid, pin);
-		return;
-	}
-
 	set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
 			     irq_polarity(idx));
 
-	if (!io_apic_setup_irq_pin(irq, node, &attr))
-		set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+	io_apic_setup_irq_pin_once(irq, node, &attr);
 }
 
 /*
@@ -3601,6 +3597,24 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
 	return ret;
 }
 
+static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
+				      struct io_apic_irq_attr *attr)
+{
+	unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
+	int ret;
+
+	/* Avoid redundant programming */
+	if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) {
+		pr_debug("Pin %d-%d already programmed\n",
+			 mp_ioapics[id].apicid, pin);
+		return 0;
+	}
+	ret = io_apic_setup_irq_pin(irq, node, attr);
+	if (!ret)
+		set_bit(pin, mp_ioapic_routing[id].pin_programmed);
+	return ret;
+}
+
 static int __init io_apic_get_redir_entries(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
@@ -3655,8 +3669,8 @@ int __init arch_probe_nr_irqs(void)
 }
 #endif
 
-static int __io_apic_set_pci_routing(struct device *dev, int irq,
-				struct io_apic_irq_attr *irq_attr)
+int io_apic_set_pci_routing(struct device *dev, int irq,
+			    struct io_apic_irq_attr *irq_attr)
 {
 	int node;
 
@@ -3668,28 +3682,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
 
 	node = dev ? dev_to_node(dev) : cpu_to_node(0);
 
-	return io_apic_setup_irq_pin(irq, node, irq_attr);
-}
-
-int io_apic_set_pci_routing(struct device *dev, int irq,
-				struct io_apic_irq_attr *irq_attr)
-{
-	int ioapic, pin;
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
-	ioapic = irq_attr->ioapic;
-	pin = irq_attr->ioapic_pin;
-	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		pr_debug("Pin %d-%d already programmed\n",
-			 mp_ioapics[ioapic].apicid, pin);
-		return 0;
-	}
-	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
-
-	return __io_apic_set_pci_routing(dev, irq, irq_attr);
+	return io_apic_setup_irq_pin_once(irq, node, irq_attr);
 }
 
 #ifdef CONFIG_X86_32
-- 
cgit v0.10.2


From abb0052289e58140d933b29491f59e4be0a19727 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 19:54:53 +0100
Subject: x86: ioapic: Move trigger defines to io_apic.h

Required for devicetree based io_apic configuration.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index be61e22..c4bd267 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -95,6 +95,10 @@ struct IR_IO_APIC_route_entry {
 		index		: 15;
 } __attribute__ ((packed));
 
+#define IOAPIC_AUTO     -1
+#define IOAPIC_EDGE     0
+#define IOAPIC_LEVEL    1
+
 #ifdef CONFIG_X86_IO_APIC
 
 /*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 46913ef..8d23e83 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1216,10 +1216,6 @@ void __setup_vector_irq(int cpu)
 static struct irq_chip ioapic_chip;
 static struct irq_chip ir_ioapic_chip;
 
-#define IOAPIC_AUTO     -1
-#define IOAPIC_EDGE     0
-#define IOAPIC_LEVEL    1
-
 #ifdef CONFIG_X86_32
 static inline int IO_APIC_irq_trigger(int irq)
 {
-- 
cgit v0.10.2


From f1c2b357148ec27fcc6ce0992211209a0ea20d8f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:36 +0100
Subject: x86: e820: Remove conditional early mapping in parse_e820_ext

This patch ensures that the memory passed from parse_setup_data() is
large enough to cover the complete data structure. That means that the
conditional mapping in parse_e820_ext() can go.

While here, I also attempt not to map two pages if the address is not
aligned to a page boundary.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
LKML-Reference: <1298405266-1624-2-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index e99d55d..908b969 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -96,7 +96,7 @@ extern void e820_setup_gap(void);
 extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
 			unsigned long start_addr, unsigned long long end_addr);
 struct setup_data;
-extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
+extern void parse_e820_ext(struct setup_data *data);
 
 #if defined(CONFIG_X86_64) || \
 	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 294f26d..5fad626 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -667,21 +667,15 @@ __init void e820_setup_gap(void)
  * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
  * linked list of struct setup_data, which is parsed here.
  */
-void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
+void __init parse_e820_ext(struct setup_data *sdata)
 {
-	u32 map_len;
 	int entries;
 	struct e820entry *extmap;
 
 	entries = sdata->len / sizeof(struct e820entry);
-	map_len = sdata->len + sizeof(struct setup_data);
-	if (map_len > PAGE_SIZE)
-		sdata = early_ioremap(pa_data, map_len);
 	extmap = (struct e820entry *)(sdata->data);
 	__append_e820_map(extmap, entries);
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-	if (map_len > PAGE_SIZE)
-		early_iounmap(sdata, map_len);
 	printk(KERN_INFO "extended physical RAM map:\n");
 	e820_print_map("extended");
 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca2f106..9521483 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -429,16 +429,27 @@ static void __init parse_setup_data(void)
 		return;
 	pa_data = boot_params.hdr.setup_data;
 	while (pa_data) {
-		data = early_memremap(pa_data, PAGE_SIZE);
+		u32 data_len, map_len;
+
+		map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
+			      (u64)sizeof(struct setup_data));
+		data = early_memremap(pa_data, map_len);
+		data_len = data->len + sizeof(struct setup_data);
+		if (data_len > map_len) {
+			early_iounmap(data, map_len);
+			data = early_memremap(pa_data, data_len);
+			map_len = data_len;
+		}
+
 		switch (data->type) {
 		case SETUP_E820_EXT:
-			parse_e820_ext(data, pa_data);
+			parse_e820_ext(data);
 			break;
 		default:
 			break;
 		}
 		pa_data = data->next;
-		early_iounmap(data, PAGE_SIZE);
+		early_iounmap(data, map_len);
 	}
 }
 
-- 
cgit v0.10.2


From da6b737b9ab768dd06bb4b0395131d10e524cf83 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:37 +0100
Subject: x86: Add device tree support

This patch adds minimal support for device tree on x86. The device
tree blob is passed to the kernel via setup_data which requires at
least boot protocol 2.09.

Memory size, restricted memory regions, boot arguments are gathered
the traditional way so things like cmd_line are just here to let the
code compile.

The current plan is use the device tree as an extension and to gather
information which can not be enumerated and would have to be hardcoded
otherwise. This includes things like
   - which devices are on this I2C/SPI bus?
   - how are the interrupts wired to IO APIC?
   - where could my hpet be?

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
LKML-Reference: <1298405266-1624-3-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt
index 28b1c9d..55fd262 100644
--- a/Documentation/devicetree/booting-without-of.txt
+++ b/Documentation/devicetree/booting-without-of.txt
@@ -13,6 +13,7 @@ Table of Contents
 
   I - Introduction
     1) Entry point for arch/powerpc
+    2) Entry point for arch/x86
 
   II - The DT block format
     1) Header
@@ -225,6 +226,25 @@ it with special cases.
   cannot support both configurations with Book E and configurations
   with classic Powerpc architectures.
 
+2) Entry point for arch/x86
+-------------------------------
+
+  There is one single 32bit entry point to the kernel at code32_start,
+  the decompressor (the real mode entry point goes to the same  32bit
+  entry point once it switched into protected mode). That entry point
+  supports one calling convention which is documented in
+  Documentation/x86/boot.txt
+  The physical pointer to the device-tree block (defined in chapter II)
+  is passed via setup_data which requires at least boot protocol 2.09.
+  The type filed is defined as
+
+  #define SETUP_DTB                      2
+
+  This device-tree is used as an extension to the "boot page". As such it
+  does not parse / consider data which is already covered by the boot
+  page. This includes memory size, reserved ranges, command line arguments
+  or initrd address. It simply holds information which can not be retrieved
+  otherwise like interrupt routing or a list of devices behind an I2C bus.
 
 II - The DT block format
 ========================
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index efbae0c..b4c2e9c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -382,6 +382,8 @@ config X86_INTEL_CE
 	depends on X86_32
 	depends on X86_EXTENDED_PLATFORM
 	select X86_REBOOTFIXUPS
+	select OF
+	select OF_EARLY_FLATTREE
 	---help---
 	  Select for the Intel CE media processor (CE4100) SOC.
 	  This option compiles in support for the CE4100 SOC for settop
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index c8bfe63..e020d88 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -12,6 +12,7 @@
 /* setup data types */
 #define SETUP_NONE			0
 #define SETUP_E820_EXT			1
+#define SETUP_DTB			2
 
 /* extensible setup data list node */
 struct setup_data {
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index c704b38..ba870bb 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,9 +10,6 @@
 #include <asm/apicdef.h>
 #include <asm/irq_vectors.h>
 
-/* Even though we don't support this, supply it to appease OF */
-static inline void irq_dispose_mapping(unsigned int virq) { }
-
 static inline int irq_canonicalize(int irq)
 {
 	return ((irq == 2) ? 9 : irq);
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index b4ec95f..e46f2a2 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -1 +1,47 @@
-/* dummy prom.h; here to make linux/of.h's #includes happy */
+/*
+ * Definitions for Device tree / OpenFirmware handling on X86
+ *
+ * based on arch/powerpc/include/asm/prom.h which is
+ *         Copyright (C) 1996-2005 Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ASM_X86_PROM_H
+#define _ASM_X86_PROM_H
+#ifndef __ASSEMBLY__
+
+#include <linux/of.h>
+#include <linux/types.h>
+
+#include <asm/irq.h>
+#include <asm/atomic.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_OF
+extern void add_dtb(u64 data);
+#else
+static inline void add_dtb(u64 data) { }
+#endif
+
+extern char cmd_line[COMMAND_LINE_SIZE];
+
+#define pci_address_to_pio pci_address_to_pio
+unsigned long pci_address_to_pio(phys_addr_t addr);
+
+/**
+ * irq_dispose_mapping - Unmap an interrupt
+ * @virq: linux virq number of the interrupt to unmap
+ *
+ * FIXME: We really should implement proper virq handling like power,
+ * but that's going to be major surgery.
+ */
+static inline void irq_dispose_mapping(unsigned int virq) { }
+
+#define HAVE_ARCH_DEVTREE_FIXUPS
+
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2..6ac5036 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -109,6 +109,7 @@ obj-$(CONFIG_MICROCODE)			+= microcode.o
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
 obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
+obj-$(CONFIG_OF)			+= devicetree.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 0000000..0b8f0da
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,51 @@
+/*
+ * Architecture specific OF callbacks.
+ */
+#include <linux/bootmem.h>
+#include <linux/io.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+
+char __initdata cmd_line[COMMAND_LINE_SIZE];
+
+unsigned int irq_create_of_mapping(struct device_node *controller,
+				   const u32 *intspec, unsigned int intsize)
+{
+	return intspec[0];
+
+}
+EXPORT_SYMBOL_GPL(irq_create_of_mapping);
+
+unsigned long pci_address_to_pio(phys_addr_t address)
+{
+	/*
+	 * The ioport address can be directly used by inX / outX
+	 */
+	BUG_ON(address >= (1 << 16));
+	return (unsigned long)address;
+}
+EXPORT_SYMBOL_GPL(pci_address_to_pio);
+
+void __init early_init_dt_scan_chosen_arch(unsigned long node)
+{
+	BUG();
+}
+
+void __init early_init_dt_add_memory_arch(u64 base, u64 size)
+{
+	BUG();
+}
+
+void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
+{
+	return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
+}
+
+void __init add_dtb(u64 data)
+{
+	initial_boot_params = phys_to_virt((u64) (u32) data +
+				offsetof(struct setup_data, data));
+}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 387b6a0..7531360 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -276,15 +276,6 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
 
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 
-#ifdef CONFIG_OF
-unsigned int irq_create_of_mapping(struct device_node *controller,
-		const u32 *intspec, unsigned int intsize)
-{
-	return intspec[0];
-}
-EXPORT_SYMBOL_GPL(irq_create_of_mapping);
-#endif
-
 #ifdef CONFIG_HOTPLUG_CPU
 /* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
 void fixup_irqs(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9521483..33dcbce 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -113,6 +113,7 @@
 #endif
 #include <asm/mce.h>
 #include <asm/alternative.h>
+#include <asm/prom.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -445,6 +446,9 @@ static void __init parse_setup_data(void)
 		case SETUP_E820_EXT:
 			parse_e820_ext(data);
 			break;
+		case SETUP_DTB:
+			add_dtb(pa_data);
+			break;
 		default:
 			break;
 		}
-- 
cgit v0.10.2


From df2634f43f5106947f3735a0b61a6527a4b278cd Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:38 +0100
Subject: x86: dtb: Add a device tree for CE4100

History:
v1..v2:
- dropped device_type except for cpu & pci. I have the compatible string
  for pci so I can drop the device_type once it is possible
- I lowercased all compatible types. I will need to resend some patches
  which have upper case intel
- The cpu had the same compatible string as the soc node. So I added to
  the soc node -immr for internel memory mapped registers.
- I added generic names for all parts.
- I reworked the i2c bars matching the way you suggested. I added a
  compatible node for the PCI device which only the PCI ids in its
  compatible string. The bars (each represents a complete i2c
  controller) have a "intel,ce4100-i2c-controller" compatible node. It
  is not used by the driver.
  The driver is probed via PCI ids (by the pci subsystem not OF) and
  matches the bar address against the ressource in the child node. Once
  there is a hit the node is attached.
- The SPI driver is also probed via pci. However I also attached a
  compatible property based on PCI ids

v2..v3:
- intel,ce4100-immr become intel,ce4100-cp. cp stands for core
  peripherals. The Atom data sheet talks here about ACPI devices. Since
  we don't have ACPI this does not apply here.
- The interrupt map is gone. There are now plenty of device nodes.
- The "unit address string" got fixed, it uses not DD,V format.

v3..v4:
- added descriptions for compatible nodes introduced here:
  - intel,ce4100-ioapic
  - intel,ce4100-lapic
  - intel,ce4100-hpet
  - intel,ce4100
  - intel,ce4100-cp
  - intel,ce4100-pci
- added a description about I2C controller magic.
- Added gpio-controller and gpio-cells property to gpio devices. Those
  properties are not (yet) used.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
LKML-Reference: <1298405266-1624-4-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt b/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt
new file mode 100644
index 0000000..569b162
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/ce4100-i2c.txt
@@ -0,0 +1,93 @@
+CE4100 I2C
+----------
+
+CE4100 has one PCI device which is described as the I2C-Controller. This
+PCI device has three PCI-bars, each bar contains a complete I2C
+controller. So we have a total of three independent I2C-Controllers
+which share only an interrupt line.
+The driver is probed via the PCI-ID and is gathering the information of
+attached devices from the devices tree.
+Grant Likely recommended to use the ranges property to map the PCI-Bar
+number to its physical address and to use this to find the child nodes
+of the specific I2C controller. This were his exact words:
+
+       Here's where the magic happens.  Each entry in
+       ranges describes how the parent pci address space
+       (middle group of 3) is translated to the local
+       address space (first group of 2) and the size of
+       each range (last cell).  In this particular case,
+       the first cell of the local address is chosen to be
+       1:1 mapped to the BARs, and the second is the
+       offset from be base of the BAR (which would be
+       non-zero if you had 2 or more devices mapped off
+       the same BAR)
+
+       ranges allows the address mapping to be described
+       in a way that the OS can interpret without
+       requiring custom device driver code.
+
+This is an example which is used on FalconFalls:
+------------------------------------------------
+	i2c-controller@b,2 {
+		#address-cells = <2>;
+		#size-cells = <1>;
+		compatible = "pci8086,2e68.2",
+				"pci8086,2e68",
+				"pciclass,ff0000",
+				"pciclass,ff00";
+
+		reg = <0x15a00 0x0 0x0 0x0 0x0>;
+		interrupts = <16 1>;
+
+		/* as described by Grant, the first number in the group of
+		* three is the bar number followed by the 64bit bar address
+		* followed by size of the mapping. The bar address
+		* requires also a valid translation in parents ranges
+		* property.
+		*/
+		ranges = <0 0   0x02000000 0 0xdffe0500 0x100
+			  1 0   0x02000000 0 0xdffe0600 0x100
+			  2 0   0x02000000 0 0xdffe0700 0x100>;
+
+		i2c@0 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "intel,ce4100-i2c-controller";
+
+			/* The first number in the reg property is the
+			* number of the bar
+			*/
+			reg = <0 0 0x100>;
+
+			/* This I2C controller has no devices */
+		};
+
+		i2c@1 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "intel,ce4100-i2c-controller";
+			reg = <1 0 0x100>;
+
+			/* This I2C controller has one gpio controller */
+			gpio@26 {
+				#gpio-cells = <2>;
+				compatible = "ti,pcf8575";
+				reg = <0x26>;
+				gpio-controller;
+			};
+		};
+
+		i2c@2 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "intel,ce4100-i2c-controller";
+			reg = <2 0 0x100>;
+
+			gpio@26 {
+				#gpio-cells = <2>;
+				compatible = "ti,pcf8575";
+				reg = <0x26>;
+				gpio-controller;
+			};
+		};
+	};
diff --git a/Documentation/devicetree/bindings/x86/ce4100.txt b/Documentation/devicetree/bindings/x86/ce4100.txt
new file mode 100644
index 0000000..b49ae59
--- /dev/null
+++ b/Documentation/devicetree/bindings/x86/ce4100.txt
@@ -0,0 +1,38 @@
+CE4100 Device Tree Bindings
+---------------------------
+
+The CE4100 SoC uses for in core peripherals the following compatible
+format: <vendor>,<chip>-<device>.
+Many of the "generic" devices like HPET or IO APIC have the ce4100
+name in their compatible property because they first appeared in this
+SoC.
+
+The CPU node
+------------
+	cpu@0 {
+		device_type = "cpu";
+		compatible = "intel,ce4100";
+		reg = <0>;
+		lapic = <&lapic0>;
+	};
+
+The reg property describes the CPU number. The lapic property points to
+the local APIC timer.
+
+The SoC node
+------------
+
+This node describes the in-core peripherals. Required property:
+  compatible = "intel,ce4100-cp";
+
+The PCI node
+------------
+This node describes the PCI bus on the SoC. Its property should be
+  compatible = "intel,ce4100-pci", "pci";
+
+If the OS is using the IO-APIC for interrupt routing then the reported
+interrupt numbers for devices is no longer true. In order to obtain the
+correct interrupt number, the child node which represents the device has
+to contain the interrupt property. Besides the interrupt property it has
+to contain at least the reg property containing the PCI bus address and
+compatible property according to "PCI Bus Binding Revision 2.1".
diff --git a/Documentation/devicetree/bindings/x86/interrupt.txt b/Documentation/devicetree/bindings/x86/interrupt.txt
new file mode 100644
index 0000000..8b0efb0
--- /dev/null
+++ b/Documentation/devicetree/bindings/x86/interrupt.txt
@@ -0,0 +1,29 @@
+Interrupt chips
+---------------
+
+* Intel I/O Advanced Programmable Interrupt Controller (IO APIC)
+
+  Required properties:
+  --------------------
+     compatible = "intel,ce4100-ioapic";
+     #interrupt-cells = <2>;
+
+  Device's interrupt property:
+
+     interrupts = <P S>;
+
+  The first number (P) represents the interrupt pin which is wired to the
+  IO APIC. The second number (S) represents the sense of interrupt which
+  should be configured and can be one of:
+    0 - Edge Rising
+    1 - Level Low
+    2 - Level High
+    3 - Edge Falling
+
+* Local APIC
+  Required property:
+
+     compatible = "intel,ce4100-lapic";
+
+  This node is currently unused by Linux as the address of the local APIC
+  read from a MSR.
diff --git a/Documentation/devicetree/bindings/x86/timer.txt b/Documentation/devicetree/bindings/x86/timer.txt
new file mode 100644
index 0000000..c688af5
--- /dev/null
+++ b/Documentation/devicetree/bindings/x86/timer.txt
@@ -0,0 +1,6 @@
+Timers
+------
+
+* High Precision Event Timer (HPET)
+  Required property:
+     compatible = "intel,ce4100-hpet";
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
new file mode 100644
index 0000000..dc701ea
--- /dev/null
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -0,0 +1,428 @@
+/*
+ * CE4100 on Falcon Falls
+ *
+ * (c) Copyright 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+/dts-v1/;
+/ {
+	model = "intel,falconfalls";
+	compatible = "intel,falconfalls";
+	#address-cells = <1>;
+	#size-cells = <1>;
+
+	cpus {
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		cpu@0 {
+			device_type = "cpu";
+			compatible = "intel,ce4100";
+			reg = <0>;
+			lapic = <&lapic0>;
+		};
+	};
+
+	soc@0 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		compatible = "intel,ce4100-cp";
+		ranges;
+
+		ioapic1: interrupt-controller@fec00000 {
+			#interrupt-cells = <2>;
+			compatible = "intel,ce4100-ioapic";
+			interrupt-controller;
+			reg = <0xfec00000 0x1000>;
+		};
+
+		timer@fed00000 {
+			compatible = "intel,ce4100-hpet";
+			reg = <0xfed00000 0x200>;
+		};
+
+		lapic0: interrupt-controller@fee00000 {
+			compatible = "intel,ce4100-lapic";
+			reg = <0xfee00000 0x1000>;
+		};
+
+		pci@3fc {
+			#address-cells = <3>;
+			#size-cells = <2>;
+			compatible = "intel,ce4100-pci", "pci";
+			device_type = "pci";
+			bus-range = <0 0>;
+			ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000
+				  0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000
+				  0x0000000 0 0x0	 0x0	    0 0x100>;
+
+			/* Secondary IO-APIC */
+			ioapic2: interrupt-controller@0,1 {
+				#interrupt-cells = <2>;
+				compatible = "intel,ce4100-ioapic";
+				interrupt-controller;
+				reg = <0x100 0x0 0x0 0x0 0x0>;
+				assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>;
+			};
+
+			pci@1,0 {
+				#address-cells = <3>;
+				#size-cells = <2>;
+				compatible = "intel,ce4100-pci", "pci";
+				device_type = "pci";
+				bus-range = <1 1>;
+				ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
+
+				interrupt-parent = <&ioapic2>;
+
+				display@2,0 {
+					compatible = "pci8086,2e5b.2",
+						   "pci8086,2e5b",
+						   "pciclass038000",
+						   "pciclass0380";
+
+					reg = <0x11000 0x0 0x0 0x0 0x0>;
+					interrupts = <0 1>;
+				};
+
+				multimedia@3,0 {
+					compatible = "pci8086,2e5c.2",
+						   "pci8086,2e5c",
+						   "pciclass048000",
+						   "pciclass0480";
+
+					reg = <0x11800 0x0 0x0 0x0 0x0>;
+					interrupts = <2 1>;
+				};
+
+				multimedia@4,0 {
+					compatible = "pci8086,2e5d.2",
+						   "pci8086,2e5d",
+						   "pciclass048000",
+						   "pciclass0480";
+
+					reg = <0x12000 0x0 0x0 0x0 0x0>;
+					interrupts = <4 1>;
+				};
+
+				multimedia@4,1 {
+					compatible = "pci8086,2e5e.2",
+						   "pci8086,2e5e",
+						   "pciclass048000",
+						   "pciclass0480";
+
+					reg = <0x12100 0x0 0x0 0x0 0x0>;
+					interrupts = <5 1>;
+				};
+
+				sound@6,0 {
+					compatible = "pci8086,2e5f.2",
+						   "pci8086,2e5f",
+						   "pciclass040100",
+						   "pciclass0401";
+
+					reg = <0x13000 0x0 0x0 0x0 0x0>;
+					interrupts = <6 1>;
+				};
+
+				sound@6,1 {
+					compatible = "pci8086,2e5f.2",
+						   "pci8086,2e5f",
+						   "pciclass040100",
+						   "pciclass0401";
+
+					reg = <0x13100 0x0 0x0 0x0 0x0>;
+					interrupts = <7 1>;
+				};
+
+				sound@6,2 {
+					compatible = "pci8086,2e60.2",
+						   "pci8086,2e60",
+						   "pciclass040100",
+						   "pciclass0401";
+
+					reg = <0x13200 0x0 0x0 0x0 0x0>;
+					interrupts = <8 1>;
+				};
+
+				display@8,0 {
+					compatible = "pci8086,2e61.2",
+						   "pci8086,2e61",
+						   "pciclass038000",
+						   "pciclass0380";
+
+					reg = <0x14000 0x0 0x0 0x0 0x0>;
+					interrupts = <9 1>;
+				};
+
+				display@8,1 {
+					compatible = "pci8086,2e62.2",
+						   "pci8086,2e62",
+						   "pciclass038000",
+						   "pciclass0380";
+
+					reg = <0x14100 0x0 0x0 0x0 0x0>;
+					interrupts = <10 1>;
+				};
+
+				multimedia@8,2 {
+					compatible = "pci8086,2e63.2",
+						   "pci8086,2e63",
+						   "pciclass048000",
+						   "pciclass0480";
+
+					reg = <0x14200 0x0 0x0 0x0 0x0>;
+					interrupts = <11 1>;
+				};
+
+				entertainment-encryption@9,0 {
+					compatible = "pci8086,2e64.2",
+						   "pci8086,2e64",
+						   "pciclass101000",
+						   "pciclass1010";
+
+					reg = <0x14800 0x0 0x0 0x0 0x0>;
+					interrupts = <12 1>;
+				};
+
+				localbus@a,0 {
+					compatible = "pci8086,2e65.2",
+						   "pci8086,2e65",
+						   "pciclassff0000",
+						   "pciclassff00";
+
+					reg = <0x15000 0x0 0x0 0x0 0x0>;
+				};
+
+				serial@b,0 {
+					compatible = "pci8086,2e66.2",
+						   "pci8086,2e66",
+						   "pciclass070003",
+						   "pciclass0700";
+
+					reg = <0x15800 0x0 0x0 0x0 0x0>;
+					interrupts = <14 1>;
+				};
+
+				gpio@b,1 {
+					compatible = "pci8086,2e67.2",
+						   "pci8086,2e67",
+						   "pciclassff0000",
+						   "pciclassff00";
+
+					#gpio-cells = <2>;
+					reg = <0x15900 0x0 0x0 0x0 0x0>;
+					interrupts = <15 1>;
+					gpio-controller;
+				};
+
+				i2c-controller@b,2 {
+					#address-cells = <2>;
+					#size-cells = <1>;
+					compatible = "pci8086,2e68.2",
+						   "pci8086,2e68",
+						   "pciclass,ff0000",
+						   "pciclass,ff00";
+
+					reg = <0x15a00 0x0 0x0 0x0 0x0>;
+					interrupts = <16 1>;
+					ranges = <0 0	0x02000000 0 0xdffe0500	0x100
+						  1 0	0x02000000 0 0xdffe0600	0x100
+						  2 0	0x02000000 0 0xdffe0700	0x100>;
+
+					i2c@0 {
+						#address-cells = <1>;
+						#size-cells = <0>;
+						compatible = "intel,ce4100-i2c-controller";
+						reg = <0 0 0x100>;
+					};
+
+					i2c@1 {
+						#address-cells = <1>;
+						#size-cells = <0>;
+						compatible = "intel,ce4100-i2c-controller";
+						reg = <1 0 0x100>;
+
+						gpio@26 {
+							#gpio-cells = <2>;
+							compatible = "ti,pcf8575";
+							reg = <0x26>;
+							gpio-controller;
+						};
+					};
+
+					i2c@2 {
+						#address-cells = <1>;
+						#size-cells = <0>;
+						compatible = "intel,ce4100-i2c-controller";
+						reg = <2 0 0x100>;
+
+						gpio@26 {
+							#gpio-cells = <2>;
+							compatible = "ti,pcf8575";
+							reg = <0x26>;
+							gpio-controller;
+						};
+					};
+				};
+
+				smard-card@b,3 {
+					compatible = "pci8086,2e69.2",
+						   "pci8086,2e69",
+						   "pciclass070500",
+						   "pciclass0705";
+
+					reg = <0x15b00 0x0 0x0 0x0 0x0>;
+					interrupts = <15 1>;
+				};
+
+				spi-controller@b,4 {
+					#address-cells = <1>;
+					#size-cells = <0>;
+					compatible =
+						"pci8086,2e6a.2",
+						"pci8086,2e6a",
+						"pciclass,ff0000",
+						"pciclass,ff00";
+
+					reg = <0x15c00 0x0 0x0 0x0 0x0>;
+					interrupts = <15 1>;
+
+					dac@0 {
+						compatible = "ti,pcm1755";
+						reg = <0>;
+						spi-max-frequency = <115200>;
+					};
+
+					dac@1 {
+						compatible = "ti,pcm1609a";
+						reg = <1>;
+						spi-max-frequency = <115200>;
+					};
+
+					eeprom@2 {
+						compatible = "atmel,at93c46";
+						reg = <2>;
+						spi-max-frequency = <115200>;
+					};
+				};
+
+				multimedia@b,7 {
+					compatible = "pci8086,2e6d.2",
+						   "pci8086,2e6d",
+						   "pciclassff0000",
+						   "pciclassff00";
+
+					reg = <0x15f00 0x0 0x0 0x0 0x0>;
+				};
+
+				ethernet@c,0 {
+					compatible = "pci8086,2e6e.2",
+						   "pci8086,2e6e",
+						   "pciclass020000",
+						   "pciclass0200";
+
+					reg = <0x16000 0x0 0x0 0x0 0x0>;
+					interrupts = <21 1>;
+				};
+
+				clock@c,1 {
+					compatible = "pci8086,2e6f.2",
+						   "pci8086,2e6f",
+						   "pciclassff0000",
+						   "pciclassff00";
+
+					reg = <0x16100 0x0 0x0 0x0 0x0>;
+					interrupts = <3 1>;
+				};
+
+				usb@d,0 {
+					compatible = "pci8086,2e70.2",
+						   "pci8086,2e70",
+						   "pciclass0c0320",
+						   "pciclass0c03";
+
+					reg = <0x16800 0x0 0x0 0x0 0x0>;
+					interrupts = <22 3>;
+				};
+
+				usb@d,1 {
+					compatible = "pci8086,2e70.2",
+						   "pci8086,2e70",
+						   "pciclass0c0320",
+						   "pciclass0c03";
+
+					reg = <0x16900 0x0 0x0 0x0 0x0>;
+					interrupts = <22 3>;
+				};
+
+				sata@e,0 {
+					compatible = "pci8086,2e71.0",
+						   "pci8086,2e71",
+						   "pciclass010601",
+						   "pciclass0106";
+
+					reg = <0x17000 0x0 0x0 0x0 0x0>;
+					interrupts = <23 3>;
+				};
+
+				flash@f,0 {
+					compatible = "pci8086,701.1",
+						   "pci8086,701",
+						   "pciclass050100",
+						   "pciclass0501";
+
+					reg = <0x17800 0x0 0x0 0x0 0x0>;
+					interrupts = <13 1>;
+				};
+
+				entertainment-encryption@10,0 {
+					compatible = "pci8086,702.1",
+						   "pci8086,702",
+						   "pciclass101000",
+						   "pciclass1010";
+
+					reg = <0x18000 0x0 0x0 0x0 0x0>;
+				};
+
+				co-processor@11,0 {
+					compatible = "pci8086,703.1",
+						   "pci8086,703",
+						   "pciclass0b4000",
+						   "pciclass0b40";
+
+					reg = <0x18800 0x0 0x0 0x0 0x0>;
+					interrupts = <1 1>;
+				};
+
+				multimedia@12,0 {
+					compatible = "pci8086,704.0",
+						   "pci8086,704",
+						   "pciclass048000",
+						   "pciclass0480";
+
+					reg = <0x19000 0x0 0x0 0x0 0x0>;
+				};
+			};
+
+			isa@1f,0 {
+				#address-cells = <2>;
+				#size-cells = <1>;
+				compatible = "isa";
+				ranges = <1 0 0 0 0 0x100>;
+
+				rtc@70 {
+					compatible = "intel,ce4100-rtc", "motorola,mc146818";
+					interrupts = <8 3>;
+					interrupt-parent = <&ioapic1>;
+					ctrl-reg = <2>;
+					freq-reg = <0x26>;
+					reg = <1 0x70 2>;
+				};
+			};
+		};
+	};
+};
-- 
cgit v0.10.2


From 19c4f5f7f7e9c5db89a91627af2a426cfb5568de Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:39 +0100
Subject: x86: dtb: Add irq domain abstraction

The here introduced irq_domain abstraction represents a generic irq
controller. It is a subset of powerpc's irq_host which is going to be
renamed to irq_domain and then become generic. This implementation will
be removed once it is generic.

The xlate callback is resposible to parse irq informations like irq type
and number and returns the hardware irq number which is reported by the
hardware as active.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Tested-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
LKML-Reference: <1298405266-1624-5-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h
new file mode 100644
index 0000000..423bbbd
--- /dev/null
+++ b/arch/x86/include/asm/irq_controller.h
@@ -0,0 +1,12 @@
+#ifndef __IRQ_CONTROLLER__
+#define __IRQ_CONTROLLER__
+
+struct irq_domain {
+	int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize,
+			u32 *out_hwirq, u32 *out_type);
+	void *priv;
+	struct device_node *controller;
+	struct list_head l;
+};
+
+#endif
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index e46f2a2..83833ac 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -20,9 +20,11 @@
 #include <asm/irq.h>
 #include <asm/atomic.h>
 #include <asm/setup.h>
+#include <asm/irq_controller.h>
 
 #ifdef CONFIG_OF
 extern void add_dtb(u64 data);
+void add_interrupt_host(struct irq_domain *ih);
 #else
 static inline void add_dtb(u64 data) { }
 #endif
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 0b8f0da..ef98e4e 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -3,19 +3,63 @@
  */
 #include <linux/bootmem.h>
 #include <linux/io.h>
+#include <linux/interrupt.h>
 #include <linux/list.h>
 #include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <linux/of_platform.h>
 #include <linux/slab.h>
 
+#include <asm/irq_controller.h>
+
 char __initdata cmd_line[COMMAND_LINE_SIZE];
+static LIST_HEAD(irq_domains);
+static DEFINE_RAW_SPINLOCK(big_irq_lock);
+
+void add_interrupt_host(struct irq_domain *ih)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&big_irq_lock, flags);
+	list_add(&ih->l, &irq_domains);
+	raw_spin_unlock_irqrestore(&big_irq_lock, flags);
+}
+
+static struct irq_domain *get_ih_from_node(struct device_node *controller)
+{
+	struct irq_domain *ih, *found = NULL;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&big_irq_lock, flags);
+	list_for_each_entry(ih, &irq_domains, l) {
+		if (ih->controller ==  controller) {
+			found = ih;
+			break;
+		}
+	}
+	raw_spin_unlock_irqrestore(&big_irq_lock, flags);
+	return found;
+}
 
 unsigned int irq_create_of_mapping(struct device_node *controller,
 				   const u32 *intspec, unsigned int intsize)
 {
-	return intspec[0];
+	struct irq_domain *ih;
+	u32 virq, type;
+	int ret;
 
+	ih = get_ih_from_node(controller);
+	if (!ih)
+		return 0;
+	ret = ih->xlate(ih, intspec, intsize, &virq, &type);
+	if (ret)
+		return ret;
+	if (type == IRQ_TYPE_NONE)
+		return virq;
+	/* set the mask if it is different from current */
+	if (type == (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK))
+		set_irq_type(virq, type);
+	return virq;
 }
 EXPORT_SYMBOL_GPL(irq_create_of_mapping);
 
-- 
cgit v0.10.2


From 3879a6f32948330782889cebc4d74c4f2316c676 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:40 +0100
Subject: x86: dtb: Add early parsing of IO_APIC

APIC and IO_APIC have to be added to the system early because
native_init_IRQ() requires it.

In order to obtain the address of the ioapic the device tree has to be
unflattened so of_address_to_resource() works.

The device tree is relocated to ensure it is always covered by the
kernel mapping. That way the boot loader does not have to make
any assumptions about kernel's memory layout.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
Cc: Dirk Brandewie <dirk.brandewie@gmail.com>
LKML-Reference: <1298405266-1624-6-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 83833ac..35ec32b 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -23,10 +23,17 @@
 #include <asm/irq_controller.h>
 
 #ifdef CONFIG_OF
+extern int of_ioapic;
+extern u64 initial_dtb;
 extern void add_dtb(u64 data);
+void x86_dtb_find_config(void);
+void x86_dtb_get_config(unsigned int unused);
 void add_interrupt_host(struct irq_domain *ih);
 #else
 static inline void add_dtb(u64 data) { }
+#define x86_dtb_find_config x86_init_noop
+#define x86_dtb_get_config x86_init_uint_noop
+#define of_ioapic 0
 #endif
 
 extern char cmd_line[COMMAND_LINE_SIZE];
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index ef98e4e..2739d56 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -7,15 +7,20 @@
 #include <linux/list.h>
 #include <linux/of.h>
 #include <linux/of_fdt.h>
+#include <linux/of_address.h>
 #include <linux/of_platform.h>
 #include <linux/slab.h>
 
 #include <asm/irq_controller.h>
+#include <asm/apic.h>
 
+__initdata u64 initial_dtb;
 char __initdata cmd_line[COMMAND_LINE_SIZE];
 static LIST_HEAD(irq_domains);
 static DEFINE_RAW_SPINLOCK(big_irq_lock);
 
+int __initdata of_ioapic;
+
 void add_interrupt_host(struct irq_domain *ih)
 {
 	unsigned long flags;
@@ -90,6 +95,107 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
 
 void __init add_dtb(u64 data)
 {
-	initial_boot_params = phys_to_virt((u64) (u32) data +
-				offsetof(struct setup_data, data));
+	initial_dtb = data + offsetof(struct setup_data, data);
+}
+
+static void __init dtb_lapic_setup(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (apic_force_enable())
+		return;
+
+	smp_found_config = 1;
+	pic_mode = 1;
+	/* Required for ioapic registration */
+	set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+	if (boot_cpu_physical_apicid == -1U)
+		boot_cpu_physical_apicid = read_apic_id();
+
+	generic_processor_info(boot_cpu_physical_apicid,
+			GET_APIC_VERSION(apic_read(APIC_LVR)));
+#endif
+}
+
+#ifdef CONFIG_X86_IO_APIC
+static unsigned int ioapic_id;
+
+static void __init dtb_add_ioapic(struct device_node *dn)
+{
+	struct resource r;
+	int ret;
+
+	ret = of_address_to_resource(dn, 0, &r);
+	if (ret) {
+		printk(KERN_ERR "Can't obtain address from node %s.\n",
+				dn->full_name);
+		return;
+	}
+	mp_register_ioapic(++ioapic_id, r.start, gsi_top);
+}
+
+static void __init dtb_ioapic_setup(void)
+{
+	struct device_node *dn;
+
+	if (!smp_found_config)
+		return;
+
+	for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
+		dtb_add_ioapic(dn);
+
+	if (nr_ioapics) {
+		of_ioapic = 1;
+		return;
+	}
+	printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
+	smp_found_config = 0;
+}
+#else
+static void __init dtb_ioapic_setup(void) {}
+#endif
+
+static void __init dtb_apic_setup(void)
+{
+	dtb_lapic_setup();
+	dtb_ioapic_setup();
+}
+
+void __init x86_dtb_find_config(void)
+{
+	if (initial_dtb)
+		smp_found_config = 1;
+	else
+		printk(KERN_ERR "Missing device tree!.\n");
+}
+
+void __init x86_dtb_get_config(unsigned int unused)
+{
+	u32 size, map_len;
+	void *new_dtb;
+
+	if (!initial_dtb)
+		return;
+
+	map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
+			(u64)sizeof(struct boot_param_header));
+
+	initial_boot_params = early_memremap(initial_dtb, map_len);
+	size = be32_to_cpu(initial_boot_params->totalsize);
+	if (map_len < size) {
+		early_iounmap(initial_boot_params, map_len);
+		initial_boot_params = early_memremap(initial_dtb, size);
+		map_len = size;
+	}
+
+	new_dtb = alloc_bootmem(size);
+	memcpy(new_dtb, initial_boot_params, size);
+	early_iounmap(initial_boot_params, map_len);
+
+	initial_boot_params = new_dtb;
+
+	/* root level address cells */
+	of_scan_flat_dt(early_init_dt_scan_root, NULL);
+
+	unflatten_device_tree();
+	dtb_apic_setup();
 }
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c752e97..4cadf86 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -25,6 +25,7 @@
 #include <asm/setup.h>
 #include <asm/i8259.h>
 #include <asm/traps.h>
+#include <asm/prom.h>
 
 /*
  * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -243,7 +244,7 @@ void __init native_init_IRQ(void)
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
 	}
 
-	if (!acpi_ioapic)
+	if (!acpi_ioapic && !of_ioapic)
 		setup_irq(2, &irq2);
 
 #ifdef CONFIG_X86_32
-- 
cgit v0.10.2


From ffb9fc68dff38f811eeb24c15aba0418b6a8ee53 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:41 +0100
Subject: x86: dtb: Add device tree support for HPET

Set hpet_address based on information provied form DTB

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
Cc: Dirk Brandewie <dirk.brandewie@gmail.com>
LKML-Reference: <1298405266-1624-7-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 2739d56..dbb3bda 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -11,6 +11,7 @@
 #include <linux/of_platform.h>
 #include <linux/slab.h>
 
+#include <asm/hpet.h>
 #include <asm/irq_controller.h>
 #include <asm/apic.h>
 
@@ -98,6 +99,23 @@ void __init add_dtb(u64 data)
 	initial_dtb = data + offsetof(struct setup_data, data);
 }
 
+static void __init dtb_setup_hpet(void)
+{
+	struct device_node *dn;
+	struct resource r;
+	int ret;
+
+	dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
+	if (!dn)
+		return;
+	ret = of_address_to_resource(dn, 0, &r);
+	if (ret) {
+		WARN_ON(1);
+		return;
+	}
+	hpet_address = r.start;
+}
+
 static void __init dtb_lapic_setup(void)
 {
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -197,5 +215,6 @@ void __init x86_dtb_get_config(unsigned int unused)
 	of_scan_flat_dt(early_init_dt_scan_root, NULL);
 
 	unflatten_device_tree();
+	dtb_setup_hpet();
 	dtb_apic_setup();
 }
-- 
cgit v0.10.2


From 96e0a0797eba35b5420c710b928f19094b2d5c45 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:42 +0100
Subject: x86: dtb: Add support for PCI devices backed by dtb nodes

x86_of_pci_init() does two things:

- it provides a generic irq enable and disable function. enable queries
  the device tree for the interrupt information, calls ->xlate on the
  irq host and updates the pci->irq information for the device.

- it walks through PCI bus(es) in the device tree and adds its children
  (device) nodes to appropriate pci_dev nodes in kernel. So the dtb
  node information is available at probe time of the PCI device.

Adding a PCI bus based on the information in the device tree is
currently not supported. Right now direct access via ioports is used.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Tested-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
LKML-Reference: <1298405266-1624-8-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 35ec32b..8fcd519 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -16,6 +16,7 @@
 
 #include <linux/of.h>
 #include <linux/types.h>
+#include <linux/pci.h>
 
 #include <asm/irq.h>
 #include <asm/atomic.h>
@@ -29,8 +30,21 @@ extern void add_dtb(u64 data);
 void x86_dtb_find_config(void);
 void x86_dtb_get_config(unsigned int unused);
 void add_interrupt_host(struct irq_domain *ih);
+void __cpuinit x86_of_pci_init(void);
+
+static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
+{
+	return pdev ? pdev->dev.of_node : NULL;
+}
+
+static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
+{
+	return pci_device_to_OF_node(bus->self);
+}
+
 #else
 static inline void add_dtb(u64 data) { }
+static inline void x86_of_pci_init(void) { }
 #define x86_dtb_find_config x86_init_noop
 #define x86_dtb_get_config x86_init_uint_noop
 #define of_ioapic 0
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index dbb3bda..7b57422 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -9,11 +9,15 @@
 #include <linux/of_fdt.h>
 #include <linux/of_address.h>
 #include <linux/of_platform.h>
+#include <linux/of_irq.h>
 #include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/of_pci.h>
 
 #include <asm/hpet.h>
 #include <asm/irq_controller.h>
 #include <asm/apic.h>
+#include <asm/pci_x86.h>
 
 __initdata u64 initial_dtb;
 char __initdata cmd_line[COMMAND_LINE_SIZE];
@@ -99,6 +103,85 @@ void __init add_dtb(u64 data)
 	initial_dtb = data + offsetof(struct setup_data, data);
 }
 
+#ifdef CONFIG_PCI
+static int x86_of_pci_irq_enable(struct pci_dev *dev)
+{
+	struct of_irq oirq;
+	u32 virq;
+	int ret;
+	u8 pin;
+
+	ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+	if (ret)
+		return ret;
+	if (!pin)
+		return 0;
+
+	ret = of_irq_map_pci(dev, &oirq);
+	if (ret)
+		return ret;
+
+	virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
+			oirq.size);
+	if (virq == 0)
+		return -EINVAL;
+	dev->irq = virq;
+	return 0;
+}
+
+static void x86_of_pci_irq_disable(struct pci_dev *dev)
+{
+}
+
+void __cpuinit x86_of_pci_init(void)
+{
+	struct device_node *np;
+
+	pcibios_enable_irq = x86_of_pci_irq_enable;
+	pcibios_disable_irq = x86_of_pci_irq_disable;
+
+	for_each_node_by_type(np, "pci") {
+		const void *prop;
+		struct pci_bus *bus;
+		unsigned int bus_min;
+		struct device_node *child;
+
+		prop = of_get_property(np, "bus-range", NULL);
+		if (!prop)
+			continue;
+		bus_min = be32_to_cpup(prop);
+
+		bus = pci_find_bus(0, bus_min);
+		if (!bus) {
+			printk(KERN_ERR "Can't find a node for bus %s.\n",
+					np->full_name);
+			continue;
+		}
+
+		if (bus->self)
+			bus->self->dev.of_node = np;
+		else
+			bus->dev.of_node = np;
+
+		for_each_child_of_node(np, child) {
+			struct pci_dev *dev;
+			u32 devfn;
+
+			prop = of_get_property(child, "reg", NULL);
+			if (!prop)
+				continue;
+
+			devfn = (be32_to_cpup(prop) >> 8) & 0xff;
+			dev = pci_get_slot(bus, devfn);
+			if (!dev)
+				continue;
+			dev->dev.of_node = child;
+			pci_dev_put(dev);
+		}
+	}
+}
+#endif
+
 static void __init dtb_setup_hpet(void)
 {
 	struct device_node *dn;
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index efabbf9..d06a637 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -71,7 +71,7 @@ config OF_MDIO
 
 config OF_PCI
 	def_tristate PCI
-	depends on PCI && (PPC || MICROBLAZE)
+	depends on PCI && (PPC || MICROBLAZE || X86)
 	help
 	  OpenFirmware PCI bus accessors
 
diff --git a/drivers/of/of_pci.c b/drivers/of/of_pci.c
index 314535f..ac1ec54 100644
--- a/drivers/of/of_pci.c
+++ b/drivers/of/of_pci.c
@@ -1,5 +1,6 @@
 #include <linux/kernel.h>
 #include <linux/of_pci.h>
+#include <linux/of_irq.h>
 #include <asm/prom.h>
 
 /**
-- 
cgit v0.10.2


From 9079b35364e75ce6b968a179f861d2f819f33e61 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:43 +0100
Subject: x86: dtb: Add generic bus probe

For now we probe these busses and we change this to board dependent
probes once we have to.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
LKML-Reference: <1298405266-1624-9-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 7b57422..0bc83bf 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -103,6 +103,25 @@ void __init add_dtb(u64 data)
 	initial_dtb = data + offsetof(struct setup_data, data);
 }
 
+/*
+ * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
+ */
+static struct of_device_id __initdata ce4100_ids[] = {
+	{ .compatible = "intel,ce4100-cp", },
+	{ .compatible = "isa", },
+	{ .compatible = "pci", },
+	{},
+};
+
+static int __init add_bus_probe(void)
+{
+	if (!initial_boot_params)
+		return 0;
+
+	return of_platform_bus_probe(NULL, ce4100_ids, NULL);
+}
+module_init(add_bus_probe);
+
 #ifdef CONFIG_PCI
 static int x86_of_pci_irq_enable(struct pci_dev *dev)
 {
-- 
cgit v0.10.2


From bcc7c1244fcfd852b9f4590935491057e1cab9dd Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:44 +0100
Subject: x86: ioapic: Add OF bindings for IO_APIC

ioapic_xlate provides a translation from the information in device tree
to ioapic related informations. This includes
- obtaining hw irq which is the vector number "=> pin number + gsi"
- obtaining type (level/edge/..)
- programming this information into ioapic

ioapic_add_ofnode adds an irq_domain based on informations from the device
tree. This information (irq_domain) is required in order to map a device to
its proper interrupt controller.

[ tglx: Adapted to the io_apic changes, which let us move that whole code
  	to devicetree.c ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
LKML-Reference: <1298405266-1624-10-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 8fcd519..ae50131 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -27,6 +27,7 @@
 extern int of_ioapic;
 extern u64 initial_dtb;
 extern void add_dtb(u64 data);
+extern void x86_add_irq_domains(void);
 void x86_dtb_find_config(void);
 void x86_dtb_get_config(unsigned int unused);
 void add_interrupt_host(struct irq_domain *ih);
@@ -44,6 +45,7 @@ static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
 
 #else
 static inline void add_dtb(u64 data) { }
+static inline void x86_add_irq_domains(void) { }
 static inline void x86_of_pci_init(void) { }
 #define x86_dtb_find_config x86_init_noop
 #define x86_dtb_get_config x86_init_uint_noop
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 0bc83bf..2d65897 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -320,3 +320,107 @@ void __init x86_dtb_get_config(unsigned int unused)
 	dtb_setup_hpet();
 	dtb_apic_setup();
 }
+
+#ifdef CONFIG_X86_IO_APIC
+
+struct of_ioapic_type {
+	u32 out_type;
+	u32 trigger;
+	u32 polarity;
+};
+
+static struct of_ioapic_type of_ioapic_type[] =
+{
+	{
+		.out_type	= IRQ_TYPE_EDGE_RISING,
+		.trigger	= IOAPIC_EDGE,
+		.polarity	= 1,
+	},
+	{
+		.out_type	= IRQ_TYPE_LEVEL_LOW,
+		.trigger	= IOAPIC_LEVEL,
+		.polarity	= 0,
+	},
+	{
+		.out_type	= IRQ_TYPE_LEVEL_HIGH,
+		.trigger	= IOAPIC_LEVEL,
+		.polarity	= 1,
+	},
+	{
+		.out_type	= IRQ_TYPE_EDGE_FALLING,
+		.trigger	= IOAPIC_EDGE,
+		.polarity	= 0,
+	},
+};
+
+static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
+			u32 *out_hwirq, u32 *out_type)
+{
+	struct io_apic_irq_attr attr;
+	struct of_ioapic_type *it;
+	u32 line, idx, type;
+
+	if (intsize < 2)
+		return -EINVAL;
+
+	line = *intspec;
+	idx = (u32) id->priv;
+	*out_hwirq = line + mp_gsi_routing[idx].gsi_base;
+
+	intspec++;
+	type = *intspec;
+
+	if (type >= ARRAY_SIZE(of_ioapic_type))
+		return -EINVAL;
+
+	it = of_ioapic_type + type;
+	*out_type = it->out_type;
+
+	set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
+
+	return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr);
+}
+
+static void __init ioapic_add_ofnode(struct device_node *np)
+{
+	struct resource r;
+	int i, ret;
+
+	ret = of_address_to_resource(np, 0, &r);
+	if (ret) {
+		printk(KERN_ERR "Failed to obtain address for %s\n",
+				np->full_name);
+		return;
+	}
+
+	for (i = 0; i < nr_ioapics; i++) {
+		if (r.start == mp_ioapics[i].apicaddr) {
+			struct irq_domain *id;
+
+			id = kzalloc(sizeof(*id), GFP_KERNEL);
+			BUG_ON(!id);
+			id->controller = np;
+			id->xlate = ioapic_xlate;
+			id->priv = (void *)i;
+			add_interrupt_host(id);
+			return;
+		}
+	}
+	printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
+}
+
+void __init x86_add_irq_domains(void)
+{
+	struct device_node *dp;
+
+	if (!initial_boot_params)
+		return;
+
+	for_each_node_with_property(dp, "interrupt-controller") {
+		if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
+			ioapic_add_ofnode(dp);
+	}
+}
+#else
+void __init x86_add_irq_domains(void) { }
+#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 4cadf86..9f76f89 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -119,6 +119,12 @@ void __init init_IRQ(void)
 	int i;
 
 	/*
+	 * We probably need a better place for this, but it works for
+	 * now ...
+	 */
+	x86_add_irq_domains();
+
+	/*
 	 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
 	 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
 	 * then this configuration will likely be static after the boot. If
-- 
cgit v0.10.2


From 1fa4163bdc199a0b80f9e333d718b3f65e901593 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:45 +0100
Subject: x86: ce4100: Use OF to setup devices

Use device tree information to setup IO_APIC configuration, interrupt
routing, HPET and everything else which cannot be enumerated by other
means.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
LKML-Reference: <1298405266-1624-11-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index d2c0d51..b3436d3 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -15,21 +15,19 @@
 #include <linux/serial_reg.h>
 #include <linux/serial_8250.h>
 
+#include <asm/prom.h>
 #include <asm/setup.h>
+#include <asm/i8259.h>
 #include <asm/io.h>
+#include <asm/io_apic.h>
 
 static int ce4100_i8042_detect(void)
 {
 	return 0;
 }
 
-static void __init sdv_find_smp_config(void)
-{
-}
-
 #ifdef CONFIG_SERIAL_8250
 
-
 static unsigned int mem_serial_in(struct uart_port *p, int offset)
 {
 	offset = offset << p->regshift;
@@ -118,6 +116,15 @@ static void __init sdv_arch_setup(void)
 	sdv_serial_fixup();
 }
 
+#ifdef CONFIG_X86_IO_APIC
+static void __cpuinit sdv_pci_init(void)
+{
+	x86_of_pci_init();
+	/* We can't set this earlier, because we need to calibrate the timer */
+	legacy_pic = &null_legacy_pic;
+}
+#endif
+
 /*
  * CE4100 specific x86_init function overrides and early setup
  * calls.
@@ -127,6 +134,11 @@ void __init x86_ce4100_early_setup(void)
 	x86_init.oem.arch_setup = sdv_arch_setup;
 	x86_platform.i8042_detect = ce4100_i8042_detect;
 	x86_init.resources.probe_roms = x86_init_noop;
-	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
-	x86_init.mpparse.find_smp_config = sdv_find_smp_config;
+	x86_init.mpparse.get_smp_config = x86_dtb_get_config;
+	x86_init.mpparse.find_smp_config = x86_dtb_find_config;
+
+#ifdef CONFIG_X86_IO_APIC
+	x86_init.pci.init_irq = sdv_pci_init;
+	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
+#endif
 }
-- 
cgit v0.10.2


From 3bcbaf6e08d8d82cde781997bd2c56dda87049b5 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Feb 2011 21:07:46 +0100
Subject: rtc: cmos: Add OF bindings

This allows to load the OF driver based informations from the device
tree. Systems without BIOS may need to perform some initialization.
PowerPC creates a PNP device from the OF information and performs this
kind of initialization in their private PCI quirk. This looks more
generic.

This patch also avoids registering the platform RTC driver on X86 if
we have a device tree blob. Otherwise we would setup the device based
on the hardcoded information in arch/x86 rather than the device tree
based one.

[ tglx: Changed "int of_have_populated_dt()" to bool as recommended by
        Grant ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Dirk Brandewie <dirk.brandewie@gmail.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: sodaville@linutronix.de
Cc: devicetree-discuss@lists.ozlabs.org
Cc: rtc-linux@googlegroups.com
Cc: Alessandro Zummo <a.zummo@towertech.it>
LKML-Reference: <1298405266-1624-12-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/devicetree/bindings/rtc/rtc-cmos.txt b/Documentation/devicetree/bindings/rtc/rtc-cmos.txt
new file mode 100644
index 0000000..7382989
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/rtc-cmos.txt
@@ -0,0 +1,28 @@
+ Motorola mc146818 compatible RTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Required properties:
+  - compatible : "motorola,mc146818"
+  - reg : should contain registers location and length.
+
+Optional properties:
+  - interrupts : should contain interrupt.
+  - interrupt-parent : interrupt source phandle.
+  - ctrl-reg : Contains the initial value of the control register also
+    called "Register B".
+  - freq-reg : Contains the initial value of the frequency register also
+    called "Regsiter A".
+
+"Register A" and "B" are usually initialized by the firmware (BIOS for
+instance). If this is not done, it can be performed by the driver.
+
+ISA Example:
+
+	rtc@70 {
+	         compatible = "motorola,mc146818";
+	         interrupts = <8 3>;
+	         interrupt-parent = <&ioapic1>;
+	         ctrl-reg = <2>;
+	         freq-reg = <0x26>;
+	         reg = <1 0x70 2>;
+	 };
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 6f39cab..3f2ad26 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -6,6 +6,7 @@
 #include <linux/acpi.h>
 #include <linux/bcd.h>
 #include <linux/pnp.h>
+#include <linux/of.h>
 
 #include <asm/vsyscall.h>
 #include <asm/x86_init.h>
@@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void)
 		}
 	}
 #endif
+	if (of_have_populated_dt())
+		return 0;
 
 	platform_device_register(&rtc_device);
 	dev_info(&rtc_device.dev,
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index c7ff8df..159b95e 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -37,6 +37,8 @@
 #include <linux/mod_devicetable.h>
 #include <linux/log2.h>
 #include <linux/pm.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
 
 /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */
 #include <asm-generic/rtc.h>
@@ -1123,6 +1125,47 @@ static struct pnp_driver cmos_pnp_driver = {
 
 #endif	/* CONFIG_PNP */
 
+#ifdef CONFIG_OF
+static const struct of_device_id of_cmos_match[] = {
+	{
+		.compatible = "motorola,mc146818",
+	},
+	{ },
+};
+MODULE_DEVICE_TABLE(of, of_cmos_match);
+
+static __init void cmos_of_init(struct platform_device *pdev)
+{
+	struct device_node *node = pdev->dev.of_node;
+	struct rtc_time time;
+	int ret;
+	const __be32 *val;
+
+	if (!node)
+		return;
+
+	val = of_get_property(node, "ctrl-reg", NULL);
+	if (val)
+		CMOS_WRITE(be32_to_cpup(val), RTC_CONTROL);
+
+	val = of_get_property(node, "freq-reg", NULL);
+	if (val)
+		CMOS_WRITE(be32_to_cpup(val), RTC_FREQ_SELECT);
+
+	get_rtc_time(&time);
+	ret = rtc_valid_tm(&time);
+	if (ret) {
+		struct rtc_time def_time = {
+			.tm_year = 1,
+			.tm_mday = 1,
+		};
+		set_rtc_time(&def_time);
+	}
+}
+#else
+static inline void cmos_of_init(struct platform_device *pdev) {}
+#define of_cmos_match NULL
+#endif
 /*----------------------------------------------------------------*/
 
 /* Platform setup should have set up an RTC device, when PNP is
@@ -1131,6 +1174,7 @@ static struct pnp_driver cmos_pnp_driver = {
 
 static int __init cmos_platform_probe(struct platform_device *pdev)
 {
+	cmos_of_init(pdev);
 	cmos_wake_setup(&pdev->dev);
 	return cmos_do_probe(&pdev->dev,
 			platform_get_resource(pdev, IORESOURCE_IO, 0),
@@ -1162,6 +1206,7 @@ static struct platform_driver cmos_platform_driver = {
 #ifdef CONFIG_PM
 		.pm		= &cmos_pm_ops,
 #endif
+		.of_match_table = of_cmos_match,
 	}
 };
 
diff --git a/include/linux/of.h b/include/linux/of.h
index d9dd664..266db1d0 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -70,6 +70,11 @@ extern struct device_node *allnodes;
 extern struct device_node *of_chosen;
 extern rwlock_t devtree_lock;
 
+static inline bool of_have_populated_dt(void)
+{
+	return allnodes != NULL;
+}
+
 static inline bool of_node_is_root(const struct device_node *node)
 {
 	return node && (node->parent == NULL);
@@ -222,5 +227,12 @@ extern void of_attach_node(struct device_node *);
 extern void of_detach_node(struct device_node *);
 #endif
 
+#else
+
+static inline bool of_have_populated_dt(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_OF */
 #endif /* _LINUX_OF_H */
-- 
cgit v0.10.2


From 4a66b1d95ad8baf6ab884a1c64461449b463eb78 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 24 Feb 2011 09:52:42 +0100
Subject: x86: dt: Fix OLPC=y/INTEL_CE=n build

Both OLPC and CE4100 activate CONFIG_OF. OLPC uses PROMTREE while CE
uses FLATTREE. Compiling for OLPC only breaks due to missing flat tree
functions and variables.

Use proper wrappers and provide an empty x86_flattree_get_config()
inline so OF=y FLATTREE=n builds and works.

[ tglx: Make it work with HPET_TIMER=n and make a function static ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index ae50131..f58361b 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -30,7 +30,6 @@ extern void add_dtb(u64 data);
 extern void x86_add_irq_domains(void);
 void x86_dtb_find_config(void);
 void x86_dtb_get_config(unsigned int unused);
-void add_interrupt_host(struct irq_domain *ih);
 void __cpuinit x86_of_pci_init(void);
 
 static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 2d65897..06e5e91 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -26,7 +26,7 @@ static DEFINE_RAW_SPINLOCK(big_irq_lock);
 
 int __initdata of_ioapic;
 
-void add_interrupt_host(struct irq_domain *ih)
+static void add_interrupt_host(struct irq_domain *ih)
 {
 	unsigned long flags;
 
@@ -115,7 +115,7 @@ static struct of_device_id __initdata ce4100_ids[] = {
 
 static int __init add_bus_probe(void)
 {
-	if (!initial_boot_params)
+	if (!of_have_populated_dt())
 		return 0;
 
 	return of_platform_bus_probe(NULL, ce4100_ids, NULL);
@@ -203,6 +203,7 @@ void __cpuinit x86_of_pci_init(void)
 
 static void __init dtb_setup_hpet(void)
 {
+#ifdef CONFIG_HPET_TIMER
 	struct device_node *dn;
 	struct resource r;
 	int ret;
@@ -216,6 +217,7 @@ static void __init dtb_setup_hpet(void)
 		return;
 	}
 	hpet_address = r.start;
+#endif
 }
 
 static void __init dtb_lapic_setup(void)
@@ -288,7 +290,8 @@ void __init x86_dtb_find_config(void)
 		printk(KERN_ERR "Missing device tree!.\n");
 }
 
-void __init x86_dtb_get_config(unsigned int unused)
+#ifdef CONFIG_OF_FLATTREE
+static void __init x86_flattree_get_config(void)
 {
 	u32 size, map_len;
 	void *new_dtb;
@@ -317,6 +320,18 @@ void __init x86_dtb_get_config(unsigned int unused)
 	of_scan_flat_dt(early_init_dt_scan_root, NULL);
 
 	unflatten_device_tree();
+}
+#else
+static inline void x86_flattree_get_config(void) { }
+#endif
+
+void __init x86_dtb_get_config(unsigned int unused)
+{
+	x86_flattree_get_config();
+
+	if (!of_have_populated_dt())
+		return;
+
 	dtb_setup_hpet();
 	dtb_apic_setup();
 }
@@ -413,7 +428,7 @@ void __init x86_add_irq_domains(void)
 {
 	struct device_node *dp;
 
-	if (!initial_boot_params)
+	if (!of_have_populated_dt())
 		return;
 
 	for_each_node_with_property(dp, "interrupt-controller") {
-- 
cgit v0.10.2


From 0932587328d9bd5b500a640fbaff3290c8d4cabf Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 24 Feb 2011 14:43:05 +0100
Subject: bootmem: Separate out CONFIG_NO_BOOTMEM code into nobootmem.c

mm/bootmem.c contained code paths for both bootmem and no bootmem
configurations.  They implement about the same set of APIs in
different ways and as a result bootmem.c contains massive amount of
#ifdef CONFIG_NO_BOOTMEM.

Separate out CONFIG_NO_BOOTMEM code into mm/nobootmem.c.  As the
common part is relatively small, duplicate them in nobootmem.c instead
of creating a common file or ifdef'ing in bootmem.c.

The followings are duplicated.

* {min|max}_low_pfn, max_pfn, saved_max_pfn
* free_bootmem_late()
* ___alloc_bootmem()
* __alloc_bootmem_low()

The followings are applicable only to nobootmem and moved verbatim.

* __free_pages_memory()
* free_all_memory_core_early()

The followings are not applicable to nobootmem and omitted in
nobootmem.c.

* reserve_bootmem_node()
* reserve_bootmem()

The rest split function bodies according to CONFIG_NO_BOOTMEM.

Makefile is updated so that only either bootmem.c or nobootmem.c is
built according to CONFIG_NO_BOOTMEM.

This patch doesn't introduce any behavior change.

-tj: Rewrote commit description.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/Makefile b/mm/Makefile
index 2b1b575..42a8326 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   vmalloc.o pagewalk.o pgtable-generic.o
 
-obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
@@ -15,6 +15,12 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   $(mmu-y)
 obj-y += init-mm.o
 
+ifdef CONFIG_NO_BOOTMEM
+	obj-y		+= nobootmem.o
+else
+	obj-y		+= bootmem.o
+endif
+
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 13b0caa..4403e2f 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -35,7 +35,6 @@ unsigned long max_pfn;
 unsigned long saved_max_pfn;
 #endif
 
-#ifndef CONFIG_NO_BOOTMEM
 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
 
 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -146,7 +145,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 	min_low_pfn = start;
 	return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
 }
-#endif
+
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
@@ -171,53 +170,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
 	}
 }
 
-#ifdef CONFIG_NO_BOOTMEM
-static void __init __free_pages_memory(unsigned long start, unsigned long end)
-{
-	int i;
-	unsigned long start_aligned, end_aligned;
-	int order = ilog2(BITS_PER_LONG);
-
-	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
-	end_aligned = end & ~(BITS_PER_LONG - 1);
-
-	if (end_aligned <= start_aligned) {
-		for (i = start; i < end; i++)
-			__free_pages_bootmem(pfn_to_page(i), 0);
-
-		return;
-	}
-
-	for (i = start; i < start_aligned; i++)
-		__free_pages_bootmem(pfn_to_page(i), 0);
-
-	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
-		__free_pages_bootmem(pfn_to_page(i), order);
-
-	for (i = end_aligned; i < end; i++)
-		__free_pages_bootmem(pfn_to_page(i), 0);
-}
-
-unsigned long __init free_all_memory_core_early(int nodeid)
-{
-	int i;
-	u64 start, end;
-	unsigned long count = 0;
-	struct range *range = NULL;
-	int nr_range;
-
-	nr_range = get_free_all_memory_range(&range, nodeid);
-
-	for (i = 0; i < nr_range; i++) {
-		start = range[i].start;
-		end = range[i].end;
-		count += end - start;
-		__free_pages_memory(start, end);
-	}
-
-	return count;
-}
-#else
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
 	int aligned;
@@ -278,7 +230,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 
 	return count;
 }
-#endif
 
 /**
  * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -289,12 +240,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 {
 	register_page_bootmem_info_node(pgdat);
-#ifdef CONFIG_NO_BOOTMEM
-	/* free_all_memory_core_early(MAX_NUMNODES) will be called later */
-	return 0;
-#else
 	return free_all_bootmem_core(pgdat->bdata);
-#endif
 }
 
 /**
@@ -304,16 +250,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
  */
 unsigned long __init free_all_bootmem(void)
 {
-#ifdef CONFIG_NO_BOOTMEM
-	/*
-	 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
-	 *  because in some case like Node0 doesnt have RAM installed
-	 *  low ram will be on Node1
-	 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
-	 *  will be used instead of only Node0 related
-	 */
-	return free_all_memory_core_early(MAX_NUMNODES);
-#else
 	unsigned long total_pages = 0;
 	bootmem_data_t *bdata;
 
@@ -321,10 +257,8 @@ unsigned long __init free_all_bootmem(void)
 		total_pages += free_all_bootmem_core(bdata);
 
 	return total_pages;
-#endif
 }
 
-#ifndef CONFIG_NO_BOOTMEM
 static void __init __free(bootmem_data_t *bdata,
 			unsigned long sidx, unsigned long eidx)
 {
@@ -419,7 +353,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
 	}
 	BUG();
 }
-#endif
 
 /**
  * free_bootmem_node - mark a page range as usable
@@ -434,10 +367,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
 void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 			      unsigned long size)
 {
-#ifdef CONFIG_NO_BOOTMEM
-	kmemleak_free_part(__va(physaddr), size);
-	memblock_x86_free_range(physaddr, physaddr + size);
-#else
 	unsigned long start, end;
 
 	kmemleak_free_part(__va(physaddr), size);
@@ -446,7 +375,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 	end = PFN_DOWN(physaddr + size);
 
 	mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
-#endif
 }
 
 /**
@@ -460,10 +388,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
  */
 void __init free_bootmem(unsigned long addr, unsigned long size)
 {
-#ifdef CONFIG_NO_BOOTMEM
-	kmemleak_free_part(__va(addr), size);
-	memblock_x86_free_range(addr, addr + size);
-#else
 	unsigned long start, end;
 
 	kmemleak_free_part(__va(addr), size);
@@ -472,7 +396,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 	end = PFN_DOWN(addr + size);
 
 	mark_bootmem(start, end, 0, 0);
-#endif
 }
 
 /**
@@ -489,17 +412,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 				 unsigned long size, int flags)
 {
-#ifdef CONFIG_NO_BOOTMEM
-	panic("no bootmem");
-	return 0;
-#else
 	unsigned long start, end;
 
 	start = PFN_DOWN(physaddr);
 	end = PFN_UP(physaddr + size);
 
 	return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
-#endif
 }
 
 /**
@@ -515,20 +433,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 int __init reserve_bootmem(unsigned long addr, unsigned long size,
 			    int flags)
 {
-#ifdef CONFIG_NO_BOOTMEM
-	panic("no bootmem");
-	return 0;
-#else
 	unsigned long start, end;
 
 	start = PFN_DOWN(addr);
 	end = PFN_UP(addr + size);
 
 	return mark_bootmem(start, end, 1, flags);
-#endif
 }
 
-#ifndef CONFIG_NO_BOOTMEM
 int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 				   int flags)
 {
@@ -685,33 +597,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
 #endif
 	return NULL;
 }
-#endif
 
 static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
 					unsigned long limit)
 {
-#ifdef CONFIG_NO_BOOTMEM
-	void *ptr;
-
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
-
-restart:
-
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
-
-	if (ptr)
-		return ptr;
-
-	if (goal != 0) {
-		goal = 0;
-		goto restart;
-	}
-
-	return NULL;
-#else
 	bootmem_data_t *bdata;
 	void *region;
 
@@ -737,7 +628,6 @@ restart:
 	}
 
 	return NULL;
-#endif
 }
 
 /**
@@ -758,10 +648,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
 {
 	unsigned long limit = 0;
 
-#ifdef CONFIG_NO_BOOTMEM
-	limit = -1UL;
-#endif
-
 	return ___alloc_bootmem_nopanic(size, align, goal, limit);
 }
 
@@ -798,14 +684,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 {
 	unsigned long limit = 0;
 
-#ifdef CONFIG_NO_BOOTMEM
-	limit = -1UL;
-#endif
-
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
-#ifndef CONFIG_NO_BOOTMEM
 static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
@@ -822,7 +703,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 
 	return ___alloc_bootmem(size, align, goal, limit);
 }
-#endif
 
 /**
  * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -842,24 +722,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
-	void *ptr;
-
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-#ifdef CONFIG_NO_BOOTMEM
-	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-					 goal, -1ULL);
-	if (ptr)
-		return ptr;
-
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					 goal, -1ULL);
-#else
-	ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
-#endif
-
-	return ptr;
+	return  ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
 }
 
 void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -880,13 +746,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 		unsigned long new_goal;
 
 		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
-#ifdef CONFIG_NO_BOOTMEM
-		ptr =  __alloc_memory_core_early(pgdat->node_id, size, align,
-						 new_goal, -1ULL);
-#else
 		ptr = alloc_bootmem_core(pgdat->bdata, size, align,
 						 new_goal, 0);
-#endif
 		if (ptr)
 			return ptr;
 	}
@@ -907,16 +768,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 void * __init alloc_bootmem_section(unsigned long size,
 				    unsigned long section_nr)
 {
-#ifdef CONFIG_NO_BOOTMEM
-	unsigned long pfn, goal, limit;
-
-	pfn = section_nr_to_pfn(section_nr);
-	goal = pfn << PAGE_SHIFT;
-	limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
-
-	return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
-					 SMP_CACHE_BYTES, goal, limit);
-#else
 	bootmem_data_t *bdata;
 	unsigned long pfn, goal, limit;
 
@@ -926,7 +777,6 @@ void * __init alloc_bootmem_section(unsigned long size,
 	bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
 
 	return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
-#endif
 }
 #endif
 
@@ -938,16 +788,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-#ifdef CONFIG_NO_BOOTMEM
-	ptr =  __alloc_memory_core_early(pgdat->node_id, size, align,
-						 goal, -1ULL);
-#else
 	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return ptr;
 
 	ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
-#endif
 	if (ptr)
 		return ptr;
 
@@ -995,21 +840,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 				       unsigned long align, unsigned long goal)
 {
-	void *ptr;
-
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-#ifdef CONFIG_NO_BOOTMEM
-	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+	return ___alloc_bootmem_node(pgdat->bdata, size, align,
 				goal, ARCH_LOW_ADDRESS_LIMIT);
-	if (ptr)
-		return ptr;
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-				goal, ARCH_LOW_ADDRESS_LIMIT);
-#else
-	ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
-				goal, ARCH_LOW_ADDRESS_LIMIT);
-#endif
-	return ptr;
 }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
new file mode 100644
index 0000000..f220b8d
--- /dev/null
+++ b/mm/nobootmem.c
@@ -0,0 +1,405 @@
+/*
+ *  bootmem - A boot-time physical memory allocator and configurator
+ *
+ *  Copyright (C) 1999 Ingo Molnar
+ *                1999 Kanoj Sarcar, SGI
+ *                2008 Johannes Weiner
+ *
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
+ */
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include <linux/range.h>
+#include <linux/memblock.h>
+
+#include <asm/bug.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+#include "internal.h"
+
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+
+#ifdef CONFIG_CRASH_DUMP
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+#endif
+
+/*
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system.  Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+	unsigned long cursor, end;
+
+	kmemleak_free_part(__va(addr), size);
+
+	cursor = PFN_UP(addr);
+	end = PFN_DOWN(addr + size);
+
+	for (; cursor < end; cursor++) {
+		__free_pages_bootmem(pfn_to_page(cursor), 0);
+		totalram_pages++;
+	}
+}
+
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+	int i;
+	unsigned long start_aligned, end_aligned;
+	int order = ilog2(BITS_PER_LONG);
+
+	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+	end_aligned = end & ~(BITS_PER_LONG - 1);
+
+	if (end_aligned <= start_aligned) {
+		for (i = start; i < end; i++)
+			__free_pages_bootmem(pfn_to_page(i), 0);
+
+		return;
+	}
+
+	for (i = start; i < start_aligned; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+
+	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+		__free_pages_bootmem(pfn_to_page(i), order);
+
+	for (i = end_aligned; i < end; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+}
+
+unsigned long __init free_all_memory_core_early(int nodeid)
+{
+	int i;
+	u64 start, end;
+	unsigned long count = 0;
+	struct range *range = NULL;
+	int nr_range;
+
+	nr_range = get_free_all_memory_range(&range, nodeid);
+
+	for (i = 0; i < nr_range; i++) {
+		start = range[i].start;
+		end = range[i].end;
+		count += end - start;
+		__free_pages_memory(start, end);
+	}
+
+	return count;
+}
+
+/**
+ * free_all_bootmem_node - release a node's free pages to the buddy allocator
+ * @pgdat: node to be released
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+{
+	register_page_bootmem_info_node(pgdat);
+
+	/* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+	return 0;
+}
+
+/**
+ * free_all_bootmem - release free pages to the buddy allocator
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem(void)
+{
+	/*
+	 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+	 *  because in some case like Node0 doesnt have RAM installed
+	 *  low ram will be on Node1
+	 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
+	 *  will be used instead of only Node0 related
+	 */
+	return free_all_memory_core_early(MAX_NUMNODES);
+}
+
+/**
+ * free_bootmem_node - mark a page range as usable
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must reside completely on the specified node.
+ */
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+			      unsigned long size)
+{
+	kmemleak_free_part(__va(physaddr), size);
+	memblock_x86_free_range(physaddr, physaddr + size);
+}
+
+/**
+ * free_bootmem - mark a page range as usable
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+void __init free_bootmem(unsigned long addr, unsigned long size)
+{
+	kmemleak_free_part(__va(addr), size);
+	memblock_x86_free_range(addr, addr + size);
+}
+
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+					unsigned long align,
+					unsigned long goal,
+					unsigned long limit)
+{
+	void *ptr;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc(size, GFP_NOWAIT);
+
+restart:
+
+	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+
+	if (ptr)
+		return ptr;
+
+	if (goal != 0) {
+		goal = 0;
+		goto restart;
+	}
+
+	return NULL;
+}
+
+/**
+ * __alloc_bootmem_nopanic - allocate boot memory without panicking
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * Returns NULL on failure.
+ */
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+					unsigned long goal)
+{
+	unsigned long limit = -1UL;
+
+	return ___alloc_bootmem_nopanic(size, align, goal, limit);
+}
+
+static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
+					unsigned long goal, unsigned long limit)
+{
+	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+
+	if (mem)
+		return mem;
+	/*
+	 * Whoops, we cannot satisfy the allocation request.
+	 */
+	printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of memory");
+	return NULL;
+}
+
+/**
+ * __alloc_bootmem - allocate boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+			      unsigned long goal)
+{
+	unsigned long limit = -1UL;
+
+	return ___alloc_bootmem(size, align, goal, limit);
+}
+
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	void *ptr;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+					 goal, -1ULL);
+	if (ptr)
+		return ptr;
+
+	return __alloc_memory_core_early(MAX_NUMNODES, size, align,
+					 goal, -1ULL);
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+	unsigned long end_pfn;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	/* update goal according ...MAX_DMA32_PFN */
+	end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+	if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+	    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+		void *ptr;
+		unsigned long new_goal;
+
+		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+		ptr =  __alloc_memory_core_early(pgdat->node_id, size, align,
+						 new_goal, -1ULL);
+		if (ptr)
+			return ptr;
+	}
+#endif
+
+	return __alloc_bootmem_node(pgdat, size, align, goal);
+
+}
+
+#ifdef CONFIG_SPARSEMEM
+/**
+ * alloc_bootmem_section - allocate boot memory from a specific section
+ * @size: size of the request in bytes
+ * @section_nr: sparse map section to allocate from
+ *
+ * Return NULL on failure.
+ */
+void * __init alloc_bootmem_section(unsigned long size,
+				    unsigned long section_nr)
+{
+	unsigned long pfn, goal, limit;
+
+	pfn = section_nr_to_pfn(section_nr);
+	goal = pfn << PAGE_SHIFT;
+	limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+
+	return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
+					 SMP_CACHE_BYTES, goal, limit);
+}
+#endif
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	void *ptr;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	ptr =  __alloc_memory_core_early(pgdat->node_id, size, align,
+						 goal, -1ULL);
+	if (ptr)
+		return ptr;
+
+	return __alloc_bootmem_nopanic(size, align, goal);
+}
+
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT	0xffffffffUL
+#endif
+
+/**
+ * __alloc_bootmem_low - allocate low boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
+				  unsigned long goal)
+{
+	return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+}
+
+/**
+ * __alloc_bootmem_low_node - allocate low boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+				       unsigned long align, unsigned long goal)
+{
+	void *ptr;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+				goal, ARCH_LOW_ADDRESS_LIMIT);
+	if (ptr)
+		return ptr;
+
+	return  __alloc_memory_core_early(MAX_NUMNODES, size, align,
+				goal, ARCH_LOW_ADDRESS_LIMIT);
+}
-- 
cgit v0.10.2


From e782ab421bbba1912c87934bd0e8998630736418 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 24 Feb 2011 14:43:06 +0100
Subject: bootmem: Move contig_page_data definition to bootmem.c/nobootmem.c

Now that bootmem.c and nobootmem.c are separate, it's cleaner to
define contig_page_data in each file than in page_alloc.c with #ifdef.
Move it.

This patch doesn't introduce any behavior change.

-v2: According to Andrew, fixed the struct layout.
-tj: Updated commit description.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 4403e2f..07aeb89 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -23,6 +23,13 @@
 
 #include "internal.h"
 
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data = {
+	.bdata = &bootmem_node_data[0]
+};
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
 unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index f220b8d..6a018e4 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -23,6 +23,11 @@
 
 #include "internal.h"
 
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data;
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
 unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 887ce3b..a243a7f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4841,15 +4841,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 	dma_reserve = new_dma_reserve;
 }
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = {
-#ifndef CONFIG_NO_BOOTMEM
- .bdata = &bootmem_node_data[0]
-#endif
- };
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
 void __init free_area_init(unsigned long *zones_size)
 {
 	free_area_init_node(0, zones_size,
-- 
cgit v0.10.2


From 8bc1f91e1f0e977fb95b11d8fa686f5091888110 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 24 Feb 2011 14:43:06 +0100
Subject: bootmem: Move __alloc_memory_core_early() to nobootmem.c

Now that bootmem.c and nobootmem.c are separate, there's no reason to
define __alloc_memory_core_early(), which is used only by nobootmem,
inside #ifdef in page_alloc.c.  Move it to nobootmem.c and make it
static.

This patch doesn't introduce any behavior change.

-tj: Updated commit description.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6385fc..679300c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1309,8 +1309,6 @@ int add_from_early_node_map(struct range *range, int az,
 				   int nr_range, int nid);
 u64 __init find_memory_core_early(int nid, u64 size, u64 align,
 					u64 goal, u64 limit);
-void *__alloc_memory_core_early(int nodeid, u64 size, u64 align,
-				 u64 goal, u64 limit);
 typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
 extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
 extern void sparse_memory_present_with_active_regions(int nid);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6a018e4..e2bdb070 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -40,6 +40,31 @@ unsigned long max_pfn;
 unsigned long saved_max_pfn;
 #endif
 
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	void *ptr;
+	u64 addr;
+
+	if (limit > memblock.current_limit)
+		limit = memblock.current_limit;
+
+	addr = find_memory_core_early(nid, size, align, goal, limit);
+
+	if (addr == MEMBLOCK_ERROR)
+		return NULL;
+
+	ptr = phys_to_virt(addr);
+	memset(ptr, 0, size);
+	memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+	/*
+	 * The min_count is set to 0 so that bootmem allocated blocks
+	 * are never reported as leaks.
+	 */
+	kmemleak_alloc(ptr, size, 0, 0);
+	return ptr;
+}
+
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a243a7f..6035136 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3780,34 +3780,6 @@ int __init add_from_early_node_map(struct range *range, int az,
 	return nr_range;
 }
 
-#ifdef CONFIG_NO_BOOTMEM
-void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-					u64 goal, u64 limit)
-{
-	void *ptr;
-	u64 addr;
-
-	if (limit > memblock.current_limit)
-		limit = memblock.current_limit;
-
-	addr = find_memory_core_early(nid, size, align, goal, limit);
-
-	if (addr == MEMBLOCK_ERROR)
-		return NULL;
-
-	ptr = phys_to_virt(addr);
-	memset(ptr, 0, size);
-	memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
-	/*
-	 * The min_count is set to 0 so that bootmem allocated blocks
-	 * are never reported as leaks.
-	 */
-	kmemleak_alloc(ptr, size, 0, 0);
-	return ptr;
-}
-#endif
-
-
 void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
 {
 	int i;
-- 
cgit v0.10.2


From d1b19426b04787e48f2689923e28d37b488969b0 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 24 Feb 2011 14:46:24 +0100
Subject: x86: Rename e820_table_* to pgt_buf_*

e820_table_{start|end|top}, which are used to buffer page table
allocation during early boot, are now derived from memblock and don't
have much to do with e820.  Change the names so that they reflect what
they're used for.

This patch doesn't introduce any behavior change.

-v2: Ingo found that earlier patch "x86: Use early pre-allocated page
     table buffer top-down" caused crash on 32bit and needed to be
     dropped.  This patch was updated to reflect the change.

-tj: Updated commit description.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 36fb1a6a..8dbe353 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start,
 			     unsigned long page_size_mask);
 
 
-extern unsigned long __initdata e820_table_start;
-extern unsigned long __meminitdata e820_table_end;
-extern unsigned long __meminitdata e820_table_top;
+extern unsigned long __initdata pgt_buf_start;
+extern unsigned long __meminitdata pgt_buf_end;
+extern unsigned long __meminitdata pgt_buf_top;
 
 #endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index b8054e0..286d289 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,9 +18,9 @@
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-unsigned long __initdata e820_table_start;
-unsigned long __meminitdata e820_table_end;
-unsigned long __meminitdata e820_table_top;
+unsigned long __initdata pgt_buf_start;
+unsigned long __meminitdata pgt_buf_end;
+unsigned long __meminitdata pgt_buf_top;
 
 int after_bootmem;
 
@@ -73,12 +73,12 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 	if (base == MEMBLOCK_ERROR)
 		panic("Cannot find space for the kernel page tables");
 
-	e820_table_start = base >> PAGE_SHIFT;
-	e820_table_end = e820_table_start;
-	e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+	pgt_buf_start = base >> PAGE_SHIFT;
+	pgt_buf_end = pgt_buf_start;
+	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 
 	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+		end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
 }
 
 struct map_range {
@@ -272,9 +272,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	__flush_tlb_all();
 
-	if (!after_bootmem && e820_table_end > e820_table_start)
-		memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
-				 e820_table_end << PAGE_SHIFT, "PGTABLE");
+	if (!after_bootmem && pgt_buf_end > pgt_buf_start)
+		memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
+				 pgt_buf_end << PAGE_SHIFT, "PGTABLE");
 
 	if (!after_bootmem)
 		early_memtest(start, end);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5d43fa5..73ad7eb 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false;
 
 static __init void *alloc_low_page(void)
 {
-	unsigned long pfn = e820_table_end++;
+	unsigned long pfn = pgt_buf_end++;
 	void *adr;
 
-	if (pfn >= e820_table_top)
+	if (pfn >= pgt_buf_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = __va(pfn * PAGE_SIZE);
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
 	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
 	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-	    && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
-		|| (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
+	    && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
+		|| (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
 		pte_t *newpte;
 		int i;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 4f1f461..470cc47 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -314,7 +314,7 @@ void __init cleanup_highmap(void)
 
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-	unsigned long pfn = e820_table_end++;
+	unsigned long pfn = pgt_buf_end++;
 	void *adr;
 
 	if (after_bootmem) {
@@ -324,7 +324,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 		return adr;
 	}
 
-	if (pfn >= e820_table_top)
+	if (pfn >= pgt_buf_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e92b61..13783a1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1443,7 +1443,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 	 * early_ioremap fixmap slot, make sure it is RO.
 	 */
 	if (!is_early_ioremap_ptep(ptep) &&
-	    pfn >= e820_table_start && pfn < e820_table_end)
+	    pfn >= pgt_buf_start && pfn < pgt_buf_end)
 		pte = pte_wrprotect(pte);
 
 	return pte;
-- 
cgit v0.10.2


From 1f565a896ee139a70e1a16f74a4ec29707691b0b Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Fri, 25 Feb 2011 10:06:39 +0100
Subject: x86-64, NUMA: Fix size of numa_distance array

numa_distance should be sized like the SLIT, an NxN matrix where N is
the highest node id + 1.  This patch fixes the calculation to avoid
overflowing the array on the subsequent iteration.

-tj: The original patch used last index to calculate size.  Yinghai
     pointed out it should be incremented so it is the number of
     elements instead of the last index to calculate the size of the
     table.  Updated accordingly.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index cccc01d..7757d22 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -414,7 +414,8 @@ static int __init numa_alloc_distance(void)
 
 	for_each_node_mask(i, nodes_parsed)
 		cnt = i;
-	size = ++cnt * sizeof(numa_distance[0]);
+	cnt++;
+	size = cnt * cnt * sizeof(numa_distance[0]);
 
 	phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
 				      size, PAGE_SIZE);
-- 
cgit v0.10.2


From c16bfe9ac389b13a37ff617a09682ecc0685960f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 25 Feb 2011 09:30:29 -0300
Subject: perf top browser: Fix up exit keys

The left key was exiting 'perf top --tui' when it really shouldn't, it
was too easy to leave the live annotation window and then press one too
many <- and get out of the tool altogether.

Do just like the report TUI does, ignore the left key for exit and also
ask the user when pressing ESC if that is really what is wanted.

Reported-by: Mike Galbraith <efault@gmx.de>
Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c
index 2f47224..e9381ec 100644
--- a/tools/perf/util/ui/browsers/top.c
+++ b/tools/perf/util/ui/browsers/top.c
@@ -10,6 +10,7 @@
 #include "../../annotate.h"
 #include "../helpline.h"
 #include "../libslang.h"
+#include "../util.h"
 #include "../../evlist.h"
 #include "../../hist.h"
 #include "../../sort.h"
@@ -174,6 +175,12 @@ static int perf_top_browser__run(struct perf_top_browser *browser)
 			if (browser->selection)
 				perf_top_browser__annotate(browser);
 			break;
+		case NEWT_KEY_LEFT:
+			continue;
+		case NEWT_KEY_ESCAPE:
+			if (!ui__dialog_yesno("Do you really want to exit?"))
+				continue;
+			/* Fall thru */
 		default:
 			goto out;
 		}
-- 
cgit v0.10.2


From 7e9498705e810404ecf29bb2d6fa632b9484c609 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 25 Feb 2011 14:32:28 +0100
Subject: sched: Add #ifdef around irq time accounting functions

Get rid of this:

 kernel/sched.c:3731:13: warning: 'irqtime_account_idle_ticks' defined but not used
 kernel/sched.c:3732:13: warning: 'irqtime_account_process_tick' defined but not used

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110225133228.GD7469@osiris.boeblingen.de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 2effcb7..79bca16 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3687,6 +3687,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 /*
  * Account a tick to a process and cpustat
@@ -3753,6 +3754,7 @@ static void irqtime_account_idle_ticks(int ticks) {}
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 						struct rq *rq) {}
 #endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 /*
  * Account for involuntary wait time.
-- 
cgit v0.10.2


From b210b3bb1b002f27165325a5edb6ebce3c168e92 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 25 Feb 2011 11:33:31 -0300
Subject: perf ui browser: Introduce ui_browser__show_title

Needed because we were only showing the title in ui_browser__show,
not in ui_browser__run, and in the run loop we may be calling other
browsers that would then change the title, when we go back to the
previous browser, we need to redraw the title.

We could have done this as the Newt help line, with pop, etc, but I
don't think its worth, doing it explicitely, when needed (some browsers
may not use the title area at all) seems enough/more flexible.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c
index 60d6c81..611219f 100644
--- a/tools/perf/util/ui/browser.c
+++ b/tools/perf/util/ui/browser.c
@@ -157,6 +157,20 @@ void ui_browser__add_exit_keys(struct ui_browser *self, int keys[])
 	}
 }
 
+void __ui_browser__show_title(struct ui_browser *browser, const char *title)
+{
+	SLsmg_gotorc(0, 0);
+	ui_browser__set_color(browser, NEWT_COLORSET_ROOT);
+	slsmg_write_nstring(title, browser->width);
+}
+
+void ui_browser__show_title(struct ui_browser *browser, const char *title)
+{
+	pthread_mutex_lock(&ui__lock);
+	__ui_browser__show_title(browser, title);
+	pthread_mutex_unlock(&ui__lock);
+}
+
 int ui_browser__show(struct ui_browser *self, const char *title,
 		     const char *helpline, ...)
 {
@@ -180,9 +194,7 @@ int ui_browser__show(struct ui_browser *self, const char *title,
 		return -1;
 
 	pthread_mutex_lock(&ui__lock);
-	SLsmg_gotorc(0, 0);
-	ui_browser__set_color(self, NEWT_COLORSET_ROOT);
-	slsmg_write_nstring(title, self->width);
+	__ui_browser__show_title(self, title);
 
 	ui_browser__add_exit_keys(self, keys);
 	newtFormAddComponent(self->form, self->sb);
diff --git a/tools/perf/util/ui/browser.h b/tools/perf/util/ui/browser.h
index 0dc7e4d..fc63dda 100644
--- a/tools/perf/util/ui/browser.h
+++ b/tools/perf/util/ui/browser.h
@@ -24,7 +24,6 @@ struct ui_browser {
 	u32	      nr_entries;
 };
 
-
 void ui_browser__set_color(struct ui_browser *self, int color);
 void ui_browser__set_percent_color(struct ui_browser *self,
 				   double percent, bool current);
@@ -35,6 +34,8 @@ void ui_browser__reset_index(struct ui_browser *self);
 void ui_browser__gotorc(struct ui_browser *self, int y, int x);
 void ui_browser__add_exit_key(struct ui_browser *self, int key);
 void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]);
+void __ui_browser__show_title(struct ui_browser *browser, const char *title);
+void ui_browser__show_title(struct ui_browser *browser, const char *title);
 int ui_browser__show(struct ui_browser *self, const char *title,
 		     const char *helpline, ...);
 void ui_browser__hide(struct ui_browser *self);
-- 
cgit v0.10.2


From a906fdaacca49917d83e5032dfc31f694249ad10 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 25 Feb 2011 16:09:31 +0100
Subject: x86: dt: Cleanup local apic setup

Up to now we force enable the local apic in the devicetree setup
uncoditionally and set smp_found_config unconditionally to 1 when a
devicetree blob is available. This breaks, when local apic is disabled
in the Kconfig.

Make it consistent by initializing device tree explicitely before
smp_get_config() so a non lapic configuration could be used as well.
To be functional that would require to implement PIT as an interrupt
host, but the only user of this code until now is ce4100 which
requires apics to be available. So we leave this up to those who need
it.

Tested-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3c89694..4afe512 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -240,7 +240,7 @@ extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
 extern void enable_NMI_through_LVT0(void);
-extern int apic_force_enable(void);
+extern int apic_force_enable(unsigned long addr);
 
 /*
  * On 32bit this is mach-xxx local
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index f58361b..971e0b4 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -28,9 +28,8 @@ extern int of_ioapic;
 extern u64 initial_dtb;
 extern void add_dtb(u64 data);
 extern void x86_add_irq_domains(void);
-void x86_dtb_find_config(void);
-void x86_dtb_get_config(unsigned int unused);
 void __cpuinit x86_of_pci_init(void);
+void x86_dtb_init(void);
 
 static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
 {
@@ -46,8 +45,7 @@ static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
 static inline void add_dtb(u64 data) { }
 static inline void x86_add_irq_domains(void) { }
 static inline void x86_of_pci_init(void) { }
-#define x86_dtb_find_config x86_init_noop
-#define x86_dtb_get_config x86_init_uint_noop
+static inline void x86_dtb_init(void) { }
 #define of_ioapic 0
 #endif
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f0e0798..4f43312 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1563,7 +1563,7 @@ static int apic_verify(void)
 	return 0;
 }
 
-int apic_force_enable(void)
+int apic_force_enable(unsigned long addr)
 {
 	u32 h, l;
 
@@ -1579,7 +1579,7 @@ int apic_force_enable(void)
 	if (!(l & MSR_IA32_APICBASE_ENABLE)) {
 		pr_info("Local APIC disabled by BIOS -- reenabling.\n");
 		l &= ~MSR_IA32_APICBASE_BASE;
-		l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+		l |= MSR_IA32_APICBASE_ENABLE | addr;
 		wrmsr(MSR_IA32_APICBASE, l, h);
 		enabled_via_apicbase = 1;
 	}
@@ -1620,7 +1620,7 @@ static int __init detect_init_APIC(void)
 				"you can enable it with \"lapic\"\n");
 			return -1;
 		}
-		if (apic_force_enable())
+		if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
 			return -1;
 	} else {
 		if (apic_verify())
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 06e5e91..7a8cebc 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -26,6 +26,7 @@ static DEFINE_RAW_SPINLOCK(big_irq_lock);
 
 int __initdata of_ioapic;
 
+#ifdef CONFIG_X86_IO_APIC
 static void add_interrupt_host(struct irq_domain *ih)
 {
 	unsigned long flags;
@@ -34,6 +35,7 @@ static void add_interrupt_host(struct irq_domain *ih)
 	list_add(&ih->l, &irq_domains);
 	raw_spin_unlock_irqrestore(&big_irq_lock, flags);
 }
+#endif
 
 static struct irq_domain *get_ih_from_node(struct device_node *controller)
 {
@@ -223,18 +225,28 @@ static void __init dtb_setup_hpet(void)
 static void __init dtb_lapic_setup(void)
 {
 #ifdef CONFIG_X86_LOCAL_APIC
-	if (apic_force_enable())
+	struct device_node *dn;
+	struct resource r;
+	int ret;
+
+	dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
+	if (!dn)
+		return;
+
+	ret = of_address_to_resource(dn, 0, &r);
+	if (WARN_ON(ret))
 		return;
 
+	/* Did the boot loader setup the local APIC ? */
+	if (!cpu_has_apic) {
+		if (apic_force_enable(r.start))
+			return;
+	}
 	smp_found_config = 1;
 	pic_mode = 1;
-	/* Required for ioapic registration */
-	set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-	if (boot_cpu_physical_apicid == -1U)
-		boot_cpu_physical_apicid = read_apic_id();
-
+	register_lapic_address(r.start);
 	generic_processor_info(boot_cpu_physical_apicid,
-			GET_APIC_VERSION(apic_read(APIC_LVR)));
+			       GET_APIC_VERSION(apic_read(APIC_LVR)));
 #endif
 }
 
@@ -259,9 +271,6 @@ static void __init dtb_ioapic_setup(void)
 {
 	struct device_node *dn;
 
-	if (!smp_found_config)
-		return;
-
 	for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
 		dtb_add_ioapic(dn);
 
@@ -270,7 +279,6 @@ static void __init dtb_ioapic_setup(void)
 		return;
 	}
 	printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
-	smp_found_config = 0;
 }
 #else
 static void __init dtb_ioapic_setup(void) {}
@@ -282,14 +290,6 @@ static void __init dtb_apic_setup(void)
 	dtb_ioapic_setup();
 }
 
-void __init x86_dtb_find_config(void)
-{
-	if (initial_dtb)
-		smp_found_config = 1;
-	else
-		printk(KERN_ERR "Missing device tree!.\n");
-}
-
 #ifdef CONFIG_OF_FLATTREE
 static void __init x86_flattree_get_config(void)
 {
@@ -325,7 +325,7 @@ static void __init x86_flattree_get_config(void)
 static inline void x86_flattree_get_config(void) { }
 #endif
 
-void __init x86_dtb_get_config(unsigned int unused)
+void __init x86_dtb_init(void)
 {
 	x86_flattree_get_config();
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 33dcbce..b3143bc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1044,8 +1044,8 @@ void __init setup_arch(char **cmdline_p)
 	 * Read APIC and some other early information from ACPI tables.
 	 */
 	acpi_boot_init();
-
 	sfi_init();
+	x86_dtb_init();
 
 	/*
 	 * get boot-time SMP configuration:
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index b3436d3..68c0dbc 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -134,8 +134,8 @@ void __init x86_ce4100_early_setup(void)
 	x86_init.oem.arch_setup = sdv_arch_setup;
 	x86_platform.i8042_detect = ce4100_i8042_detect;
 	x86_init.resources.probe_roms = x86_init_noop;
-	x86_init.mpparse.get_smp_config = x86_dtb_get_config;
-	x86_init.mpparse.find_smp_config = x86_dtb_find_config;
+	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+	x86_init.mpparse.find_smp_config = x86_init_noop;
 
 #ifdef CONFIG_X86_IO_APIC
 	x86_init.pci.init_irq = sdv_pci_init;
-- 
cgit v0.10.2


From 1204e95689f9fbd245a4ce5c1b0cd0a9b77f8d25 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 25 Feb 2011 17:17:18 +0100
Subject: genirq: Make warning in handle_percpu_event useful

The WARN_ON_ONCE in handle_percpu_event() which emits a warning when
an action handler returns with interrupts enabled is not really
useful. It does not reveal the interrupt number and handler function
which caused it. Make it WARN_ONCE() and add the information.

Reported-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e099e9e..b110c83 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -64,7 +64,8 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 		res = action->handler(irq, action->dev_id);
 		trace_irq_handler_exit(irq, action, res);
 
-		if (WARN_ON_ONCE(!irqs_disabled()))
+		if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
+			      irq, action->handler))
 			local_irq_disable();
 
 		switch (res) {
-- 
cgit v0.10.2


From 702d4eb9b3de4398ab99cf0a4e799e552c7ab756 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Thu, 2 Dec 2010 17:54:50 +0000
Subject: xen: no need to delay xen_setup_shutdown_event for hvm guests anymore

Now that xenstore_ready is used correctly for PV on HVM guests too, we
don't need to delay the initialization of xen_setup_shutdown_event
anymore.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 2417727..b2a8d78 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -291,27 +291,18 @@ static int shutdown_event(struct notifier_block *notifier,
 	return NOTIFY_DONE;
 }
 
-static int __init __setup_shutdown_event(void)
-{
-	/* Delay initialization in the PV on HVM case */
-	if (xen_hvm_domain())
-		return 0;
-
-	if (!xen_pv_domain())
-		return -ENODEV;
-
-	return xen_setup_shutdown_event();
-}
-
 int xen_setup_shutdown_event(void)
 {
 	static struct notifier_block xenstore_notifier = {
 		.notifier_call = shutdown_event
 	};
+
+	if (!xen_domain())
+		return -ENODEV;
 	register_xenstore_notifier(&xenstore_notifier);
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
 
-subsys_initcall(__setup_shutdown_event);
+subsys_initcall(xen_setup_shutdown_event);
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index afbe041..319dd0a 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -156,9 +156,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
 	if (ret)
 		goto out;
 	xenbus_probe(NULL);
-	ret = xen_setup_shutdown_event();
-	if (ret)
-		goto out;
 	return 0;
 
 out:
-- 
cgit v0.10.2


From cff520b9c2ee1486ea9ff1dbc774510c62e5ecb9 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Thu, 2 Dec 2010 17:54:54 +0000
Subject: xen: do not use xen_info on HVM, set pv_info name to "Xen HVM"

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 50542ef..2f67e2e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1284,8 +1284,7 @@ static int init_hvm_pv_info(int *major, int *minor)
 
 	xen_setup_features();
 
-	pv_info = xen_info;
-	pv_info.kernel_rpl = 0;
+	pv_info.name = "Xen HVM";
 
 	xen_domain_type = XEN_HVM_DOMAIN;
 
-- 
cgit v0.10.2


From c80a420995e721099906607b07c09a24543b31d9 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Thu, 2 Dec 2010 17:55:00 +0000
Subject: xen-blkfront: handle Xen major numbers other than XENVBD

This patch makes sure blkfront handles correctly virtual device numbers
corresponding to Xen emulated IDE and SCSI disks: in those cases
blkfront translates the major number to XENVBD and the minor number to a
low xvd minor.

Note: this behaviour is different from what old xenlinux PV guests used
to do: they used to steal an IDE or SCSI major number and use it
instead.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d7aa39e..64d9c6d 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -120,6 +120,10 @@ static DEFINE_SPINLOCK(minor_lock);
 #define EXTENDED (1<<EXT_SHIFT)
 #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)
 
 #define DEV_NAME	"xvd"	/* name in /dev */
 
@@ -434,6 +438,65 @@ static void xlvbd_flush(struct blkfront_info *info)
 	       info->feature_flush ? "enabled" : "disabled");
 }
 
+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+	int major;
+	major = BLKIF_MAJOR(vdevice);
+	*minor = BLKIF_MINOR(vdevice);
+	switch (major) {
+		case XEN_IDE0_MAJOR:
+			*offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = ((*minor / 64) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_IDE1_MAJOR:
+			*offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK0_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK1_MAJOR:
+		case XEN_SCSI_DISK2_MAJOR:
+		case XEN_SCSI_DISK3_MAJOR:
+		case XEN_SCSI_DISK4_MAJOR:
+		case XEN_SCSI_DISK5_MAJOR:
+		case XEN_SCSI_DISK6_MAJOR:
+		case XEN_SCSI_DISK7_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + 
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK8_MAJOR:
+		case XEN_SCSI_DISK9_MAJOR:
+		case XEN_SCSI_DISK10_MAJOR:
+		case XEN_SCSI_DISK11_MAJOR:
+		case XEN_SCSI_DISK12_MAJOR:
+		case XEN_SCSI_DISK13_MAJOR:
+		case XEN_SCSI_DISK14_MAJOR:
+		case XEN_SCSI_DISK15_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + 
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XENVBD_MAJOR:
+			*offset = *minor / PARTS_PER_DISK;
+			break;
+		default:
+			printk(KERN_WARNING "blkfront: your disk configuration is "
+					"incorrect, please use an xvd device instead\n");
+			return -ENODEV;
+	}
+	return 0;
+}
 
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
@@ -441,7 +504,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 {
 	struct gendisk *gd;
 	int nr_minors = 1;
-	int err = -ENODEV;
+	int err;
 	unsigned int offset;
 	int minor;
 	int nr_parts;
@@ -456,12 +519,20 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	}
 
 	if (!VDEV_IS_EXTENDED(info->vdevice)) {
-		minor = BLKIF_MINOR(info->vdevice);
-		nr_parts = PARTS_PER_DISK;
+		err = xen_translate_vdev(info->vdevice, &minor, &offset);
+		if (err)
+			return err;		
+ 		nr_parts = PARTS_PER_DISK;
 	} else {
 		minor = BLKIF_MINOR_EXT(info->vdevice);
 		nr_parts = PARTS_PER_EXT_DISK;
+		offset = minor / nr_parts;
+		if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4)
+			printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+					"emulated IDE disks,\n\t choose an xvd device name"
+					"from xvde on\n", info->vdevice);
 	}
+	err = -ENODEV;
 
 	if ((minor % nr_parts) == 0)
 		nr_minors = nr_parts;
@@ -475,8 +546,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	if (gd == NULL)
 		goto release;
 
-	offset = minor / nr_parts;
-
 	if (nr_minors > 1) {
 		if (offset < 26)
 			sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c2d1fa4..68dd2b4 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -91,4 +91,25 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
 #define VDISK_REMOVABLE    0x2
 #define VDISK_READONLY     0x4
 
+/* Xen-defined major numbers for virtual disks, they look strangely
+ * familiar */
+#define XEN_IDE0_MAJOR	3
+#define XEN_IDE1_MAJOR	22
+#define XEN_SCSI_DISK0_MAJOR	8
+#define XEN_SCSI_DISK1_MAJOR	65
+#define XEN_SCSI_DISK2_MAJOR	66
+#define XEN_SCSI_DISK3_MAJOR	67
+#define XEN_SCSI_DISK4_MAJOR	68
+#define XEN_SCSI_DISK5_MAJOR	69
+#define XEN_SCSI_DISK6_MAJOR	70
+#define XEN_SCSI_DISK7_MAJOR	71
+#define XEN_SCSI_DISK8_MAJOR	128
+#define XEN_SCSI_DISK9_MAJOR	129
+#define XEN_SCSI_DISK10_MAJOR	130
+#define XEN_SCSI_DISK11_MAJOR	131
+#define XEN_SCSI_DISK12_MAJOR	132
+#define XEN_SCSI_DISK13_MAJOR	133
+#define XEN_SCSI_DISK14_MAJOR	134
+#define XEN_SCSI_DISK15_MAJOR	135
+
 #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
-- 
cgit v0.10.2


From 53d5522cad291a0e93a385e0594b6aea6b54a071 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Thu, 2 Dec 2010 17:55:05 +0000
Subject: xen: make the ballon driver work for hvm domains

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>

diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 43f9f02..9294f25 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -232,7 +232,7 @@ static int increase_reservation(unsigned long nr_pages)
 		set_phys_to_machine(pfn, frame_list[i]);
 
 		/* Link back into the page tables if not highmem. */
-		if (pfn < max_low_pfn) {
+		if (!xen_hvm_domain() && pfn < max_low_pfn) {
 			int ret;
 			ret = HYPERVISOR_update_va_mapping(
 				(unsigned long)__va(pfn << PAGE_SHIFT),
@@ -280,7 +280,7 @@ static int decrease_reservation(unsigned long nr_pages)
 
 		scrub_page(page);
 
-		if (!PageHighMem(page)) {
+		if (!xen_hvm_domain() && !PageHighMem(page)) {
 			ret = HYPERVISOR_update_va_mapping(
 				(unsigned long)__va(pfn << PAGE_SHIFT),
 				__pte_ma(0), 0);
@@ -392,15 +392,19 @@ static struct notifier_block xenstore_notifier;
 
 static int __init balloon_init(void)
 {
-	unsigned long pfn, extra_pfn_end;
+ 	unsigned long pfn, nr_pages, extra_pfn_end;
 	struct page *page;
 
-	if (!xen_pv_domain())
+	if (!xen_domain())
 		return -ENODEV;
 
 	pr_info("xen_balloon: Initialising balloon driver.\n");
 
-	balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
+ 	if (xen_pv_domain())
+ 		nr_pages = xen_start_info->nr_pages;
+ 	else
+ 		nr_pages = max_pfn;
+ 	balloon_stats.current_pages = min(nr_pages, max_pfn);
 	balloon_stats.target_pages  = balloon_stats.current_pages;
 	balloon_stats.balloon_low   = 0;
 	balloon_stats.balloon_high  = 0;
-- 
cgit v0.10.2


From 99bbb3a84a99cd04ab16b998b20f01a72cfa9f4f Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Thu, 2 Dec 2010 17:55:10 +0000
Subject: xen: PV on HVM: support PV spinlocks and IPIs

Initialize PV spinlocks on boot CPU right after native_smp_prepare_cpus
(that switch to APIC mode and initialize APIC routing); on secondary
CPUs on CPU_UP_PREPARE.

Enable the usage of event channels to send and receive IPIs when
running as a PV on HVM guest.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2f67e2e..fe02574 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1330,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 	switch (action) {
 	case CPU_UP_PREPARE:
 		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+		if (xen_have_vector_callback)
+			xen_init_lock_cpu(cpu);
 		break;
 	default:
 		break;
@@ -1354,6 +1356,7 @@ static void __init xen_hvm_guest_init(void)
 
 	if (xen_feature(XENFEAT_hvm_callback_vector))
 		xen_have_vector_callback = 1;
+	xen_hvm_smp_init();
 	register_cpu_notifier(&xen_hvm_cpu_notifier);
 	xen_unplug_emulated_devices();
 	have_vcpu_info_placement = 0;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 72a4c79..3061244 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -509,3 +509,41 @@ void __init xen_smp_init(void)
 	xen_fill_possible_map();
 	xen_init_spinlocks();
 }
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+	native_smp_prepare_cpus(max_cpus);
+	WARN_ON(xen_smp_intr_init(0));
+
+	if (!xen_have_vector_callback)
+		return;
+	xen_init_lock_cpu(0);
+	xen_init_spinlocks();
+}
+
+static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
+{
+	int rc;
+	rc = native_cpu_up(cpu);
+	WARN_ON (xen_smp_intr_init(cpu));
+	return rc;
+}
+
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+	unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+	unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
+	native_cpu_die(cpu);
+}
+
+void __init xen_hvm_smp_init(void)
+{
+	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+	smp_ops.cpu_up = xen_hvm_cpu_up;
+	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9d41bf9..3112f55 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void);
 
 #ifdef CONFIG_SMP
 void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
 
 extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
 #endif
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
-- 
cgit v0.10.2


From e057a4b6e0eb6701f6ec923be2075d4984cef51a Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Fri, 11 Feb 2011 17:55:13 +0000
Subject: xen: fix compile issue if XEN is enabled but XEN_PVHVM is disabled

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>

diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 9bbd63a..4a3d3dd 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -28,6 +28,7 @@ void xen_pre_suspend(void)
 
 void xen_hvm_post_suspend(int suspend_cancelled)
 {
+#ifdef CONFIG_XEN_PVHVM
 	int cpu;
 	xen_hvm_init_shared_info();
 	xen_callback_vector();
@@ -37,6 +38,7 @@ void xen_hvm_post_suspend(int suspend_cancelled)
 			xen_setup_runstate_info(cpu);
 		}
 	}
+#endif
 }
 
 void xen_post_suspend(int suspend_cancelled)
-- 
cgit v0.10.2


From 552717231e50b478dfd19d63fd97879476ae051d Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: do not respond to unknown xenstore control requests

The PV xenbus control/shutdown node is written by the toolstack as a
request to the guest to perform a particular action (shutdown, reboot,
suspend etc). The guest is expected to acknowledge that it will
complete a request by clearing the control node.

Previously it would acknowledge any request, even if it did not know
what to do with it. Specifically in the case where CONFIG_PM_SLEEP is
not enabled the kernel would acknowledge a suspend request even though
it was not actually going to do anything.

Instead make the kernel only acknowledge requests if it is actually
going to do something with it. This will improve the toolstack's
ability to diagnose and deal with failures.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index b2a8d78..972bf78 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -172,12 +172,39 @@ out:
 }
 #endif	/* CONFIG_PM_SLEEP */
 
+struct shutdown_handler {
+	const char *command;
+	void (*cb)(void);
+};
+
+static void do_poweroff(void)
+{
+	shutting_down = SHUTDOWN_POWEROFF;
+	orderly_poweroff(false);
+}
+
+static void do_reboot(void)
+{
+	shutting_down = SHUTDOWN_POWEROFF; /* ? */
+	ctrl_alt_del();
+}
+
 static void shutdown_handler(struct xenbus_watch *watch,
 			     const char **vec, unsigned int len)
 {
 	char *str;
 	struct xenbus_transaction xbt;
 	int err;
+	static struct shutdown_handler handlers[] = {
+		{ "poweroff",	do_poweroff },
+		{ "halt",	do_poweroff },
+		{ "reboot",	do_reboot   },
+#ifdef CONFIG_PM_SLEEP
+		{ "suspend",	do_suspend  },
+#endif
+		{NULL, NULL},
+	};
+	static struct shutdown_handler *handler;
 
 	if (shutting_down != SHUTDOWN_INVALID)
 		return;
@@ -194,7 +221,14 @@ static void shutdown_handler(struct xenbus_watch *watch,
 		return;
 	}
 
-	xenbus_write(xbt, "control", "shutdown", "");
+	for (handler = &handlers[0]; handler->command; handler++) {
+		if (strcmp(str, handler->command) == 0)
+			break;
+	}
+
+	/* Only acknowledge commands which we are prepared to handle. */
+	if (handler->cb)
+		xenbus_write(xbt, "control", "shutdown", "");
 
 	err = xenbus_transaction_end(xbt, 0);
 	if (err == -EAGAIN) {
@@ -202,17 +236,8 @@ static void shutdown_handler(struct xenbus_watch *watch,
 		goto again;
 	}
 
-	if (strcmp(str, "poweroff") == 0 ||
-	    strcmp(str, "halt") == 0) {
-		shutting_down = SHUTDOWN_POWEROFF;
-		orderly_poweroff(false);
-	} else if (strcmp(str, "reboot") == 0) {
-		shutting_down = SHUTDOWN_POWEROFF; /* ? */
-		ctrl_alt_del();
-#ifdef CONFIG_PM_SLEEP
-	} else if (strcmp(str, "suspend") == 0) {
-		do_suspend();
-#endif
+	if (handler->cb) {
+		handler->cb();
 	} else {
 		printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
 		shutting_down = SHUTDOWN_INVALID;
-- 
cgit v0.10.2


From 8e15597fa430c03415e2268dfbae0f262b948788 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: use new schedop interface for suspend

Take the opportunity to comment on the semantics of the PV guest
suspend hypercall arguments.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a3c28ae..a7d5823 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value)
 #endif
 
 static inline int
-HYPERVISOR_suspend(unsigned long srec)
+HYPERVISOR_suspend(unsigned long start_info_mfn)
 {
-	return _hypercall3(int, sched_op, SCHEDOP_shutdown,
-			   SHUTDOWN_suspend, srec);
+	struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
+
+	/*
+	 * For a PV guest the tools require that the start_info mfn be
+	 * present in rdx/edx when the hypercall is made. Per the
+	 * hypercall calling convention this is the third hypercall
+	 * argument, which is start_info_mfn here.
+	 */
+	return _hypercall3(int, sched_op_new, SCHEDOP_shutdown, &r, start_info_mfn);
 }
 
 static inline int
-- 
cgit v0.10.2


From a8b7458363b9174f3c2196ca6085630b4b30b7a1 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: switch to new schedop hypercall by default.

Rename old interface to sched_op_compat and rename sched_op_new to
simply sched_op.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a7d5823..8508bfe 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set)
 static inline int
 HYPERVISOR_sched_op(int cmd, void *arg)
 {
-	return _hypercall2(int, sched_op_new, cmd, arg);
+	return _hypercall2(int, sched_op, cmd, arg);
 }
 
 static inline long
@@ -432,7 +432,7 @@ HYPERVISOR_suspend(unsigned long start_info_mfn)
 	 * hypercall calling convention this is the third hypercall
 	 * argument, which is start_info_mfn here.
 	 */
-	return _hypercall3(int, sched_op_new, SCHEDOP_shutdown, &r, start_info_mfn);
+	return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
 }
 
 static inline int
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 2befa3e..b33257b 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -30,7 +30,7 @@
 #define __HYPERVISOR_stack_switch          3
 #define __HYPERVISOR_set_callbacks         4
 #define __HYPERVISOR_fpu_taskswitch        5
-#define __HYPERVISOR_sched_op              6
+#define __HYPERVISOR_sched_op_compat       6
 #define __HYPERVISOR_dom0_op               7
 #define __HYPERVISOR_set_debugreg          8
 #define __HYPERVISOR_get_debugreg          9
@@ -52,7 +52,7 @@
 #define __HYPERVISOR_mmuext_op            26
 #define __HYPERVISOR_acm_op               27
 #define __HYPERVISOR_nmi_op               28
-#define __HYPERVISOR_sched_op_new         29
+#define __HYPERVISOR_sched_op             29
 #define __HYPERVISOR_callback_op          30
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
-- 
cgit v0.10.2


From bd1c0ad28451df4610d352c7e438213c84de0c28 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: suspend: use HYPERVISOR_suspend for PVHVM case instead of open
 coding

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 972bf78..4dd0186 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -38,7 +38,6 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
 static int xen_hvm_suspend(void *data)
 {
 	int err;
-	struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
 	int *cancelled = data;
 
 	BUG_ON(!irqs_disabled());
@@ -50,7 +49,12 @@ static int xen_hvm_suspend(void *data)
 		return err;
 	}
 
-	*cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
+	/*
+	 * This hypercall returns 1 if suspend was cancelled
+	 * or the domain was merely checkpointed, and 0 if it
+	 * is resuming in a new domain.
+	 */
+	*cancelled = HYPERVISOR_suspend(0UL);
 
 	xen_hvm_post_suspend(*cancelled);
 	gnttab_resume();
-- 
cgit v0.10.2


From ceb180294790c8a6a437533488616f6b591b49d0 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: suspend: refactor cancellation flag into a structure

Will add extra fields in subsequent patches.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 4dd0186..5c0184f 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -34,11 +34,15 @@ enum shutdown_state {
 /* Ignore multiple shutdown requests. */
 static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
 
+struct suspend_info {
+	int cancelled;
+};
+
 #ifdef CONFIG_PM_SLEEP
 static int xen_hvm_suspend(void *data)
 {
+	struct suspend_info *si = data;
 	int err;
-	int *cancelled = data;
 
 	BUG_ON(!irqs_disabled());
 
@@ -54,12 +58,12 @@ static int xen_hvm_suspend(void *data)
 	 * or the domain was merely checkpointed, and 0 if it
 	 * is resuming in a new domain.
 	 */
-	*cancelled = HYPERVISOR_suspend(0UL);
+	si->cancelled = HYPERVISOR_suspend(0UL);
 
-	xen_hvm_post_suspend(*cancelled);
+	xen_hvm_post_suspend(si->cancelled);
 	gnttab_resume();
 
-	if (!*cancelled) {
+	if (!si->cancelled) {
 		xen_irq_resume();
 		xen_console_resume();
 		xen_timer_resume();
@@ -72,8 +76,8 @@ static int xen_hvm_suspend(void *data)
 
 static int xen_suspend(void *data)
 {
+	struct suspend_info *si = data;
 	int err;
-	int *cancelled = data;
 
 	BUG_ON(!irqs_disabled());
 
@@ -93,13 +97,13 @@ static int xen_suspend(void *data)
 	 * or the domain was merely checkpointed, and 0 if it
 	 * is resuming in a new domain.
 	 */
-	*cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
+	si->cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
 
-	xen_post_suspend(*cancelled);
+	xen_post_suspend(si->cancelled);
 	gnttab_resume();
 	xen_mm_unpin_all();
 
-	if (!*cancelled) {
+	if (!si->cancelled) {
 		xen_irq_resume();
 		xen_console_resume();
 		xen_timer_resume();
@@ -113,7 +117,7 @@ static int xen_suspend(void *data)
 static void do_suspend(void)
 {
 	int err;
-	int cancelled = 1;
+	struct suspend_info si;
 
 	shutting_down = SHUTDOWN_SUSPEND;
 
@@ -143,20 +147,22 @@ static void do_suspend(void)
 		goto out_resume;
 	}
 
+	si.cancelled = 1;
+
 	if (xen_hvm_domain())
-		err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0));
+		err = stop_machine(xen_hvm_suspend, &si, cpumask_of(0));
 	else
-		err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+		err = stop_machine(xen_suspend, &si, cpumask_of(0));
 
 	dpm_resume_noirq(PMSG_RESUME);
 
 	if (err) {
 		printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
-		cancelled = 1;
+		si.cancelled = 1;
 	}
 
 out_resume:
-	if (!cancelled) {
+	if (!si.cancelled) {
 		xen_arch_resume();
 		xs_resume();
 	} else
-- 
cgit v0.10.2


From 36b401e2c2788c7b4881115ddbbff603fe4cf78d Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: suspend: pass extra hypercall argument via suspend_info struct

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 5c0184f..6ce6b91 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -36,6 +36,7 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
 
 struct suspend_info {
 	int cancelled;
+	unsigned long arg; /* extra hypercall argument */
 };
 
 #ifdef CONFIG_PM_SLEEP
@@ -58,7 +59,7 @@ static int xen_hvm_suspend(void *data)
 	 * or the domain was merely checkpointed, and 0 if it
 	 * is resuming in a new domain.
 	 */
-	si->cancelled = HYPERVISOR_suspend(0UL);
+	si->cancelled = HYPERVISOR_suspend(si->arg);
 
 	xen_hvm_post_suspend(si->cancelled);
 	gnttab_resume();
@@ -97,7 +98,7 @@ static int xen_suspend(void *data)
 	 * or the domain was merely checkpointed, and 0 if it
 	 * is resuming in a new domain.
 	 */
-	si->cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
+	si->cancelled = HYPERVISOR_suspend(si->arg);
 
 	xen_post_suspend(si->cancelled);
 	gnttab_resume();
@@ -150,6 +151,11 @@ static void do_suspend(void)
 	si.cancelled = 1;
 
 	if (xen_hvm_domain())
+		si.arg = 0UL;
+	else
+		si.arg = virt_to_mfn(xen_start_info);
+
+	if (xen_hvm_domain())
 		err = stop_machine(xen_hvm_suspend, &si, cpumask_of(0));
 	else
 		err = stop_machine(xen_suspend, &si, cpumask_of(0));
-- 
cgit v0.10.2


From 03c8142bd2fb3b87effa6ecb2f8957be588bc85f Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: suspend: add "arch" to pre/post suspend hooks

xen_pre_device_suspend is unused on ia64.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/ia64/xen/suspend.c b/arch/ia64/xen/suspend.c
index fd66b04..419c862 100644
--- a/arch/ia64/xen/suspend.c
+++ b/arch/ia64/xen/suspend.c
@@ -37,19 +37,14 @@ xen_mm_unpin_all(void)
 	/* nothing */
 }
 
-void xen_pre_device_suspend(void)
-{
-	/* nothing */
-}
-
 void
-xen_pre_suspend()
+xen_arch_pre_suspend()
 {
 	/* nothing */
 }
 
 void
-xen_post_suspend(int suspend_cancelled)
+xen_arch_post_suspend(int suspend_cancelled)
 {
 	if (suspend_cancelled)
 		return;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 4a3d3dd..45329c8 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-void xen_pre_suspend(void)
+void xen_arch_pre_suspend(void)
 {
 	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
 	xen_start_info->console.domU.mfn =
@@ -26,7 +26,7 @@ void xen_pre_suspend(void)
 		BUG();
 }
 
-void xen_hvm_post_suspend(int suspend_cancelled)
+void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
 #ifdef CONFIG_XEN_PVHVM
 	int cpu;
@@ -41,7 +41,7 @@ void xen_hvm_post_suspend(int suspend_cancelled)
 #endif
 }
 
-void xen_post_suspend(int suspend_cancelled)
+void xen_arch_post_suspend(int suspend_cancelled)
 {
 	xen_build_mfn_list_list();
 
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 6ce6b91..134eb73 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -61,7 +61,7 @@ static int xen_hvm_suspend(void *data)
 	 */
 	si->cancelled = HYPERVISOR_suspend(si->arg);
 
-	xen_hvm_post_suspend(si->cancelled);
+	xen_arch_hvm_post_suspend(si->cancelled);
 	gnttab_resume();
 
 	if (!si->cancelled) {
@@ -91,7 +91,7 @@ static int xen_suspend(void *data)
 
 	xen_mm_pin_all();
 	gnttab_suspend();
-	xen_pre_suspend();
+	xen_arch_pre_suspend();
 
 	/*
 	 * This hypercall returns 1 if suspend was cancelled
@@ -100,7 +100,7 @@ static int xen_suspend(void *data)
 	 */
 	si->cancelled = HYPERVISOR_suspend(si->arg);
 
-	xen_post_suspend(si->cancelled);
+	xen_arch_post_suspend(si->cancelled);
 	gnttab_resume();
 	xen_mm_unpin_all();
 
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 98b9215..03c85d7 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -5,9 +5,9 @@
 
 DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
 
-void xen_pre_suspend(void);
-void xen_post_suspend(int suspend_cancelled);
-void xen_hvm_post_suspend(int suspend_cancelled);
+void xen_arch_pre_suspend(void);
+void xen_arch_post_suspend(int suspend_cancelled);
+void xen_arch_hvm_post_suspend(int suspend_cancelled);
 
 void xen_mm_pin_all(void);
 void xen_mm_unpin_all(void);
-- 
cgit v0.10.2


From 82043bb60d24d2897074905c94be5a53071e8913 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: suspend: refactor non-arch specific pre/post suspend hooks

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 134eb73..33312c0 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -39,6 +39,23 @@ struct suspend_info {
 	unsigned long arg; /* extra hypercall argument */
 };
 
+static void xen_hvm_post_suspend(void)
+{
+	gnttab_resume();
+}
+
+static void xen_pre_suspend(void)
+{
+	xen_mm_pin_all();
+	gnttab_suspend();
+}
+
+static void xen_post_suspend(void)
+{
+	gnttab_resume();
+	xen_mm_unpin_all();
+}
+
 #ifdef CONFIG_PM_SLEEP
 static int xen_hvm_suspend(void *data)
 {
@@ -62,7 +79,7 @@ static int xen_hvm_suspend(void *data)
 	si->cancelled = HYPERVISOR_suspend(si->arg);
 
 	xen_arch_hvm_post_suspend(si->cancelled);
-	gnttab_resume();
+	xen_hvm_post_suspend();
 
 	if (!si->cancelled) {
 		xen_irq_resume();
@@ -89,8 +106,7 @@ static int xen_suspend(void *data)
 		return err;
 	}
 
-	xen_mm_pin_all();
-	gnttab_suspend();
+	xen_pre_suspend();
 	xen_arch_pre_suspend();
 
 	/*
@@ -101,8 +117,7 @@ static int xen_suspend(void *data)
 	si->cancelled = HYPERVISOR_suspend(si->arg);
 
 	xen_arch_post_suspend(si->cancelled);
-	gnttab_resume();
-	xen_mm_unpin_all();
+	xen_post_suspend();
 
 	if (!si->cancelled) {
 		xen_irq_resume();
-- 
cgit v0.10.2


From 07af38102fc4f260cc5a2418ec833707f53cdf70 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: suspend: move arch specific pre/post suspend hooks into generic
 hooks

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 33312c0..a6bd2e9 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -39,8 +39,9 @@ struct suspend_info {
 	unsigned long arg; /* extra hypercall argument */
 };
 
-static void xen_hvm_post_suspend(void)
+static void xen_hvm_post_suspend(int cancelled)
 {
+	xen_arch_hvm_post_suspend(cancelled);
 	gnttab_resume();
 }
 
@@ -48,10 +49,12 @@ static void xen_pre_suspend(void)
 {
 	xen_mm_pin_all();
 	gnttab_suspend();
+	xen_arch_pre_suspend();
 }
 
-static void xen_post_suspend(void)
+static void xen_post_suspend(int cancelled)
 {
+	xen_arch_post_suspend(cancelled);
 	gnttab_resume();
 	xen_mm_unpin_all();
 }
@@ -78,8 +81,7 @@ static int xen_hvm_suspend(void *data)
 	 */
 	si->cancelled = HYPERVISOR_suspend(si->arg);
 
-	xen_arch_hvm_post_suspend(si->cancelled);
-	xen_hvm_post_suspend();
+	xen_hvm_post_suspend(si->cancelled);
 
 	if (!si->cancelled) {
 		xen_irq_resume();
@@ -107,7 +109,6 @@ static int xen_suspend(void *data)
 	}
 
 	xen_pre_suspend();
-	xen_arch_pre_suspend();
 
 	/*
 	 * This hypercall returns 1 if suspend was cancelled
@@ -116,8 +117,7 @@ static int xen_suspend(void *data)
 	 */
 	si->cancelled = HYPERVISOR_suspend(si->arg);
 
-	xen_arch_post_suspend(si->cancelled);
-	xen_post_suspend();
+	xen_post_suspend(si->cancelled);
 
 	if (!si->cancelled) {
 		xen_irq_resume();
-- 
cgit v0.10.2


From 55fb4acef7089a6d4d93ed8caae6c258d06cfaf7 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: suspend: pull pre/post suspend hooks out into suspend_info

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index a6bd2e9..5b7a0a9 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -37,6 +37,8 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
 struct suspend_info {
 	int cancelled;
 	unsigned long arg; /* extra hypercall argument */
+	void (*pre)(void);
+	void (*post)(int cancelled);
 };
 
 static void xen_hvm_post_suspend(int cancelled)
@@ -74,6 +76,9 @@ static int xen_hvm_suspend(void *data)
 		return err;
 	}
 
+	if (si->pre)
+		si->pre();
+
 	/*
 	 * This hypercall returns 1 if suspend was cancelled
 	 * or the domain was merely checkpointed, and 0 if it
@@ -81,7 +86,8 @@ static int xen_hvm_suspend(void *data)
 	 */
 	si->cancelled = HYPERVISOR_suspend(si->arg);
 
-	xen_hvm_post_suspend(si->cancelled);
+	if (si->post)
+		si->post(si->cancelled);
 
 	if (!si->cancelled) {
 		xen_irq_resume();
@@ -108,7 +114,8 @@ static int xen_suspend(void *data)
 		return err;
 	}
 
-	xen_pre_suspend();
+	if (si->pre)
+		si->pre();
 
 	/*
 	 * This hypercall returns 1 if suspend was cancelled
@@ -117,7 +124,8 @@ static int xen_suspend(void *data)
 	 */
 	si->cancelled = HYPERVISOR_suspend(si->arg);
 
-	xen_post_suspend(si->cancelled);
+	if (si->post)
+		si->post(si->cancelled);
 
 	if (!si->cancelled) {
 		xen_irq_resume();
@@ -165,10 +173,15 @@ static void do_suspend(void)
 
 	si.cancelled = 1;
 
-	if (xen_hvm_domain())
+	if (xen_hvm_domain()) {
 		si.arg = 0UL;
-	else
+		si.pre = NULL;
+		si.post = &xen_hvm_post_suspend;
+	} else {
 		si.arg = virt_to_mfn(xen_start_info);
+		si.pre = &xen_pre_suspend;
+		si.post = &xen_post_suspend;
+	}
 
 	if (xen_hvm_domain())
 		err = stop_machine(xen_hvm_suspend, &si, cpumask_of(0));
-- 
cgit v0.10.2


From b056b6a0144de90707cd22cf7b4f60bf69c86d59 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 17 Feb 2011 11:04:20 +0000
Subject: xen: suspend: remove xen_hvm_suspend

It is now identical to xen_suspend, the differences are encapsulated
in the suspend_info struct.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 5b7a0a9..ebb2928 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -62,44 +62,6 @@ static void xen_post_suspend(int cancelled)
 }
 
 #ifdef CONFIG_PM_SLEEP
-static int xen_hvm_suspend(void *data)
-{
-	struct suspend_info *si = data;
-	int err;
-
-	BUG_ON(!irqs_disabled());
-
-	err = sysdev_suspend(PMSG_SUSPEND);
-	if (err) {
-		printk(KERN_ERR "xen_hvm_suspend: sysdev_suspend failed: %d\n",
-		       err);
-		return err;
-	}
-
-	if (si->pre)
-		si->pre();
-
-	/*
-	 * This hypercall returns 1 if suspend was cancelled
-	 * or the domain was merely checkpointed, and 0 if it
-	 * is resuming in a new domain.
-	 */
-	si->cancelled = HYPERVISOR_suspend(si->arg);
-
-	if (si->post)
-		si->post(si->cancelled);
-
-	if (!si->cancelled) {
-		xen_irq_resume();
-		xen_console_resume();
-		xen_timer_resume();
-	}
-
-	sysdev_resume();
-
-	return 0;
-}
-
 static int xen_suspend(void *data)
 {
 	struct suspend_info *si = data;
@@ -183,10 +145,7 @@ static void do_suspend(void)
 		si.post = &xen_post_suspend;
 	}
 
-	if (xen_hvm_domain())
-		err = stop_machine(xen_hvm_suspend, &si, cpumask_of(0));
-	else
-		err = stop_machine(xen_suspend, &si, cpumask_of(0));
+	err = stop_machine(xen_suspend, &si, cpumask_of(0));
 
 	dpm_resume_noirq(PMSG_RESUME);
 
-- 
cgit v0.10.2


From b5faba21a6805c33b40e258d36f57997ee1de131 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 23:52:13 +0000
Subject: genirq: Prepare the handling of shared oneshot interrupts

For level type interrupts we need to track how many threads are on
flight to avoid useless interrupt storms when not all thread handlers
have finished yet. Keep track of the woken threads and only unmask
when there are no more threads in flight.

Yes, I'm lazy and using a bitfield. But not only because I'm lazy, the
main reason is that it's way simpler than using a refcount. A refcount
based solution would need to keep track of various things like
crashing the irq thread, spurious interrupts coming in,
disables/enables, free_irq() and some more. The bitfield keeps the
tracking simple and makes things just work. It's also nicely confined
to the thread code pathes and does not require additional checks all
over the place.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110223234956.388095876@linutronix.de>

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 8da6643..e116fef 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -99,6 +99,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
  * @thread_fn:	interupt handler function for threaded interrupts
  * @thread:	thread pointer for threaded interrupts
  * @thread_flags:	flags related to @thread
+ * @thread_mask:	bitmask for keeping track of @thread activity
  */
 struct irqaction {
 	irq_handler_t handler;
@@ -109,6 +110,7 @@ struct irqaction {
 	irq_handler_t thread_fn;
 	struct task_struct *thread;
 	unsigned long thread_flags;
+	unsigned long thread_mask;
 	const char *name;
 	struct proc_dir_entry *dir;
 } ____cacheline_internodealigned_in_smp;
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 2f87d64..9eb9cd3 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -28,6 +28,7 @@ struct timer_rand_state;
  * @lock:		locking for SMP
  * @affinity_notify:	context for notification of affinity changes
  * @pending_mask:	pending rebalanced interrupts
+ * @threads_oneshot:	bitfield to handle shared oneshot threads
  * @threads_active:	number of irqaction threads currently running
  * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
  * @dir:		/proc/irq/ procfs entry
@@ -86,6 +87,7 @@ struct irq_desc {
 	cpumask_var_t		pending_mask;
 #endif
 #endif
+	unsigned long		threads_oneshot;
 	atomic_t		threads_active;
 	wait_queue_head_t       wait_for_threads;
 #ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index b110c83..517561f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,6 +51,68 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
 	       "but no thread function available.", irq, action->name);
 }
 
+static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
+{
+	/*
+	 * Wake up the handler thread for this action. In case the
+	 * thread crashed and was killed we just pretend that we
+	 * handled the interrupt. The hardirq handler has disabled the
+	 * device interrupt, so no irq storm is lurking. If the
+	 * RUNTHREAD bit is already set, nothing to do.
+	 */
+	if (test_bit(IRQTF_DIED, &action->thread_flags) ||
+	    test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+		return;
+
+	/*
+	 * It's safe to OR the mask lockless here. We have only two
+	 * places which write to threads_oneshot: This code and the
+	 * irq thread.
+	 *
+	 * This code is the hard irq context and can never run on two
+	 * cpus in parallel. If it ever does we have more serious
+	 * problems than this bitmask.
+	 *
+	 * The irq threads of this irq which clear their "running" bit
+	 * in threads_oneshot are serialized via desc->lock against
+	 * each other and they are serialized against this code by
+	 * IRQS_INPROGRESS.
+	 *
+	 * Hard irq handler:
+	 *
+	 *	spin_lock(desc->lock);
+	 *	desc->state |= IRQS_INPROGRESS;
+	 *	spin_unlock(desc->lock);
+	 *	set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+	 *	desc->threads_oneshot |= mask;
+	 *	spin_lock(desc->lock);
+	 *	desc->state &= ~IRQS_INPROGRESS;
+	 *	spin_unlock(desc->lock);
+	 *
+	 * irq thread:
+	 *
+	 * again:
+	 *	spin_lock(desc->lock);
+	 *	if (desc->state & IRQS_INPROGRESS) {
+	 *		spin_unlock(desc->lock);
+	 *		while(desc->state & IRQS_INPROGRESS)
+	 *			cpu_relax();
+	 *		goto again;
+	 *	}
+	 *	if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+	 *		desc->threads_oneshot &= ~mask;
+	 *	spin_unlock(desc->lock);
+	 *
+	 * So either the thread waits for us to clear IRQS_INPROGRESS
+	 * or we are waiting in the flow handler for desc->lock to be
+	 * released before we reach this point. The thread also checks
+	 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
+	 * threads_oneshot untouched and runs the thread another time.
+	 */
+	desc->threads_oneshot |= action->thread_mask;
+	wake_up_process(action->thread);
+}
+
 irqreturn_t
 handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 {
@@ -85,19 +147,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 				break;
 			}
 
-			/*
-			 * Wake up the handler thread for this
-			 * action. In case the thread crashed and was
-			 * killed we just pretend that we handled the
-			 * interrupt. The hardirq handler above has
-			 * disabled the device interrupt, so no irq
-			 * storm is lurking.
-			 */
-			if (likely(!test_bit(IRQTF_DIED,
-					     &action->thread_flags))) {
-				set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
-				wake_up_process(action->thread);
-			}
+			irq_wake_thread(desc, action);
 
 			/* Fall through to add to randomness */
 		case IRQ_HANDLED:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 01f8a95..2301de1 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -617,8 +617,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
  * handler finished. unmask if the interrupt has not been disabled and
  * is marked MASKED.
  */
-static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
+static void irq_finalize_oneshot(struct irq_desc *desc,
+				 struct irqaction *action, bool force)
 {
+	if (!(desc->istate & IRQS_ONESHOT))
+		return;
 again:
 	chip_bus_lock(desc);
 	raw_spin_lock_irq(&desc->lock);
@@ -631,6 +634,11 @@ again:
 	 * on the other CPU. If we unmask the irq line then the
 	 * interrupt can come in again and masks the line, leaves due
 	 * to IRQS_INPROGRESS and the irq line is masked forever.
+	 *
+	 * This also serializes the state of shared oneshot handlers
+	 * versus "desc->threads_onehsot |= action->thread_mask;" in
+	 * irq_wake_thread(). See the comment there which explains the
+	 * serialization.
 	 */
 	if (unlikely(desc->istate & IRQS_INPROGRESS)) {
 		raw_spin_unlock_irq(&desc->lock);
@@ -639,11 +647,23 @@ again:
 		goto again;
 	}
 
-	if (!(desc->istate & IRQS_DISABLED) && (desc->istate & IRQS_MASKED)) {
+	/*
+	 * Now check again, whether the thread should run. Otherwise
+	 * we would clear the threads_oneshot bit of this thread which
+	 * was just set.
+	 */
+	if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+		goto out_unlock;
+
+	desc->threads_oneshot &= ~action->thread_mask;
+
+	if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) &&
+	    (desc->istate & IRQS_MASKED)) {
 		irq_compat_clr_masked(desc);
 		desc->istate &= ~IRQS_MASKED;
 		desc->irq_data.chip->irq_unmask(&desc->irq_data);
 	}
+out_unlock:
 	raw_spin_unlock_irq(&desc->lock);
 	chip_bus_sync_unlock(desc);
 }
@@ -691,7 +711,7 @@ static int irq_thread(void *data)
 	};
 	struct irqaction *action = data;
 	struct irq_desc *desc = irq_to_desc(action->irq);
-	int wake, oneshot = desc->istate & IRQS_ONESHOT;
+	int wake;
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
 	current->irqaction = action;
@@ -719,8 +739,7 @@ static int irq_thread(void *data)
 
 			action->thread_fn(action->irq, action->dev_id);
 
-			if (oneshot)
-				irq_finalize_oneshot(action->irq, desc);
+			irq_finalize_oneshot(desc, action, false);
 		}
 
 		wake = atomic_dec_and_test(&desc->threads_active);
@@ -729,6 +748,9 @@ static int irq_thread(void *data)
 			wake_up(&desc->wait_for_threads);
 	}
 
+	/* Prevent a stale desc->threads_oneshot */
+	irq_finalize_oneshot(desc, action, true);
+
 	/*
 	 * Clear irqaction. Otherwise exit_irq_thread() would make
 	 * fuzz about an active irq thread going into nirvana.
@@ -743,6 +765,7 @@ static int irq_thread(void *data)
 void exit_irq_thread(void)
 {
 	struct task_struct *tsk = current;
+	struct irq_desc *desc;
 
 	if (!tsk->irqaction)
 		return;
@@ -751,6 +774,14 @@ void exit_irq_thread(void)
 	       "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
 	       tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
 
+	desc = irq_to_desc(tsk->irqaction->irq);
+
+	/*
+	 * Prevent a stale desc->threads_oneshot. Must be called
+	 * before setting the IRQTF_DIED flag.
+	 */
+	irq_finalize_oneshot(desc, tsk->irqaction, true);
+
 	/*
 	 * Set the THREAD DIED flag to prevent further wakeups of the
 	 * soon to be gone threaded handler.
@@ -767,7 +798,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 {
 	struct irqaction *old, **old_ptr;
 	const char *old_name = NULL;
-	unsigned long flags;
+	unsigned long flags, thread_mask = 0;
 	int ret, nested, shared = 0;
 	cpumask_var_t mask;
 
@@ -865,12 +896,23 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
 		/* add new interrupt at end of irq queue */
 		do {
+			thread_mask |= old->thread_mask;
 			old_ptr = &old->next;
 			old = *old_ptr;
 		} while (old);
 		shared = 1;
 	}
 
+	/*
+	 * Setup the thread mask for this irqaction. Unlikely to have
+	 * 32 resp 64 irqs sharing one line, but who knows.
+	 */
+	if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
+		ret = -EBUSY;
+		goto out_mask;
+	}
+	new->thread_mask = 1 << ffz(thread_mask);
+
 	if (!shared) {
 		irq_chip_set_defaults(desc->irq_data.chip);
 
-- 
cgit v0.10.2


From 9d591edd02a245305b1b9379e4c5571bad4d2774 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 23:52:16 +0000
Subject: genirq: Allow shared oneshot interrupts

Support ONESHOT on shared interrupts, if all drivers agree on it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110223234956.483640430@linutronix.de>

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2301de1..58c8613 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -824,10 +824,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		rand_initialize_irq(irq);
 	}
 
-	/* Oneshot interrupts are not allowed with shared */
-	if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
-		return -EINVAL;
-
 	/*
 	 * Check whether the interrupt nests into another interrupt
 	 * thread.
@@ -881,10 +877,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		 * Can't share interrupts unless both agree to and are
 		 * the same type (level, edge, polarity). So both flag
 		 * fields must have IRQF_SHARED set and the bits which
-		 * set the trigger type must match.
+		 * set the trigger type must match. Also all must
+		 * agree on ONESHOT.
 		 */
 		if (!((old->flags & new->flags) & IRQF_SHARED) ||
-		    ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
+		    ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
+		    ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
 			old_name = old->name;
 			goto mismatch;
 		}
-- 
cgit v0.10.2


From 0c4602ff88d6d6ef0ee6d228ee9acaa6448ff6f5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 23:52:18 +0000
Subject: genirq: Add IRQF_NO_THREAD

Some low level interrupts cannot be threaded even when we force thread
all interrupt handlers. Add a flag to annotate such interrupts. Add
all timer interrupts to this category by default.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110223234956.578893460@linutronix.de>

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index e116fef..0fc3eb9 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -58,6 +58,7 @@
  *                irq line disabled until the threaded handler has been run.
  * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
  * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
+ * IRQF_NO_THREAD - Interrupt cannot be threaded
  */
 #define IRQF_DISABLED		0x00000020
 #define IRQF_SAMPLE_RANDOM	0x00000040
@@ -70,8 +71,9 @@
 #define IRQF_ONESHOT		0x00002000
 #define IRQF_NO_SUSPEND		0x00004000
 #define IRQF_FORCE_RESUME	0x00008000
+#define IRQF_NO_THREAD		0x00010000
 
-#define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
+#define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
 
 /*
  * These values can be returned by request_any_context_irq() and
-- 
cgit v0.10.2


From 8eb90c30e0e815a1308828352eabd03ca04229dd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 23:52:21 +0000
Subject: sched: Switch wait_task_inactive to schedule_hrtimeout()

When we force thread hard and soft interrupts the startup of ksoftirqd
would hang in kthread_bind() when wait_task_inactive() calls
schedule_timeout_uninterruptible() because there is no softirq yet
which will wake us up.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110223234956.677109139@linutronix.de>

diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4..66ca5d9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2224,7 +2224,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * yield - it could be a while.
 		 */
 		if (unlikely(on_rq)) {
-			schedule_timeout_uninterruptible(1);
+			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
 			continue;
 		}
 
-- 
cgit v0.10.2


From 7bf04be8f48ceeeffa5b5a79734d6d6e0d59e5f8 Mon Sep 17 00:00:00 2001
From: Stratos Psomadakis <psomas@cslab.ece.ntua.gr>
Date: Fri, 25 Feb 2011 22:46:13 +0200
Subject: x86, asm: Cleanup unnecssary macros in asm-offsets.c

PAGE_SIZE_asm, PAGE_SHIFT_asm, THREAD_SIZE_asm can be safely removed from
asm-offsets.c, and be replaced by their non-'_asm' counterparts in the code
that uses them, since the _AC macro defined in include/linux/const.h makes
PAGE_SIZE/PAGE_SHIFT/THREAD_SIZE work with as.

Signed-off-by: Stratos Psomadakis <psomas@cslab.ece.ntua.gr>
LKML-Reference: <1298666774-17646-2-git-send-email-psomas@cslab.ece.ntua.gr>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>

diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 2b141c1..4f13faf 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -29,11 +29,6 @@
 
 void common(void) {
 	BLANK();
-	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-	DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
-	DEFINE(THREAD_SIZE_asm, THREAD_SIZE);
-
-	BLANK();
 	OFFSET(TI_flags, thread_info, flags);
 	OFFSET(TI_status, thread_info, status);
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efa..49bdedd 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,7 +395,7 @@ sysenter_past_esp:
 	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
 	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
 	 */
-	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp)
+	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
 	CFI_REL_OFFSET eip, 0
 
 	pushl_cfi %eax
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 767d6c4..187aa63 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -73,7 +73,7 @@ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
  */
 KERNEL_PAGES = LOWMEM_PAGES
 
-INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
 RESERVE_BRK(pagetables, INIT_MAP_SIZE)
 
 /*
@@ -623,7 +623,7 @@ ENTRY(initial_code)
  * BSS section
  */
 __PAGE_ALIGNED_BSS
-	.align PAGE_SIZE_asm
+	.align PAGE_SIZE
 #ifdef CONFIG_X86_PAE
 initial_pg_pmd:
 	.fill 1024*KPMDS,4,0
@@ -644,7 +644,7 @@ ENTRY(swapper_pg_dir)
 #ifdef CONFIG_X86_PAE
 __PAGE_ALIGNED_DATA
 	/* Page-aligned for the benefit of paravirt? */
-	.align PAGE_SIZE_asm
+	.align PAGE_SIZE
 ENTRY(initial_page_table)
 	.long	pa(initial_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
 # if KPMDS == 3
@@ -662,7 +662,7 @@ ENTRY(initial_page_table)
 # else
 #  error "Kernel PMDs should be 1, 2 or 3"
 # endif
-	.align PAGE_SIZE_asm		/* needs to be page-sized too */
+	.align PAGE_SIZE		/* needs to be page-sized too */
 #endif
 
 .data
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1a5ff24..aaa7291 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,9 +28,9 @@ ENTRY(startup_xen)
 	__FINIT
 
 .pushsection .text
-	.align PAGE_SIZE_asm
+	.align PAGE_SIZE
 ENTRY(hypercall_page)
-	.skip PAGE_SIZE_asm
+	.skip PAGE_SIZE
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
-- 
cgit v0.10.2


From 544b4a1f309d18f40969dbab7e08bafd136b2f55 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Fri, 25 Feb 2011 15:13:16 -0800
Subject: sched: Clean up the IRQ_TIME_ACCOUNTING code

Fix this warning:

  lkml.org/lkml/2011/1/30/124

 kernel/sched.c:3719: warning: 'irqtime_account_idle_ticks' defined but not used
 kernel/sched.c:3720: warning: 'irqtime_account_process_tick' defined but not used

In a cleaner way than:

 7e9498705e81: sched: Add #ifdef around irq time accounting functions

This patch will not have any functional impact.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Cc: heiko.carstens@de.ibm.com
Cc: a.p.zijlstra@chello.nl
LKML-Reference: <1298675596-10992-1-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 79bca16..0c87126 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3687,7 +3687,36 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
+/*
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t cputime64 = cputime_to_cputime64(cputime);
+
+	cpustat->steal = cputime64_add(cpustat->steal, cputime64);
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t cputime64 = cputime_to_cputime64(cputime);
+	struct rq *rq = this_rq();
+
+	if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+	else
+		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+}
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 /*
  * Account a tick to a process and cpustat
@@ -3749,42 +3778,11 @@ static void irqtime_account_idle_ticks(int ticks)
 	for (i = 0; i < ticks; i++)
 		irqtime_account_process_tick(current, 0, rq);
 }
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
 static void irqtime_account_idle_ticks(int ticks) {}
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 						struct rq *rq) {}
-#endif
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
-
-/*
- * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t cputime64 = cputime_to_cputime64(cputime);
-
-	cpustat->steal = cputime64_add(cpustat->steal, cputime64);
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-void account_idle_time(cputime_t cputime)
-{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t cputime64 = cputime_to_cputime64(cputime);
-	struct rq *rq = this_rq();
-
-	if (atomic_read(&rq->nr_iowait) > 0)
-		cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
-	else
-		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 /*
  * Account a single tick of cpu time.
-- 
cgit v0.10.2


From 8d32a307e4faa8b123dc8a9cd56d1a7525f69ad3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 23 Feb 2011 23:52:23 +0000
Subject: genirq: Provide forced interrupt threading

Add a commandline parameter "threadirqs" which forces all interrupts except
those marked IRQF_NO_THREAD to run threaded. That's mostly a debug option to
allow retrieving better debug data from crashing interrupt handlers. If
"threadirqs" is not enabled on the kernel command line, then there is no
impact in the interrupt hotpath.

Architecture code needs to select CONFIG_IRQ_FORCED_THREADING after
marking the interrupts which cant be threaded IRQF_NO_THREAD. All
interrupts which have IRQF_TIMER set are implict marked
IRQF_NO_THREAD. Also all PER_CPU interrupts are excluded.

Forced threading hard interrupts also forces all soft interrupt
handling into thread context.

When enabled it might slow down things a bit, but for debugging problems in
interrupt code it's a reasonable penalty as it does not immediately
crash and burn the machine when an interrupt handler is buggy.

Some test results on a Core2Duo machine:

Cache cold run of:
 # time git grep irq_desc

      non-threaded       threaded
 real 1m18.741s          1m19.061s
 user 0m1.874s           0m1.757s
 sys  0m5.843s           0m5.427s

 # iperf -c server
non-threaded
[  3]  0.0-10.0 sec  1.09 GBytes   933 Mbits/sec
[  3]  0.0-10.0 sec  1.09 GBytes   934 Mbits/sec
[  3]  0.0-10.0 sec  1.09 GBytes   933 Mbits/sec
threaded
[  3]  0.0-10.0 sec  1.09 GBytes   939 Mbits/sec
[  3]  0.0-10.0 sec  1.09 GBytes   934 Mbits/sec
[  3]  0.0-10.0 sec  1.09 GBytes   937 Mbits/sec

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110223234956.772668648@linutronix.de>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 89835a4..cac6cf9 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2436,6 +2436,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			<deci-seconds>: poll all this frequency
 			0: no polling (default)
 
+	threadirqs	[KNL]
+			Force threading of all interrupt handlers except those
+			marked explicitely IRQF_NO_THREAD.
+
 	topology=	[S390]
 			Format: {off | on}
 			Specify if the kernel should make use of the cpu
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 0fc3eb9..f8a8af1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -383,6 +383,13 @@ static inline int disable_irq_wake(unsigned int irq)
 }
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
+
+#ifdef CONFIG_IRQ_FORCED_THREADING
+extern bool force_irqthreads;
+#else
+#define force_irqthreads	(0)
+#endif
+
 #ifndef __ARCH_SET_SOFTIRQ_PENDING
 #define set_softirq_pending(x) (local_softirq_pending() = (x))
 #define or_softirq_pending(x)  (local_softirq_pending() |= (x))
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 355b8c7..144db9d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -38,6 +38,9 @@ config HARDIRQS_SW_RESEND
 config IRQ_PREFLOW_FASTEOI
        bool
 
+config IRQ_FORCED_THREADING
+       bool
+
 config SPARSE_IRQ
 	bool "Support sparse irq numbering"
 	depends on HAVE_SPARSE_IRQ
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 935bec4..6c6ec9a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -27,12 +27,14 @@ extern int noirqdebug;
  * IRQTF_DIED      - handler thread died
  * IRQTF_WARNED    - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
  * IRQTF_AFFINITY  - irq thread is requested to adjust affinity
+ * IRQTF_FORCED_THREAD  - irq action is force threaded
  */
 enum {
 	IRQTF_RUNTHREAD,
 	IRQTF_DIED,
 	IRQTF_WARNED,
 	IRQTF_AFFINITY,
+	IRQTF_FORCED_THREAD,
 };
 
 /*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 58c8613..acd599a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
 
 #include "internals.h"
 
+#ifdef CONFIG_IRQ_FORCED_THREADING
+__read_mostly bool force_irqthreads;
+
+static int __init setup_forced_irqthreads(char *arg)
+{
+	force_irqthreads = true;
+	return 0;
+}
+early_param("threadirqs", setup_forced_irqthreads);
+#endif
+
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *	@irq: interrupt number to wait for
@@ -702,6 +713,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
 #endif
 
 /*
+ * Interrupts which are not explicitely requested as threaded
+ * interrupts rely on the implicit bh/preempt disable of the hard irq
+ * context. So we need to disable bh here to avoid deadlocks and other
+ * side effects.
+ */
+static void
+irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
+{
+	local_bh_disable();
+	action->thread_fn(action->irq, action->dev_id);
+	irq_finalize_oneshot(desc, action, false);
+	local_bh_enable();
+}
+
+/*
+ * Interrupts explicitely requested as threaded interupts want to be
+ * preemtible - many of them need to sleep and wait for slow busses to
+ * complete.
+ */
+static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
+{
+	action->thread_fn(action->irq, action->dev_id);
+	irq_finalize_oneshot(desc, action, false);
+}
+
+/*
  * Interrupt handler thread
  */
 static int irq_thread(void *data)
@@ -711,8 +748,15 @@ static int irq_thread(void *data)
 	};
 	struct irqaction *action = data;
 	struct irq_desc *desc = irq_to_desc(action->irq);
+	void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
 	int wake;
 
+	if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
+					&action->thread_flags))
+		handler_fn = irq_forced_thread_fn;
+	else
+		handler_fn = irq_thread_fn;
+
 	sched_setscheduler(current, SCHED_FIFO, &param);
 	current->irqaction = action;
 
@@ -736,10 +780,7 @@ static int irq_thread(void *data)
 			raw_spin_unlock_irq(&desc->lock);
 		} else {
 			raw_spin_unlock_irq(&desc->lock);
-
-			action->thread_fn(action->irq, action->dev_id);
-
-			irq_finalize_oneshot(desc, action, false);
+			handler_fn(desc, action);
 		}
 
 		wake = atomic_dec_and_test(&desc->threads_active);
@@ -789,6 +830,22 @@ void exit_irq_thread(void)
 	set_bit(IRQTF_DIED, &tsk->irqaction->flags);
 }
 
+static void irq_setup_forced_threading(struct irqaction *new)
+{
+	if (!force_irqthreads)
+		return;
+	if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
+		return;
+
+	new->flags |= IRQF_ONESHOT;
+
+	if (!new->thread_fn) {
+		set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
+		new->thread_fn = new->handler;
+		new->handler = irq_default_primary_handler;
+	}
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -838,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		 * dummy function which warns when called.
 		 */
 		new->handler = irq_nested_primary_handler;
+	} else {
+		irq_setup_forced_threading(new);
 	}
 
 	/*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c049046..a33fb29 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -311,9 +311,21 @@ void irq_enter(void)
 }
 
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
-# define invoke_softirq()	__do_softirq()
+static inline void invoke_softirq(void)
+{
+	if (!force_irqthreads)
+		__do_softirq();
+	else
+		wakeup_softirqd();
+}
 #else
-# define invoke_softirq()	do_softirq()
+static inline void invoke_softirq(void)
+{
+	if (!force_irqthreads)
+		do_softirq();
+	else
+		wakeup_softirqd();
+}
 #endif
 
 /*
-- 
cgit v0.10.2


From cc28989437de5617875a2943697fe6ba51a0da8f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 26 Feb 2011 13:05:43 +0100
Subject: mm: Move early_node_map[] reverse scan helpers under HAVE_MEMBLOCK

Heiko found recent memblock change triggers these warnings on s390:

  mm/page_alloc.c:3623:22: warning: 'last_active_region_index_in_nid' defined but not used
  mm/page_alloc.c:3638:22: warning: 'previous_active_region_index_in_nid' defined but not used

Need to move those two function under HAVE_MEMBLOCK with its only
user, find_memory_core_early().

-tj: Minor updates to description.

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6035136..da0fe32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3616,34 +3616,6 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
 	return -1;
 }
 
-/*
- * Basic iterator support. Return the last range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns last region regardless of node
- */
-static int __meminit last_active_region_index_in_nid(int nid)
-{
-	int i;
-
-	for (i = nr_nodemap_entries - 1; i >= 0; i--)
-		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
-			return i;
-
-	return -1;
-}
-
-/*
- * Basic iterator support. Return the previous active range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns next region regardless of node
- */
-static int __meminit previous_active_region_index_in_nid(int index, int nid)
-{
-	for (index = index - 1; index >= 0; index--)
-		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
-			return index;
-
-	return -1;
-}
-
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 /*
  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -3695,10 +3667,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 	for (i = first_active_region_index_in_nid(nid); i != -1; \
 				i = next_active_region_index_in_nid(i, nid))
 
-#define for_each_active_range_index_in_nid_reverse(i, nid) \
-	for (i = last_active_region_index_in_nid(nid); i != -1; \
-				i = previous_active_region_index_in_nid(i, nid))
-
 /**
  * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -3731,6 +3699,38 @@ void __init free_bootmem_with_active_regions(int nid,
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+	int i;
+
+	for (i = nr_nodemap_entries - 1; i >= 0; i--)
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+
+	return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+	for (index = index - 1; index >= 0; index--)
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+
+	return -1;
+}
+
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+	for (i = last_active_region_index_in_nid(nid); i != -1; \
+				i = previous_active_region_index_in_nid(i, nid))
+
 u64 __init find_memory_core_early(int nid, u64 size, u64 align,
 					u64 goal, u64 limit)
 {
-- 
cgit v0.10.2


From a56ec98357ad26b380f1005198de1aa519c9e9cb Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sun, 27 Feb 2011 19:13:39 +0100
Subject: x86: dt: Correct local apic documentation in device tree bindings

Until "x86: dt: Cleanup local apic setup" we read the local apic
address from the MSR and ignored the entry in DT. Reflect this change
in the documentation.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
LKML-Reference: <1298830419-22681-1-git-send-email-bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/Documentation/devicetree/bindings/x86/interrupt.txt b/Documentation/devicetree/bindings/x86/interrupt.txt
index 8b0efb0..7d19f49 100644
--- a/Documentation/devicetree/bindings/x86/interrupt.txt
+++ b/Documentation/devicetree/bindings/x86/interrupt.txt
@@ -24,6 +24,3 @@ Interrupt chips
   Required property:
 
      compatible = "intel,ce4100-lapic";
-
-  This node is currently unused by Linux as the address of the local APIC
-  read from a MSR.
-- 
cgit v0.10.2


From 39f2205e1abd1b6fffdaf45e1f1c3049a5f8999c Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Mon, 28 Feb 2011 15:31:59 +0000
Subject: x86-64: Add CFI annotations to lib/rwsem_64.S

These weren't part of the initial commit of this code.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Alexander van Heukelum <heukelum@fastmail.fm>
LKML-Reference: <4D6BCDFF02000078000341B0@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 41fcf00..6774397 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -23,43 +23,50 @@
 #include <asm/dwarf2.h>
 
 #define save_common_regs \
-	pushq %rdi; \
-	pushq %rsi; \
-	pushq %rcx; \
-	pushq %r8; \
-	pushq %r9; \
-	pushq %r10; \
-	pushq %r11
+	pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
+	pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
+	pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
+	pushq_cfi %r8;  CFI_REL_OFFSET r8,  0; \
+	pushq_cfi %r9;  CFI_REL_OFFSET r9,  0; \
+	pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
+	pushq_cfi %r11; CFI_REL_OFFSET r11, 0
 
 #define restore_common_regs \
-	popq %r11; \
-	popq %r10; \
-	popq %r9; \
-	popq %r8; \
-	popq %rcx; \
-	popq %rsi; \
-	popq %rdi
+	popq_cfi %r11; CFI_RESTORE r11; \
+	popq_cfi %r10; CFI_RESTORE r10; \
+	popq_cfi %r9;  CFI_RESTORE r9; \
+	popq_cfi %r8;  CFI_RESTORE r8; \
+	popq_cfi %rcx; CFI_RESTORE rcx; \
+	popq_cfi %rsi; CFI_RESTORE rsi; \
+	popq_cfi %rdi; CFI_RESTORE rdi
 
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_down_read_failed)
+	CFI_STARTPROC
 	save_common_regs
-	pushq %rdx
+	pushq_cfi %rdx
+	CFI_REL_OFFSET rdx, 0
 	movq %rax,%rdi
 	call rwsem_down_read_failed
-	popq %rdx
+	popq_cfi %rdx
+	CFI_RESTORE rdx
 	restore_common_regs
 	ret
-	ENDPROC(call_rwsem_down_read_failed)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_down_read_failed)
 
 ENTRY(call_rwsem_down_write_failed)
+	CFI_STARTPROC
 	save_common_regs
 	movq %rax,%rdi
 	call rwsem_down_write_failed
 	restore_common_regs
 	ret
-	ENDPROC(call_rwsem_down_write_failed)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_down_write_failed)
 
 ENTRY(call_rwsem_wake)
+	CFI_STARTPROC
 	decl %edx	/* do nothing if still outstanding active readers */
 	jnz 1f
 	save_common_regs
@@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake)
 	call rwsem_wake
 	restore_common_regs
 1:	ret
-	ENDPROC(call_rwsem_wake)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_wake)
 
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_downgrade_wake)
+	CFI_STARTPROC
 	save_common_regs
-	pushq %rdx
+	pushq_cfi %rdx
+	CFI_REL_OFFSET rdx, 0
 	movq %rax,%rdi
 	call rwsem_downgrade_wake
-	popq %rdx
+	popq_cfi %rdx
+	CFI_RESTORE rdx
 	restore_common_regs
 	ret
-	ENDPROC(call_rwsem_downgrade_wake)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_downgrade_wake)
-- 
cgit v0.10.2


From 60cf637a13932a4750da6746efd0199e8a4c341b Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Mon, 28 Feb 2011 15:54:40 +0000
Subject: x86: Use {push,pop}_cfi in more places

Cleaning up and shortening code...

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Alexander van Heukelum <heukelum@fastmail.fm>
LKML-Reference: <4D6BD35002000078000341DA@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99..7c6aabd 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -126,26 +126,20 @@ ENTRY(ia32_sysenter_target)
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
  	movl	%ebp,%ebp		/* zero extension */
-	pushq	$__USER32_DS
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi $__USER32_DS
 	/*CFI_REL_OFFSET ss,0*/
-	pushq	%rbp
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %rbp
 	CFI_REL_OFFSET rsp,0
-	pushfq
-	CFI_ADJUST_CFA_OFFSET 8
+	pushfq_cfi
 	/*CFI_REL_OFFSET rflags,0*/
 	movl	8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
 	CFI_REGISTER rip,r10
-	pushq	$__USER32_CS
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi $__USER32_CS
 	/*CFI_REL_OFFSET cs,0*/
 	movl	%eax, %eax
-	pushq	%r10
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %r10
 	CFI_REL_OFFSET rip,0
-	pushq	%rax
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %rax
 	cld
 	SAVE_ARGS 0,0,1
  	/* no need to do an access_ok check here because rbp has been
@@ -182,11 +176,9 @@ sysexit_from_sys_call:
 	xorq	%r9,%r9
 	xorq	%r10,%r10
 	xorq	%r11,%r11
-	popfq
-	CFI_ADJUST_CFA_OFFSET -8
+	popfq_cfi
 	/*CFI_RESTORE rflags*/
-	popq	%rcx				/* User %esp */
-	CFI_ADJUST_CFA_OFFSET -8
+	popq_cfi %rcx				/* User %esp */
 	CFI_REGISTER rsp,rcx
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS_SYSEXIT32
@@ -421,8 +413,7 @@ ENTRY(ia32_syscall)
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	movl %eax,%eax
-	pushq %rax
-	CFI_ADJUST_CFA_OFFSET 8
+	pushq_cfi %rax
 	cld
 	/* note the registers are not zero extended to the sf.
 	   this could be a problem. */
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 06850a7..2c6fc9e 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -7,14 +7,12 @@
    frame pointer later */
 #ifdef CONFIG_FRAME_POINTER
 	.macro FRAME
-	pushl %ebp
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebp
 	CFI_REL_OFFSET ebp,0
 	movl %esp,%ebp
 	.endm
 	.macro ENDFRAME
-	popl %ebp
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebp
 	CFI_RESTORE ebp
 	.endm
 #else
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 49bdedd..2878821 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1409,8 +1409,7 @@ END(general_protection)
 #ifdef CONFIG_KVM_GUEST
 ENTRY(async_page_fault)
 	RING0_EC_FRAME
-	pushl $do_async_page_fault
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi $do_async_page_fault
 	jmp error_code
 	CFI_ENDPROC
 END(apf_page_fault)
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 2cda60a..e8e7e0d 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -15,14 +15,12 @@
 
 /* if you want SMP support, implement these with real spinlocks */
 .macro LOCK reg
-	pushfl
-	CFI_ADJUST_CFA_OFFSET 4
+	pushfl_cfi
 	cli
 .endm
 
 .macro UNLOCK reg
-	popfl
-	CFI_ADJUST_CFA_OFFSET -4
+	popfl_cfi
 .endm
 
 #define BEGIN(op) \
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 71e080d..391a083 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -14,14 +14,12 @@
 #include <asm/dwarf2.h>
 
 .macro SAVE reg
-	pushl %\reg
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %\reg
 	CFI_REL_OFFSET \reg, 0
 .endm
 
 .macro RESTORE reg
-	popl %\reg
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %\reg
 	CFI_RESTORE \reg
 .endm
 
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0..78d16a5 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 	   */		
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
@@ -132,11 +130,9 @@ ENTRY(csum_partial)
 	jz 8f
 	roll $8, %eax
 8:
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
 	ret
 	CFI_ENDPROC
@@ -148,11 +144,9 @@ ENDPROC(csum_partial)
 
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
@@ -260,11 +254,9 @@ ENTRY(csum_partial)
 	jz 90f
 	roll $8, %eax
 90: 
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
 	ret
 	CFI_ENDPROC
@@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
 	subl  $4,%esp	
 	CFI_ADJUST_CFA_OFFSET 4
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl ARGBASE+16(%esp),%eax	# sum
 	movl ARGBASE+12(%esp),%ecx	# len
@@ -426,17 +415,13 @@ DST(	movb %cl, (%edi)	)
 
 .previous
 
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
-	popl %edi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edi
 	CFI_RESTORE edi
-	popl %ecx			# equivalent to addl $4,%esp
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx			# equivalent to addl $4,%esp
 	ret	
 	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
@@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic)
 		
 ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
 	movl ARGBASE+4(%esp),%esi	#src
 	movl ARGBASE+8(%esp),%edi	#dst	
@@ -527,14 +509,11 @@ DST(	movb %dl, (%edi)         )
 	jmp  7b			
 .previous				
 
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
-	popl %edi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edi
 	CFI_RESTORE edi
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
 	ret
 	CFI_ENDPROC
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 648fe47..48e44f7 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -74,29 +74,23 @@ ENTRY(__read_lock_failed)
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_down_read_failed)
 	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
-	push %edx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edx
 	CFI_REL_OFFSET edx,0
 	call rwsem_down_read_failed
-	pop %edx
-	CFI_ADJUST_CFA_OFFSET -4
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edx
+	popl_cfi %ecx
 	ret
 	CFI_ENDPROC
 	ENDPROC(call_rwsem_down_read_failed)
 
 ENTRY(call_rwsem_down_write_failed)
 	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
 	calll rwsem_down_write_failed
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx
 	ret
 	CFI_ENDPROC
 	ENDPROC(call_rwsem_down_write_failed)
@@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake)
 	CFI_STARTPROC
 	decw %dx    /* do nothing if still outstanding active readers */
 	jnz 1f
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
 	call rwsem_wake
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx
 1:	ret
 	CFI_ENDPROC
 	ENDPROC(call_rwsem_wake)
@@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake)
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_downgrade_wake)
 	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
-	push %edx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edx
 	CFI_REL_OFFSET edx,0
 	call rwsem_downgrade_wake
-	pop %edx
-	CFI_ADJUST_CFA_OFFSET -4
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edx
+	popl_cfi %ecx
 	ret
 	CFI_ENDPROC
 	ENDPROC(call_rwsem_downgrade_wake)
-- 
cgit v0.10.2


From 039e13890b0615cb8c5c04b6afa84d676e24c761 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Mon, 28 Feb 2011 15:56:00 +0000
Subject: x86: Remove unused bits from lib/thunk_*.S

Some of the items removed were apparently never used, others
simply didn't get removed with their last user.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4D6BD3A002000078000341F1@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 650b11e..2930ae0 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -7,24 +7,6 @@
 
 	#include <linux/linkage.h>
 
-#define ARCH_TRACE_IRQS_ON			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_on;			\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_off;		\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
 #ifdef CONFIG_TRACE_IRQFLAGS
 	/* put return address in eax (arg1) */
 	.macro thunk_ra name,func
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index bf9a7d5..782b082 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -22,26 +22,6 @@
 	CFI_ENDPROC
 	.endm
 
-	/* rdi:	arg1 ... normal C conventions. rax is passed from C. */ 	
-	.macro thunk_retrax name,func
-	.globl \name
-\name:	
-	CFI_STARTPROC
-	SAVE_ARGS
-	call \func
-	jmp  restore_norax
-	CFI_ENDPROC
-	.endm
-	
-
-	.section .sched.text, "ax"
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
-	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
-	thunk rwsem_wake_thunk,rwsem_wake
-	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
-#endif	
-	
 #ifdef CONFIG_TRACE_IRQFLAGS
 	/* put return address in rdi (arg1) */
 	.macro thunk_ra name,func
@@ -72,10 +52,3 @@ restore:
 	RESTORE_ARGS
 	ret	
 	CFI_ENDPROC
-	
-	CFI_STARTPROC
-	SAVE_ARGS
-restore_norax:	
-	RESTORE_ARGS 1
-	ret
-	CFI_ENDPROC
-- 
cgit v0.10.2


From bfc39061d3dbf812e6a78f9529a548e5f0050c64 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Tue, 1 Mar 2011 11:14:55 +0000
Subject: um, x86-64: Fix UML build after adding CFI annotations to
 lib/rwsem_64.S

arch/um/Kconfig.x86 has X86_32 but not X86_64 - that's resulting in
asm/dwarf2.h producing the 32-bit (pushl_cfi & Co) macros instead of
the 64-bit ones.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Jeff Dike <jdike@addtoit.com>
LKML-Reference: <4D6CE3400200007800034498@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/um/Kconfig.x86 b/arch/um/Kconfig.x86
index 5ee3280..62f2115 100644
--- a/arch/um/Kconfig.x86
+++ b/arch/um/Kconfig.x86
@@ -19,6 +19,9 @@ config X86_32
 	def_bool !64BIT
 	select HAVE_AOUT
 
+config X86_64
+	def_bool 64BIT
+
 config RWSEM_XCHGADD_ALGORITHM
 	def_bool X86_XADD
 
-- 
cgit v0.10.2


From 3166fc8fb6a2f52273d545e970297524e02c3e39 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 1 Mar 2011 10:21:44 -0300
Subject: perf top browser: Handle empty active symbols list

Fixing a SEGV. An empty list could happen when not being able to resolve
symbols, for instance when --vmlinux invalid-file is used.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c
index e9381ec..5a06538 100644
--- a/tools/perf/util/ui/browsers/top.c
+++ b/tools/perf/util/ui/browsers/top.c
@@ -76,6 +76,12 @@ static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser)
 	browser->root = RB_ROOT;
 	browser->b.top = NULL;
 	browser->sum_ksamples = perf_top__decay_samples(top, &browser->root);
+	/*
+ 	 * No active symbols
+ 	 */
+	if (top->rb_entries == 0)
+		return;
+
 	perf_top__find_widths(top, &browser->root, &browser->dso_width,
 			      &browser->dso_short_width,
                               &browser->sym_width);
-- 
cgit v0.10.2


From a1ceb741cf86ef433006379742db81c00b450bae Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 1 Mar 2011 10:24:43 -0300
Subject: perf tui: Make ui__warning modal

By taking the ui__lock so that no other screen updates take place while
waiting for the user.

That was happening when handling an invalid --vmlinux parameter in 'perf
top --tui', with the screen refresh routine repainting the screen and
removing the warning window.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c
index 7b5a892..fdf1fc8 100644
--- a/tools/perf/util/ui/util.c
+++ b/tools/perf/util/ui/util.c
@@ -9,6 +9,7 @@
 #include "../debug.h"
 #include "browser.h"
 #include "helpline.h"
+#include "ui.h"
 #include "util.h"
 
 static void newt_form__set_exit_keys(newtComponent self)
@@ -118,10 +119,12 @@ void ui__warning(const char *format, ...)
 	va_list args;
 
 	va_start(args, format);
-	if (use_browser > 0)
+	if (use_browser > 0) {
+		pthread_mutex_lock(&ui__lock);
 		newtWinMessagev((char *)warning_str, (char *)ok,
 				(char *)format, args);
-	else
+		pthread_mutex_unlock(&ui__lock);
+	} else
 		vfprintf(stderr, format, args);
 	va_end(args);
 }
-- 
cgit v0.10.2


From 374cfe56892701f062586d6a6de6cb71777a4184 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 1 Mar 2011 10:27:27 -0300
Subject: perf top: Fix reporting of invalid --vmlinux

Using ui__warning, that will, in --tui, show a window with the message,
waiting for the user to press Ok.

Also run exit_browser() to let newt do its final cleaning of the screen.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index f88a263..0b07cc3 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -740,8 +740,9 @@ static void perf_event__process_sample(const union perf_event *event,
 		 */
 		if (al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
 		    RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
-			pr_err("The %s file can't be used\n",
-			       symbol_conf.vmlinux_name);
+			ui__warning("The %s file can't be used\n",
+				    symbol_conf.vmlinux_name);
+			exit_browser(0);
 			exit(1);
 		}
 
-- 
cgit v0.10.2


From 5807806a92450fd57f8063868efae9d4af74db02 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 1 Mar 2011 10:43:03 -0300
Subject: perf top tui: Wait till the first sample to refresh the screen.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 0b07cc3..417f757 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -72,6 +72,7 @@ static struct perf_top top = {
 	.target_tid		= -1,
 	.active_symbols		= LIST_HEAD_INIT(top.active_symbols),
 	.active_symbols_lock	= PTHREAD_MUTEX_INITIALIZER,
+	.active_symbols_cond	= PTHREAD_COND_INITIALIZER,
 	.freq			= 1000, /* 1 KHz */
 };
 
@@ -577,7 +578,17 @@ static void handle_keypress(struct perf_session *session, int c)
 
 static void *display_thread_tui(void *arg __used)
 {
-	perf_top__tui_browser(&top);
+	int err = 0;
+	pthread_mutex_lock(&top.active_symbols_lock);
+	while (list_empty(&top.active_symbols)) {
+		err = pthread_cond_wait(&top.active_symbols_cond,
+					&top.active_symbols_lock);
+		if (err)
+			break;
+	}
+	pthread_mutex_unlock(&top.active_symbols_lock);
+	if (!err)
+		perf_top__tui_browser(&top);
 	exit_browser(0);
 	exit(0);
 	return NULL;
@@ -776,8 +787,14 @@ static void perf_event__process_sample(const union perf_event *event,
 		syme->count[evsel->idx]++;
 		record_precise_ip(syme, evsel->idx, ip);
 		pthread_mutex_lock(&top.active_symbols_lock);
-		if (list_empty(&syme->node) || !syme->node.next)
+		if (list_empty(&syme->node) || !syme->node.next) {
+			static bool first = true;
 			__list_insert_active_sym(syme);
+			if (first) {
+				pthread_cond_broadcast(&top.active_symbols_cond);
+				first = false;
+			}
+		}
 		pthread_mutex_unlock(&top.active_symbols_lock);
 	}
 }
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index e8d28e2..96d1cb7 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -35,6 +35,7 @@ struct perf_top {
 	 */
 	struct list_head   active_symbols;
 	pthread_mutex_t	   active_symbols_lock;
+	pthread_cond_t	   active_symbols_cond;
 	u64		   samples;
 	u64		   kernel_samples, us_samples;
 	u64		   exact_samples;
-- 
cgit v0.10.2


From 44e69767cb7c3bc46e5370c39532c205d4347d80 Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@eu.citrix.com>
Date: Tue, 1 Mar 2011 20:05:49 +0000
Subject: xen: ia64 build broken due to "xen: switch to new schedop hypercall
 by default."

The git commit:

> commit a8b7458363b9174f3c2196ca6085630b4b30b7a1
> Author: Ian Campbell <ian.campbell@citrix.com>
> Date:   Thu Feb 17 11:04:20 2011 +0000
>
>     xen: switch to new schedop hypercall by default.
>
>     Rename old interface to sched_op_compat and rename sched_op_new to
>     simply sched_op.
>

breaks the IA64 build. This patch fixes it.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Ian Campbell <Ian.Campbell@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/ia64/include/asm/xen/hypercall.h b/arch/ia64/include/asm/xen/hypercall.h
index 96fc623..ed28bcd 100644
--- a/arch/ia64/include/asm/xen/hypercall.h
+++ b/arch/ia64/include/asm/xen/hypercall.h
@@ -107,7 +107,7 @@ extern unsigned long __hypercall(unsigned long a1, unsigned long a2,
 static inline int
 xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg)
 {
-	return _hypercall2(int, sched_op_new, cmd, arg);
+	return _hypercall2(int, sched_op, cmd, arg);
 }
 
 static inline long
-- 
cgit v0.10.2


From e938c287ea8d977e079f07464ac69923412663ce Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Tue, 1 Mar 2011 14:28:02 +0000
Subject: x86: Fix a bogus unwind annotation in lib/semaphore_32.S

'simple' would have required specifying current frame address
and return address location manually, but that's obviously not
the case (and not necessary) here.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4D6D1082020000780003454C@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 48e44f7..06691da 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -36,7 +36,7 @@
  */
 #ifdef CONFIG_SMP
 ENTRY(__write_lock_failed)
-	CFI_STARTPROC simple
+	CFI_STARTPROC
 	FRAME
 2: 	LOCK_PREFIX
 	addl	$ RW_LOCK_BIAS,(%eax)
-- 
cgit v0.10.2


From c69e3758ff56d03e161187355791ec992c574276 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 2 Mar 2011 11:49:21 +0100
Subject: genirq: Fixup fasteoi handler for oneshot mode

The fasteoi handler must mask the interrupt line in oneshot mode
otherwise we end up with an irq storm.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b514565..c9c0601 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -513,6 +513,10 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 		mask_irq(desc);
 		goto out;
 	}
+
+	if (desc->istate & IRQS_ONESHOT)
+		mask_irq(desc);
+
 	preflow_handler(desc);
 	handle_irq_event(desc);
 
-- 
cgit v0.10.2


From b06b3d49699a52e8f9ca056c4f96e81b1987d78e Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Wed, 2 Mar 2011 21:27:04 +0800
Subject: perf, x86: Add Intel SandyBridge CPU support

This patch adds basic SandyBridge support, including hardware
cache events and PEBS events support.

It has been tested on SandyBridge CPUs with perf stat and also
with PEBS based profiling - both work fine.

The patch does not affect other models.

v2 -> v3:
 - fix PEBS event 0xd0 with right umask combinations
 - move snb pebs constraint assignment to intel_pmu_init

v1 -> v2:
 - add more raw and PEBS events constraints
 - use offcore events for LLC-* cache events
 - remove the call to Nehalem workaround enable_all function

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Andi Kleen <andi@firstfloor.org>
LKML-Reference: <1299072424.2175.24.camel@localhost>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d977a2..390fa6d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -166,8 +166,10 @@ struct cpu_hw_events {
 /*
  * Constraint on the Event code + UMask
  */
-#define PEBS_EVENT_CONSTRAINT(c, n)	\
+#define INTEL_UEVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+#define PEBS_EVENT_CONSTRAINT(c, n)	\
+	INTEL_UEVENT_CONSTRAINT(c, n)
 
 #define EVENT_CONSTRAINT_END		\
 	EVENT_CONSTRAINT(0, 0, 0)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 008835c..d00f386 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -76,6 +76,19 @@ static struct event_constraint intel_westmere_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
+static struct event_constraint intel_snb_event_constraints[] =
+{
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+	INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
+	INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
+	INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
+	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+	EVENT_CONSTRAINT_END
+};
+
 static struct event_constraint intel_gen_event_constraints[] =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -89,6 +102,106 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
+static __initconst const u64 snb_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS        */
+		[ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPLACEMENT              */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES       */
+		[ C(RESULT_MISS)   ] = 0x0851, /* L1D.ALL_M_REPLACEMENT        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x024e, /* HW_PRE_REQ.DL1_MISS          */
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0280, /* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	/*
+	 * TBD: Need Off-core Response Performance Monitoring support
+	 */
+	[ C(OP_READ) ] = {
+		/* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		/* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01bb,
+	},
+	[ C(OP_WRITE) ] = {
+		/* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		/* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01bb,
+	},
+	[ C(OP_PREFETCH) ] = {
+		/* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		/* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01bb,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT         */
+		[ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1062,6 +1175,17 @@ static __init int intel_pmu_init(void)
 		pr_cont("Westmere events, ");
 		break;
 
+	case 42: /* SandyBridge */
+		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		intel_pmu_lbr_init_nhm();
+
+		x86_pmu.event_constraints = intel_snb_event_constraints;
+		x86_pmu.pebs_constraints = intel_snb_pebs_events;
+		pr_cont("SandyBridge events, ");
+		break;
+
 	default:
 		/*
 		 * default constraints for v2 and up
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index b7dcd9f..8251998 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -388,6 +388,44 @@ static struct event_constraint intel_nehalem_pebs_events[] = {
 	EVENT_CONSTRAINT_END
 };
 
+static struct event_constraint intel_snb_pebs_events[] = {
+	PEBS_EVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+	PEBS_EVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+	PEBS_EVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
+	PEBS_EVENT_CONSTRAINT(0x01c4, 0xf), /* BR_INST_RETIRED.CONDITIONAL */
+	PEBS_EVENT_CONSTRAINT(0x02c4, 0xf), /* BR_INST_RETIRED.NEAR_CALL */
+	PEBS_EVENT_CONSTRAINT(0x04c4, 0xf), /* BR_INST_RETIRED.ALL_BRANCHES */
+	PEBS_EVENT_CONSTRAINT(0x08c4, 0xf), /* BR_INST_RETIRED.NEAR_RETURN */
+	PEBS_EVENT_CONSTRAINT(0x10c4, 0xf), /* BR_INST_RETIRED.NOT_TAKEN */
+	PEBS_EVENT_CONSTRAINT(0x20c4, 0xf), /* BR_INST_RETIRED.NEAR_TAKEN */
+	PEBS_EVENT_CONSTRAINT(0x40c4, 0xf), /* BR_INST_RETIRED.FAR_BRANCH */
+	PEBS_EVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */
+	PEBS_EVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
+	PEBS_EVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */
+	PEBS_EVENT_CONSTRAINT(0x10c5, 0xf), /* BR_MISP_RETIRED.NOT_TAKEN */
+	PEBS_EVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.TAKEN */
+	PEBS_EVENT_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+	PEBS_EVENT_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORE */
+	PEBS_EVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
+	PEBS_EVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
+	PEBS_EVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
+	PEBS_EVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
+	PEBS_EVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
+	PEBS_EVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
+	PEBS_EVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
+	PEBS_EVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
+	PEBS_EVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */
+	PEBS_EVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */
+	PEBS_EVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.LLC_HIT */
+	PEBS_EVENT_CONSTRAINT(0x40d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */
+	PEBS_EVENT_CONSTRAINT(0x01d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */
+	PEBS_EVENT_CONSTRAINT(0x02d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */
+	PEBS_EVENT_CONSTRAINT(0x04d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM */
+	PEBS_EVENT_CONSTRAINT(0x08d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_NONE */
+	PEBS_EVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
+	EVENT_CONSTRAINT_END
+};
+
 static struct event_constraint *
 intel_pebs_constraints(struct perf_event *event)
 {
-- 
cgit v0.10.2


From 0a10247914a5cad3caf7ef8a255c54c4d3ed2062 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 26 Feb 2011 04:51:54 +0100
Subject: perf: Set filters before mmaping events

We currently set the filters after we mmap the events, this is a
race that let undesired events record themselves in the buffer before
we had the time to set the filters.

So set the filters before they can be recorded. That also librarizes
the filters setting so that filtering can be done more easily
from other tools than perf record later.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index db4cd1e..d40a81e 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -180,12 +180,10 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
 
 static void create_counter(struct perf_evsel *evsel, int cpu)
 {
-	char *filter = evsel->filter;
 	struct perf_event_attr *attr = &evsel->attr;
 	struct perf_header_attr *h_attr;
 	struct perf_sample_id *sid;
 	int thread_index;
-	int ret;
 
 	for (thread_index = 0; thread_index < evsel_list->threads->nr; thread_index++) {
 		h_attr = get_header_attr(attr, evsel->idx);
@@ -204,16 +202,6 @@ static void create_counter(struct perf_evsel *evsel, int cpu)
 			pr_warning("Not enough memory to add id\n");
 			exit(-1);
 		}
-
-		if (filter != NULL) {
-			ret = ioctl(FD(evsel, cpu, thread_index),
-				    PERF_EVENT_IOC_SET_FILTER, filter);
-			if (ret) {
-				error("failed to set filter with %d (%s)\n", errno,
-						strerror(errno));
-				exit(-1);
-			}
-		}
 	}
 
 	if (!sample_type)
@@ -367,6 +355,12 @@ try_again:
 		}
 	}
 
+	if (perf_evlist__set_filters(evlist)) {
+		error("failed to set filter with %d (%s)\n", errno,
+			strerror(errno));
+		exit(-1);
+	}
+
 	if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
 		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 95b21fe..030ae7f 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -348,3 +348,31 @@ void perf_evlist__delete_maps(struct perf_evlist *evlist)
 	evlist->cpus	= NULL;
 	evlist->threads = NULL;
 }
+
+int perf_evlist__set_filters(struct perf_evlist *evlist)
+{
+	const struct thread_map *threads = evlist->threads;
+	const struct cpu_map *cpus = evlist->cpus;
+	struct perf_evsel *evsel;
+	char *filter;
+	int thread;
+	int cpu;
+	int err;
+	int fd;
+
+	list_for_each_entry(evsel, &evlist->entries, node) {
+		filter = evsel->filter;
+		if (!filter)
+			continue;
+		for (cpu = 0; cpu < cpus->nr; cpu++) {
+			for (thread = 0; thread < threads->nr; thread++) {
+				fd = FD(evsel, cpu, thread);
+				err = ioctl(fd, PERF_EVENT_IOC_SET_FILTER, filter);
+				if (err)
+					return err;
+			}
+		}
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index c988405..b75805a 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -60,5 +60,6 @@ static inline void perf_evlist__set_maps(struct perf_evlist *evlist,
 int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid,
 			     pid_t target_tid, const char *cpu_list);
 void perf_evlist__delete_maps(struct perf_evlist *evlist);
+int perf_evlist__set_filters(struct perf_evlist *evlist);
 
 #endif /* __PERF_EVLIST_H */
-- 
cgit v0.10.2


From ce0033307f1b45e23e0c149f56ea4855eb4687ce Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 2 Mar 2011 11:22:14 +0100
Subject: x86-64, NUMA: Fix distance table handling

NUMA distance table handling has the following problems.

* numa_reset_distance() uses numa_distance * sizeof(numa_distance[0])
  as the table size when it should be using the square of
  numa_distance.

* The same size miscalculation when allocation space for phys_dist in
  numa_emulation().

* In numa_emulation(), phys_dist must be reserved; otherwise, the new
  emulated distance table may overlap it.

Fix them and, while at it, take numa_distance_cnt resetting in
numa_reset_distance() out of the if block to simplify the code a bit.

David Rientjes reported incorrect handling of distance table during
emulation.

-tj: Edited out numa_alloc_distance() related changes which weren't
     necessary and rewrote patch description.

-v2: Ingo was unhappy with 80-column limit induced linebreaks.  Let
     lines run over 80-column.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Reported-by: David Rientjes <rientjes@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: David Rientjes <rientjes@google.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7757d22..541746f 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -390,14 +390,12 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
  */
 void __init numa_reset_distance(void)
 {
-	size_t size;
+	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
 
-	if (numa_distance_cnt) {
-		size = numa_distance_cnt * sizeof(numa_distance[0]);
+	if (numa_distance_cnt)
 		memblock_x86_free_range(__pa(numa_distance),
 					__pa(numa_distance) + size);
-		numa_distance_cnt = 0;
-	}
+	numa_distance_cnt = 0;
 	numa_distance = NULL;
 }
 
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 607a2e8..0afa25d 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -300,6 +300,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	static struct numa_meminfo pi __initdata;
 	const u64 max_addr = max_pfn << PAGE_SHIFT;
 	u8 *phys_dist = NULL;
+	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
 	int i, j, ret;
 
 	if (!emu_cmdline)
@@ -336,21 +337,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 		goto no_emu;
 	}
 
-	/*
-	 * Copy the original distance table.  It's temporary so no need to
-	 * reserve it.
-	 */
+	/* copy the physical distance table */
 	if (numa_dist_cnt) {
-		size_t size = numa_dist_cnt * sizeof(phys_dist[0]);
 		u64 phys;
 
 		phys = memblock_find_in_range(0,
 					      (u64)max_pfn_mapped << PAGE_SHIFT,
-					      size, PAGE_SIZE);
+					      phys_size, PAGE_SIZE);
 		if (phys == MEMBLOCK_ERROR) {
 			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
 			goto no_emu;
 		}
+		memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
 		phys_dist = __va(phys);
 
 		for (i = 0; i < numa_dist_cnt; i++)
@@ -398,6 +396,10 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 			numa_set_distance(i, j, dist);
 		}
 	}
+
+	/* free the copied physical distance table */
+	if (phys_dist)
+		memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
 	return;
 
 no_emu:
-- 
cgit v0.10.2


From eb8c1e2c830fc25c93bc94e215ed387fe142a98d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 2 Mar 2011 11:32:47 +0100
Subject: x86-64, NUMA: Better explain numa_distance handling

Handling of out-of-bounds distances and allocation failure can use
better documentation.  Add it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 541746f..74064e8 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -392,11 +392,12 @@ void __init numa_reset_distance(void)
 {
 	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
 
+	/* numa_distance could be 1LU marking allocation failure, test cnt */
 	if (numa_distance_cnt)
 		memblock_x86_free_range(__pa(numa_distance),
 					__pa(numa_distance) + size);
 	numa_distance_cnt = 0;
-	numa_distance = NULL;
+	numa_distance = NULL;	/* enable table creation */
 }
 
 static int __init numa_alloc_distance(void)
@@ -447,6 +448,14 @@ static int __init numa_alloc_distance(void)
  * Set the distance from node @from to @to to @distance.  If distance table
  * doesn't exist, one which is large enough to accomodate all the currently
  * known nodes will be created.
+ *
+ * If such table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node at the time of
+ * table creation or @distance doesn't make sense, the call is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
  */
 void __init numa_set_distance(int from, int to, int distance)
 {
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 0afa25d..aeecea9 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -379,7 +379,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
 			emu_nid_to_phys[i] = 0;
 
-	/* transform distance table */
+	/*
+	 * Transform distance table.  numa_set_distance() ignores all
+	 * out-of-bound distances.  Just call it for every possible node
+	 * combination.
+	 */
 	numa_reset_distance();
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		for (j = 0; j < MAX_NUMNODES; j++) {
-- 
cgit v0.10.2


From 5cd10e7946d28cfc42442fee2e6c757e244d756e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 2 Mar 2011 16:58:30 +0100
Subject: hrtimer: Update base[CLOCK_BOOTTIME].offset correctly

We calculate the current time of each clock base by adding an offset
to clock_monotonic. The offset for the clock bases is set in
retrigger_next_event() which is called when we switch a cpu to highres
mode or when the clock was set.

Add the missing update for clock boottime.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <johnstul@us.ibm.com>

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 4c53af1..f679a2d 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -635,6 +635,8 @@ static void retrigger_next_event(void *arg)
 	raw_spin_lock(&base->lock);
 	base->clock_base[HRTIMER_BASE_REALTIME].offset =
 		timespec_to_ktime(realtime_offset);
+	base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
+		timespec_to_ktime(sleep);
 
 	hrtimer_force_reprogram(base, 0);
 	raw_spin_unlock(&base->lock);
-- 
cgit v0.10.2


From d04c579f971bf7d995db1ef7a7161c0143068859 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Thu, 3 Mar 2011 10:55:29 +0000
Subject: x86: Work around old gas bug

Add extra parentheses around a couple of definitions introduced
by "x86: Cleanup vector usage" and used in assembly macro
arguments, and remove spaces. Without that old (2.16.1) gas
would see more macro arguments than were actually specified.

Reported-and-tested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Shaohua Li <shaohua.li@intel.com>
LKML-Reference: <4D6F81B10200007800034B0B@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4980f48..6e976ee 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -126,14 +126,14 @@
 
 /* up to 32 vectors used for spreading out TLB flushes: */
 #if NR_CPUS <= 32
-# define NUM_INVALIDATE_TLB_VECTORS NR_CPUS
+# define NUM_INVALIDATE_TLB_VECTORS	(NR_CPUS)
 #else
-# define NUM_INVALIDATE_TLB_VECTORS 32
+# define NUM_INVALIDATE_TLB_VECTORS	(32)
 #endif
 
-#define INVALIDATE_TLB_VECTOR_END	0xee
+#define INVALIDATE_TLB_VECTOR_END	(0xee)
 #define INVALIDATE_TLB_VECTOR_START	\
-	(INVALIDATE_TLB_VECTOR_END - NUM_INVALIDATE_TLB_VECTORS + 1)
+	(INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
 
 #define NR_VECTORS			 256
 
-- 
cgit v0.10.2


From 6eaa412f2753d98566b777836a98c6e7f672a3bb Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 18 Jan 2011 20:09:41 -0500
Subject: xen: Mark all initial reserved pages for the balloon as
 INVALID_P2M_ENTRY.

With this patch, we diligently set regions that will be used by the
balloon driver to be INVALID_P2M_ENTRY and under the ownership
of the balloon driver. We are OK using the __set_phys_to_machine
as we do not expect to be allocating any P2M middle or entries pages.
The set_phys_to_machine has the side-effect of potentially allocating
new pages and we do not want that at this stage.

We can do this because xen_build_mfn_list_list will have already
allocated all such pages up to xen_max_p2m_pfn.

We also move the check for auto translated physmap down the
stack so it is present in __set_phys_to_machine.

[v2: Rebased with mmu->p2m code split]
Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index f25bdf2..8ea9772 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -41,6 +41,7 @@ extern unsigned int   machine_to_phys_order;
 
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 
 extern int m2p_add_override(unsigned long mfn, struct page *page);
 extern int m2p_remove_override(struct page *page);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5e92b61..0180ae8 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2074,7 +2074,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
 			in_frames[i] = virt_to_mfn(vaddr);
 
 		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
-		set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
 
 		if (out_frames)
 			out_frames[i] = virt_to_pfn(vaddr);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index ddc81a0..df4e367 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -365,6 +365,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
 	unsigned topidx, mididx, idx;
 
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return true;
+	}
 	if (unlikely(pfn >= MAX_P2M_PFN)) {
 		BUG_ON(mfn != INVALID_P2M_ENTRY);
 		return true;
@@ -384,11 +388,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 
 bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
-	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
-		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-		return true;
-	}
-
 	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
 		if (!alloc_p2m(pfn))
 			return false;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b5a7f92..7201800 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
 
 static __init void xen_add_extra_mem(unsigned long pages)
 {
+	unsigned long pfn;
+
 	u64 size = (u64)pages * PAGE_SIZE;
 	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
 
@@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages)
 	xen_extra_mem_size += size;
 
 	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+
+	for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
+		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 }
 
 static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
@@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
 		WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
 		     start, end, ret);
 		if (ret == 1) {
-			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 			len++;
 		}
 	}
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 43f9f02..b1661cd 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -296,7 +296,7 @@ static int decrease_reservation(unsigned long nr_pages)
 	/* No more mappings: invalidate P2M and add to balloon. */
 	for (i = 0; i < nr_pages; i++) {
 		pfn = mfn_to_pfn(frame_list[i]);
-		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 		balloon_append(pfn_to_page(pfn));
 	}
 
-- 
cgit v0.10.2


From 3f2a230caf21a1f7ac75f9e4892d0e5af9ccee88 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 11 Jan 2011 17:20:13 +0000
Subject: xen: handled remapped IRQs when enabling a pcifront PCI device.

This happens to not be an issue currently because we take pains to try
to ensure that the GSI-IRQ mapping is 1-1 in a PV guest and that
regular event channels do not clash. However a subsequent patch is
going to break this 1-1 mapping.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a0..2a12f3d 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -226,21 +226,27 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
 {
 	int rc;
 	int share = 1;
+	u8 gsi;
 
-	dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq);
-
-	if (dev->irq < 0)
-		return -EINVAL;
+	rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+	if (rc < 0) {
+		dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
+			 rc);
+		return rc;
+	}
 
-	if (dev->irq < NR_IRQS_LEGACY)
+	if (gsi < NR_IRQS_LEGACY)
 		share = 0;
 
-	rc = xen_allocate_pirq(dev->irq, share, "pcifront");
+	rc = xen_allocate_pirq(gsi, share, "pcifront");
 	if (rc < 0) {
-		dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n",
-			 dev->irq, rc);
+		dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n",
+			 gsi, rc);
 		return rc;
 	}
+
+	dev->irq = rc;
+	dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
 	return 0;
 }
 
-- 
cgit v0.10.2


From cbf6aa89fc52c5253ee141d53eeb73147eb37ac0 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 11 Jan 2011 17:20:14 +0000
Subject: xen:events: move find_unbound_irq inside CONFIG_PCI_MSI

The only caller is xen_allocate_pirq_msi which is also under this
ifdef so this fixes:
    drivers/xen/events.c:377: warning: 'find_unbound_pirq' defined but not used
when CONFIG_PCI_MSI=n

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 7468147..1ae7757 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -387,23 +387,6 @@ static int get_nr_hw_irqs(void)
 	return ret;
 }
 
-static int find_unbound_pirq(int type)
-{
-	int rc, i;
-	struct physdev_get_free_pirq op_get_free_pirq;
-	op_get_free_pirq.type = type;
-
-	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
-	if (!rc)
-		return op_get_free_pirq.pirq;
-
-	for (i = 0; i < nr_irqs; i++) {
-		if (pirq_to_irq[i] < 0)
-			return i;
-	}
-	return -1;
-}
-
 static int find_unbound_irq(void)
 {
 	struct irq_data *data;
@@ -677,6 +660,23 @@ out:
 #include <linux/msi.h>
 #include "../pci/msi.h"
 
+static int find_unbound_pirq(int type)
+{
+	int rc, i;
+	struct physdev_get_free_pirq op_get_free_pirq;
+	op_get_free_pirq.type = type;
+
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
+	if (!rc)
+		return op_get_free_pirq.pirq;
+
+	for (i = 0; i < nr_irqs; i++) {
+		if (pirq_to_irq[i] < 0)
+			return i;
+	}
+	return -1;
+}
+
 void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
 {
 	spin_lock(&irq_mapping_update_lock);
-- 
cgit v0.10.2


From c9df1ce585e3bb5a2f101c1d87381b285a9f962f Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 11 Jan 2011 17:20:15 +0000
Subject: xen: events: add xen_allocate_irq_{dynamic, gsi} and xen_free_irq

This is neater than open-coded calls to irq_alloc_desc_at and
irq_free_desc.

No intended behavioural change.

Note that we previously were not checking the return value of
irq_alloc_desc_at which would be failing for GSI<NR_IRQS_LEGACY
because the core architecture code has already allocated those for
us. Hence the additional check against NR_IRQS_LEGACY in
xen_allocate_irq_gsi.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 1ae7757..81a53eb 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -387,7 +387,7 @@ static int get_nr_hw_irqs(void)
 	return ret;
 }
 
-static int find_unbound_irq(void)
+static int xen_allocate_irq_dynamic(void)
 {
 	struct irq_data *data;
 	int irq, res;
@@ -436,6 +436,30 @@ static bool identity_mapped_irq(unsigned irq)
 	return irq < get_nr_hw_irqs();
 }
 
+static int xen_allocate_irq_gsi(unsigned gsi)
+{
+	int irq;
+
+	if (!identity_mapped_irq(gsi) &&
+	    (xen_initial_domain() || !xen_pv_domain()))
+		return xen_allocate_irq_dynamic();
+
+	/* Legacy IRQ descriptors are already allocated by the arch. */
+	if (gsi < NR_IRQS_LEGACY)
+		return gsi;
+
+	irq = irq_alloc_desc_at(gsi, -1);
+	if (irq < 0)
+		panic("Unable to allocate to IRQ%d (%d)\n", gsi, irq);
+
+	return irq;
+}
+
+static void xen_free_irq(unsigned irq)
+{
+	irq_free_desc(irq);
+}
+
 static void pirq_unmask_notify(int irq)
 {
 	struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) };
@@ -621,14 +645,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
 		goto out;	/* XXX need refcount? */
 	}
 
-	/* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
-	 * we are using the !xen_initial_domain() to drop in the function.*/
-	if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
-				xen_pv_domain())) {
-		irq = gsi;
-		irq_alloc_desc_at(irq, -1);
-	} else
-		irq = find_unbound_irq();
+	irq = xen_allocate_irq_gsi(gsi);
 
 	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
 				      handle_level_irq, name);
@@ -641,7 +658,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
 	 * this in the priv domain. */
 	if (xen_initial_domain() &&
 	    HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
-		irq_free_desc(irq);
+		xen_free_irq(irq);
 		irq = -ENOSPC;
 		goto out;
 	}
@@ -682,7 +699,7 @@ void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
 	spin_lock(&irq_mapping_update_lock);
 
 	if (alloc & XEN_ALLOC_IRQ) {
-		*irq = find_unbound_irq();
+		*irq = xen_allocate_irq_dynamic();
 		if (*irq == -1)
 			goto out;
 	}
@@ -732,7 +749,7 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
 
 	spin_lock(&irq_mapping_update_lock);
 
-	irq = find_unbound_irq();
+	irq = xen_allocate_irq_dynamic();
 
 	if (irq == -1)
 		goto out;
@@ -741,7 +758,7 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
 	if (rc) {
 		printk(KERN_WARNING "xen map irq failed %d\n", rc);
 
-		irq_free_desc(irq);
+		xen_free_irq(irq);
 
 		irq = -1;
 		goto out;
@@ -783,7 +800,7 @@ int xen_destroy_irq(int irq)
 	}
 	irq_info[irq] = mk_unbound_info();
 
-	irq_free_desc(irq);
+	xen_free_irq(irq);
 
 out:
 	spin_unlock(&irq_mapping_update_lock);
@@ -814,7 +831,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
 	irq = evtchn_to_irq[evtchn];
 
 	if (irq == -1) {
-		irq = find_unbound_irq();
+		irq = xen_allocate_irq_dynamic();
 
 		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
 					      handle_fasteoi_irq, "event");
@@ -839,7 +856,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 	irq = per_cpu(ipi_to_irq, cpu)[ipi];
 
 	if (irq == -1) {
-		irq = find_unbound_irq();
+		irq = xen_allocate_irq_dynamic();
 		if (irq < 0)
 			goto out;
 
@@ -875,7 +892,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
 	irq = per_cpu(virq_to_irq, cpu)[virq];
 
 	if (irq == -1) {
-		irq = find_unbound_irq();
+		irq = xen_allocate_irq_dynamic();
 
 		set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
 					      handle_percpu_irq, "virq");
@@ -934,7 +951,7 @@ static void unbind_from_irq(unsigned int irq)
 	if (irq_info[irq].type != IRQT_UNBOUND) {
 		irq_info[irq] = mk_unbound_info();
 
-		irq_free_desc(irq);
+		xen_free_irq(irq);
 	}
 
 	spin_unlock(&irq_mapping_update_lock);
-- 
cgit v0.10.2


From 89911501f3aae44a43984793341a3bf1f4c583c2 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Thu, 3 Mar 2011 11:57:44 -0500
Subject: xen: events: allocate GSIs and dynamic IRQs from separate IRQ ranges.

There are three cases which we need to care about, PV guest, PV domain
0 and HVM guest.

The PV guest case is simple since it has no access to ACPI or real
APICs and therefore has no GSIs therefore we simply dynamically
allocate all IRQs. The potentially interesting case here is PIRQ type
event channels associated with passed through PCI devices. However
even in this case the guest has no direct interaction with the
physical GSI since that happens in the PCI backend.

The PV domain 0 and HVM guest cases are actually the same. In domain 0
case the kernel sees the host ACPI and GSIs (although it only sees the
APIC indirectly via the hypervisor) and in the HVM guest case it sees
the virtualised ACPI and emulated APICs. In these cases we start
allocating dynamic IRQs at nr_irqs_gsi so that they cannot clash with
any GSI.

Currently xen_allocate_irq_dynamic starts at nr_irqs and works
backwards looking for a free IRQ in order to (try and) avoid clashing
with GSIs used in domain 0 and in HVM guests. This change avoids that
although we retain the behaviour of allowing dynamic IRQs to encroach
on the GSI range if no suitable IRQs are available since a future IRQ
clash is deemed preferable to failure right now.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 81a53eb..06f2e61 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -376,72 +376,49 @@ static void unmask_evtchn(int port)
 	put_cpu();
 }
 
-static int get_nr_hw_irqs(void)
+static int xen_allocate_irq_dynamic(void)
 {
-	int ret = 1;
+	int first = 0;
+	int irq;
 
 #ifdef CONFIG_X86_IO_APIC
-	ret = get_nr_irqs_gsi();
+	/*
+	 * For an HVM guest or domain 0 which see "real" (emulated or
+	 * actual repectively) GSIs we allocate dynamic IRQs
+	 * e.g. those corresponding to event channels or MSIs
+	 * etc. from the range above those "real" GSIs to avoid
+	 * collisions.
+	 */
+	if (xen_initial_domain() || xen_hvm_domain())
+		first = get_nr_irqs_gsi();
 #endif
 
-	return ret;
-}
+retry:
+	irq = irq_alloc_desc_from(first, -1);
 
-static int xen_allocate_irq_dynamic(void)
-{
-	struct irq_data *data;
-	int irq, res;
-	int bottom = get_nr_hw_irqs();
-	int top = nr_irqs-1;
-
-	if (bottom == nr_irqs)
-		goto no_irqs;
-
-	/* This loop starts from the top of IRQ space and goes down.
-	 * We need this b/c if we have a PCI device in a Xen PV guest
-	 * we do not have an IO-APIC (though the backend might have them)
-	 * mapped in. To not have a collision of physical IRQs with the Xen
-	 * event channels start at the top of the IRQ space for virtual IRQs.
-	 */
-	for (irq = top; irq > bottom; irq--) {
-		data = irq_get_irq_data(irq);
-		/* only 15->0 have init'd desc; handle irq > 16 */
-		if (!data)
-			break;
-		if (data->chip == &no_irq_chip)
-			break;
-		if (data->chip != &xen_dynamic_chip)
-			continue;
-		if (irq_info[irq].type == IRQT_UNBOUND)
-			return irq;
+	if (irq == -ENOMEM && first > NR_IRQS_LEGACY) {
+		printk(KERN_ERR "Out of dynamic IRQ space and eating into GSI space. You should increase nr_irqs\n");
+		first = max(NR_IRQS_LEGACY, first - NR_IRQS_LEGACY);
+		goto retry;
 	}
 
-	if (irq == bottom)
-		goto no_irqs;
-
-	res = irq_alloc_desc_at(irq, -1);
-
-	if (WARN_ON(res != irq))
-		return -1;
+	if (irq < 0)
+		panic("No available IRQ to bind to: increase nr_irqs!\n");
 
 	return irq;
-
-no_irqs:
-	panic("No available IRQ to bind to: increase nr_irqs!\n");
-}
-
-static bool identity_mapped_irq(unsigned irq)
-{
-	/* identity map all the hardware irqs */
-	return irq < get_nr_hw_irqs();
 }
 
 static int xen_allocate_irq_gsi(unsigned gsi)
 {
 	int irq;
 
-	if (!identity_mapped_irq(gsi) &&
-	    (xen_initial_domain() || !xen_pv_domain()))
+	/*
+	 * A PV guest has no concept of a GSI (since it has no ACPI
+	 * nor access to/knowledge of the physical APICs). Therefore
+	 * all IRQs are dynamically allocated from the entire IRQ
+	 * space.
+	 */
+	if (xen_pv_domain() && !xen_initial_domain())
 		return xen_allocate_irq_dynamic();
 
 	/* Legacy IRQ descriptors are already allocated by the arch. */
-- 
cgit v0.10.2


From 7214610475b2847a81478d96e4d3ba0bbe49598c Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@eu.citrix.com>
Date: Thu, 3 Feb 2011 09:49:35 +0000
Subject: xen: events: do not free legacy IRQs

c514d00c8057 "xen: events: add xen_allocate_irq_{dynamic, gsi} and
xen_free_irq" correctly avoids reallocating legacy IRQs (which are
managed by the arch core) but erroneously did not prevent them being
freed.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 06f2e61..accb37a 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -434,6 +434,10 @@ static int xen_allocate_irq_gsi(unsigned gsi)
 
 static void xen_free_irq(unsigned irq)
 {
+	/* Legacy IRQ descriptors are managed by the arch. */
+	if (irq < NR_IRQS_LEGACY)
+		return;
+
 	irq_free_desc(irq);
 }
 
-- 
cgit v0.10.2


From 149f256f8ca690c28dd8aa9fb8bcdaf2e93b1e1c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 5 Feb 2011 20:08:52 +0000
Subject: xen: Remove stale irq_chip.end

irq_chip.end got obsolete with the removal of __do_IRQ()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index accb37a..c8826b5 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -555,23 +555,6 @@ static void ack_pirq(unsigned int irq)
 	}
 }
 
-static void end_pirq(unsigned int irq)
-{
-	int evtchn = evtchn_from_irq(irq);
-	struct irq_desc *desc = irq_to_desc(irq);
-
-	if (WARN_ON(!desc))
-		return;
-
-	if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
-	    (IRQ_DISABLED|IRQ_PENDING)) {
-		shutdown_pirq(irq);
-	} else if (VALID_EVTCHN(evtchn)) {
-		unmask_evtchn(evtchn);
-		pirq_unmask_notify(irq);
-	}
-}
-
 static int find_irq_by_gsi(unsigned gsi)
 {
 	int irq;
@@ -1508,7 +1491,6 @@ static struct irq_chip xen_pirq_chip __read_mostly = {
 	.mask		= disable_pirq,
 
 	.ack		= ack_pirq,
-	.end		= end_pirq,
 
 	.set_affinity	= set_affinity_irq,
 
-- 
cgit v0.10.2


From c9e265e030537167c94cbed190826f02e3887f4d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 5 Feb 2011 20:08:54 +0000
Subject: xen: Switch to new irq_chip functions

Convert Xen to the new irq_chip functions. Brings us closer to enable
CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index c8826b5..cf1712f 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -277,7 +277,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 
 	BUG_ON(irq == -1);
 #ifdef CONFIG_SMP
-	cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
+	cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu));
 #endif
 
 	clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq)));
@@ -294,7 +294,7 @@ static void init_evtchn_cpu_bindings(void)
 
 	/* By default all event channels notify CPU#0. */
 	for_each_irq_desc(i, desc) {
-		cpumask_copy(desc->affinity, cpumask_of(0));
+		cpumask_copy(desc->irq_data.affinity, cpumask_of(0));
 	}
 #endif
 
@@ -474,7 +474,7 @@ static bool probing_irq(int irq)
 	return desc && desc->action == NULL;
 }
 
-static unsigned int startup_pirq(unsigned int irq)
+static unsigned int __startup_pirq(unsigned int irq)
 {
 	struct evtchn_bind_pirq bind_pirq;
 	struct irq_info *info = info_for_irq(irq);
@@ -512,9 +512,15 @@ out:
 	return 0;
 }
 
-static void shutdown_pirq(unsigned int irq)
+static unsigned int startup_pirq(struct irq_data *data)
+{
+	return __startup_pirq(data->irq);
+}
+
+static void shutdown_pirq(struct irq_data *data)
 {
 	struct evtchn_close close;
+	unsigned int irq = data->irq;
 	struct irq_info *info = info_for_irq(irq);
 	int evtchn = evtchn_from_irq(irq);
 
@@ -534,20 +540,20 @@ static void shutdown_pirq(unsigned int irq)
 	info->evtchn = 0;
 }
 
-static void enable_pirq(unsigned int irq)
+static void enable_pirq(struct irq_data *data)
 {
-	startup_pirq(irq);
+	startup_pirq(data);
 }
 
-static void disable_pirq(unsigned int irq)
+static void disable_pirq(struct irq_data *data)
 {
 }
 
-static void ack_pirq(unsigned int irq)
+static void ack_pirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);
 
-	move_native_irq(irq);
+	irq_move_irq(data);
 
 	if (VALID_EVTCHN(evtchn)) {
 		mask_evtchn(evtchn);
@@ -1215,11 +1221,12 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 	return 0;
 }
 
-static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
+static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
+			    bool force)
 {
 	unsigned tcpu = cpumask_first(dest);
 
-	return rebind_irq_to_cpu(irq, tcpu);
+	return rebind_irq_to_cpu(data->irq, tcpu);
 }
 
 int resend_irq_on_evtchn(unsigned int irq)
@@ -1238,35 +1245,35 @@ int resend_irq_on_evtchn(unsigned int irq)
 	return 1;
 }
 
-static void enable_dynirq(unsigned int irq)
+static void enable_dynirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);
 
 	if (VALID_EVTCHN(evtchn))
 		unmask_evtchn(evtchn);
 }
 
-static void disable_dynirq(unsigned int irq)
+static void disable_dynirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);
 
 	if (VALID_EVTCHN(evtchn))
 		mask_evtchn(evtchn);
 }
 
-static void ack_dynirq(unsigned int irq)
+static void ack_dynirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);
 
-	move_masked_irq(irq);
+	move_masked_irq(data->irq);
 
 	if (VALID_EVTCHN(evtchn))
 		unmask_evtchn(evtchn);
 }
 
-static int retrigger_dynirq(unsigned int irq)
+static int retrigger_dynirq(struct irq_data *data)
 {
-	int evtchn = evtchn_from_irq(irq);
+	int evtchn = evtchn_from_irq(data->irq);
 	struct shared_info *sh = HYPERVISOR_shared_info;
 	int ret = 0;
 
@@ -1315,7 +1322,7 @@ static void restore_cpu_pirqs(void)
 
 		printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
 
-		startup_pirq(irq);
+		__startup_pirq(irq);
 	}
 }
 
@@ -1467,44 +1474,44 @@ void xen_irq_resume(void)
 }
 
 static struct irq_chip xen_dynamic_chip __read_mostly = {
-	.name		= "xen-dyn",
+	.name			= "xen-dyn",
 
-	.disable	= disable_dynirq,
-	.mask		= disable_dynirq,
-	.unmask		= enable_dynirq,
+	.irq_disable		= disable_dynirq,
+	.irq_mask		= disable_dynirq,
+	.irq_unmask		= enable_dynirq,
 
-	.eoi		= ack_dynirq,
-	.set_affinity	= set_affinity_irq,
-	.retrigger	= retrigger_dynirq,
+	.irq_eoi		= ack_dynirq,
+	.irq_set_affinity	= set_affinity_irq,
+	.irq_retrigger		= retrigger_dynirq,
 };
 
 static struct irq_chip xen_pirq_chip __read_mostly = {
-	.name		= "xen-pirq",
+	.name			= "xen-pirq",
 
-	.startup	= startup_pirq,
-	.shutdown	= shutdown_pirq,
+	.irq_startup		= startup_pirq,
+	.irq_shutdown		= shutdown_pirq,
 
-	.enable		= enable_pirq,
-	.unmask		= enable_pirq,
+	.irq_enable		= enable_pirq,
+	.irq_unmask		= enable_pirq,
 
-	.disable	= disable_pirq,
-	.mask		= disable_pirq,
+	.irq_disable		= disable_pirq,
+	.irq_mask		= disable_pirq,
 
-	.ack		= ack_pirq,
+	.irq_ack		= ack_pirq,
 
-	.set_affinity	= set_affinity_irq,
+	.irq_set_affinity	= set_affinity_irq,
 
-	.retrigger	= retrigger_dynirq,
+	.irq_retrigger		= retrigger_dynirq,
 };
 
 static struct irq_chip xen_percpu_chip __read_mostly = {
-	.name		= "xen-percpu",
+	.name			= "xen-percpu",
 
-	.disable	= disable_dynirq,
-	.mask		= disable_dynirq,
-	.unmask		= enable_dynirq,
+	.irq_disable		= disable_dynirq,
+	.irq_mask		= disable_dynirq,
+	.irq_unmask		= enable_dynirq,
 
-	.ack		= ack_dynirq,
+	.irq_ack		= ack_dynirq,
 };
 
 int xen_set_callback_via(uint64_t via)
-- 
cgit v0.10.2


From aa673c1cb3a66d0b37595251c4e8bb688efc8726 Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Mon, 7 Feb 2011 11:08:39 +0000
Subject: xen: Fix compile error introduced by "switch to new irq_chip
 functions"

drivers/xen/events.c: In function 'ack_pirq':
drivers/xen/events.c:568: error: implicit declaration of function 'irq_move_irq'

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index cf1712f..5aa422a 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -553,7 +553,7 @@ static void ack_pirq(struct irq_data *data)
 {
 	int evtchn = evtchn_from_irq(data->irq);
 
-	irq_move_irq(data);
+	move_native_irq(data->irq);
 
 	if (VALID_EVTCHN(evtchn)) {
 		mask_evtchn(evtchn);
-- 
cgit v0.10.2


From f611f2da99420abc973c32cdbddbf5c365d0a20c Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Tue, 8 Feb 2011 14:03:31 +0000
Subject: xen/timer: Missing IRQF_NO_SUSPEND in timer code broke suspend.

The patches missed an indirect use of IRQF_NO_SUSPEND pulled in via
IRQF_TIMER. The following patch fixes the issue.

With this fixlet PV guest migration works just fine. I also booted the
entire series as a dom0 kernel and it appeared fine.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 067759e..2e2d370 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -397,7 +397,9 @@ void xen_setup_timer(int cpu)
 		name = "<timer kasprintf failed>";
 
 	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
-				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
+				      IRQF_DISABLED|IRQF_PERCPU|
+				      IRQF_NOBALANCING|IRQF_TIMER|
+				      IRQF_FORCE_RESUME,
 				      name, NULL);
 
 	evt = &per_cpu(xen_clock_events, cpu);
-- 
cgit v0.10.2


From 676dc3cf5bc36a9e129a3ad8fe3bd7b2ebf20f5d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 5 Feb 2011 20:08:59 +0000
Subject: xen: Use IRQF_FORCE_RESUME

Mark the IRQF_NO_SUSPEND interrupts IRQF_FORCE_RESUME and remove the extra
walk through the interrupt descriptors.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 5aa422a..975e90f 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -977,7 +977,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
 	if (irq < 0)
 		return irq;
 
-	irqflags |= IRQF_NO_SUSPEND;
+	irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME;
 	retval = request_irq(irq, handler, irqflags, devname, dev_id);
 	if (retval != 0) {
 		unbind_from_irq(irq);
@@ -1433,7 +1433,6 @@ void xen_poll_irq(int irq)
 void xen_irq_resume(void)
 {
 	unsigned int cpu, irq, evtchn;
-	struct irq_desc *desc;
 
 	init_evtchn_cpu_bindings();
 
@@ -1453,23 +1452,6 @@ void xen_irq_resume(void)
 		restore_cpu_ipis(cpu);
 	}
 
-	/*
-	 * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
-	 * are not handled by the IRQ core.
-	 */
-	for_each_irq_desc(irq, desc) {
-		if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
-			continue;
-		if (desc->status & IRQ_DISABLED)
-			continue;
-
-		evtchn = evtchn_from_irq(irq);
-		if (evtchn == -1)
-			continue;
-
-		unmask_evtchn(evtchn);
-	}
-
 	restore_cpu_pirqs();
 }
 
-- 
cgit v0.10.2


From 1aa0b51a033d4a1ec6d29d06487e053398afa21b Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 17 Feb 2011 11:23:58 -0500
Subject: xen/irq: Cleanup up the pirq_to_irq for DomU PV PCI passthrough
 guests as well.

We only did this for PV guests that are xen_initial_domain() but
there is not reason not to do this for other cases. The other
case is only exercised when you pass in a PCI device to a PV guest
_and_ the device in question.

Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 975e90f..89987a7 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -766,8 +766,9 @@ int xen_destroy_irq(int irq)
 			printk(KERN_WARNING "unmap irq failed %d\n", rc);
 			goto out;
 		}
-		pirq_to_irq[info->u.pirq.pirq] = -1;
 	}
+	pirq_to_irq[info->u.pirq.pirq] = -1;
+
 	irq_info[irq] = mk_unbound_info();
 
 	xen_free_irq(irq);
-- 
cgit v0.10.2


From ff9ae1babd8ce88c3f90db6278ea5f55bdcb4624 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 25 Feb 2011 21:57:04 +0100
Subject: perf: Fix missing strndup declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

<ctype.h> is included first without _GNU_SOURCE, so it ends up
including <string.h> without declaring strndup(). And further
<string.h> declarations, even with _GNU_SOURCE defined, are
of course without effect.

Therefore:

	util/strfilter.c: Dans la fonction «strfilter_node__new» :
	util/strfilter.c:134: attention : déclaration implicite de la fonction « «strndup» »
	util/strfilter.c:134: attention : incompatible implicit declaration of built-in function «strndup»
	make: *** [util/strfilter.o] Erreur 1

Just don't include ctype.h as it doesn't appear to be necessary
anyway.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/strfilter.c b/tools/perf/util/strfilter.c
index 4064b7d..834c8eb 100644
--- a/tools/perf/util/strfilter.c
+++ b/tools/perf/util/strfilter.c
@@ -1,4 +1,3 @@
-#include <ctype.h>
 #include "util.h"
 #include "string.h"
 #include "strfilter.h"
-- 
cgit v0.10.2


From cfff2d909cbdaf8c467bd321aa0502a548ec8f7e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 25 Feb 2011 21:30:16 +0100
Subject: perf: Fix undefined PyVarObject_HEAD_INIT in python 2.5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PyVarObject_HEAD_INIT is undefined in python 2.5, resulting
in a build crash:

	util/python.c:81: attention : déclaration implicite de la fonction « «PyVarObject_HEAD_INIT» »
	util/python.c:82: erreur: request for member «tp_name» in something not a structure or union
	util/python.c:117: erreur: request for member «tp_name» in something not a structure or union
	util/python.c:146: erreur: request for member «tp_name» in something not a structure or union
	util/python.c:177: erreur: request for member «tp_name» in something not a structure or union
	util/python.c:290: erreur: request for member «tp_name» in something not a structure or union
	util/python.c:359: erreur: request for member «tp_name» in something not a structure or union
	util/python.c:532: erreur: request for member «tp_name» in something not a structure or union
	util/python.c:761: erreur: request for member «tp_name» in something not a structure or union
	error: command 'gcc' failed with exit status 1
	make: *** [python/perf.so] Erreur 1

We can fix that by defining PyVarObject_HEAD_INIT as a wrapper on
PyObject_HEAD_INIT, thanks to a trick found on biopython:
https://github.com/biopython/biopython/commit/d4eaf57946c7b4c32eca8d18821edf32f83e300d

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 5317ef2..a9f2d7e 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -8,6 +8,11 @@
 #include "cpumap.h"
 #include "thread_map.h"
 
+/* Define PyVarObject_HEAD_INIT for python 2.5 */
+#ifndef PyVarObject_HEAD_INIT
+# define PyVarObject_HEAD_INIT(type, size) PyObject_HEAD_INIT(type) size,
+#endif
+
 struct throttle_event {
 	struct perf_event_header header;
 	u64			 time;
-- 
cgit v0.10.2


From f89112502805c1f6a6955f90ad158e538edb319d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 4 Mar 2011 10:26:36 +0100
Subject: x86-64, NUMA: Revert NUMA affine page table allocation

This patch reverts NUMA affine page table allocation added by commit
1411e0ec31 (x86-64, numa: Put pgtable to local node memory).

The commit made an undocumented change where the kernel linear mapping
strictly follows intersection of e820 memory map and NUMA
configuration.  If the physical memory configuration has holes or NUMA
nodes are not properly aligned, this leads to using unnecessarily
smaller mapping size which leads to increased TLB pressure.  For
details,

  http://thread.gmane.org/gmane.linux.kernel/1104672

Patches to fix the problem have been proposed but the underlying code
needs more cleanup and the approach itself seems a bit heavy handed
and it has been determined to revert the feature for now and come back
to it in the next developement cycle.

  http://thread.gmane.org/gmane.linux.kernel/1105959

As init_memory_mapping_high() callsites have been consolidated since
the commit, reverting is done manually.  Also, the RED-PEN comment in
arch/x86/mm/init.c is not restored as the problem no longer exists
with memblock based top-down early memory allocation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 97e6007..bce688d 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -54,8 +54,6 @@ static inline phys_addr_t get_max_mapped(void)
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
-void init_memory_mapping_high(void);
-
 extern void initmem_init(void);
 extern void free_initmem(void);
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 46e684f..c3a606c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -963,6 +963,14 @@ void __init setup_arch(char **cmdline_p)
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 	max_pfn_mapped = max_low_pfn_mapped;
 
+#ifdef CONFIG_X86_64
+	if (max_pfn > max_low_pfn) {
+		max_pfn_mapped = init_memory_mapping(1UL<<32,
+						     max_pfn<<PAGE_SHIFT);
+		/* can we preseve max_low_pfn ?*/
+		max_low_pfn = max_pfn;
+	}
+#endif
 	memblock.current_limit = get_max_mapped();
 
 	/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 470cc47..c8813aa 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -606,63 +606,9 @@ kernel_physical_mapping_init(unsigned long start,
 void __init initmem_init(void)
 {
 	memblock_x86_register_active_regions(0, 0, max_pfn);
-	init_memory_mapping_high();
 }
 #endif
 
-struct mapping_work_data {
-	unsigned long start;
-	unsigned long end;
-	unsigned long pfn_mapped;
-};
-
-static int __init_refok
-mapping_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax)
-{
-	struct mapping_work_data *data = datax;
-	unsigned long pfn_mapped;
-	unsigned long final_start, final_end;
-
-	final_start = max_t(unsigned long, start_pfn<<PAGE_SHIFT, data->start);
-	final_end = min_t(unsigned long, end_pfn<<PAGE_SHIFT, data->end);
-
-	if (final_end <= final_start)
-		return 0;
-
-	pfn_mapped = init_memory_mapping(final_start, final_end);
-
-	if (pfn_mapped > data->pfn_mapped)
-		data->pfn_mapped = pfn_mapped;
-
-	return 0;
-}
-
-static unsigned long __init_refok
-init_memory_mapping_active_regions(unsigned long start, unsigned long end)
-{
-	struct mapping_work_data data;
-
-	data.start = start;
-	data.end = end;
-	data.pfn_mapped = 0;
-
-	work_with_active_regions(MAX_NUMNODES, mapping_work_fn, &data);
-
-	return data.pfn_mapped;
-}
-
-void __init_refok init_memory_mapping_high(void)
-{
-	if (max_pfn > max_low_pfn) {
-		max_pfn_mapped = init_memory_mapping_active_regions(1UL<<32,
-							 max_pfn<<PAGE_SHIFT);
-		/* can we preserve max_low_pfn ? */
-		max_low_pfn = max_pfn;
-
-		memblock.current_limit = get_max_mapped();
-	}
-}
-
 void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 74064e8..86491ba 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -543,8 +543,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 	if (!numa_meminfo_cover_memory(mi))
 		return -EINVAL;
 
-	init_memory_mapping_high();
-
 	/* Finally register nodes. */
 	for_each_node_mask(nid, node_possible_map) {
 		u64 start = (u64)max_pfn << PAGE_SHIFT;
-- 
cgit v0.10.2


From a2f5c9ab79f78e8b91ac993e0543d65b661dd19b Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhart@linux.intel.com>
Date: Tue, 22 Feb 2011 13:04:33 -0800
Subject: sched: Allow SCHED_BATCH to preempt SCHED_IDLE tasks

Perform the test for SCHED_IDLE before testing for SCHED_BATCH (and
ensure idle tasks don't preempt idle tasks) so the non-interactive,
but still important, SCHED_BATCH tasks will run in favor of the very
low priority SCHED_IDLE tasks.

Signed-off-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
Cc: Richard Purdie <richard.purdie@linuxfoundation.org>
LKML-Reference: <1298408674-3130-2-git-send-email-dvhart@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3a88dee..1438e13 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1870,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (test_tsk_need_resched(curr))
 		return;
 
+	/* Idle tasks are by definition preempted by non-idle tasks. */
+	if (unlikely(curr->policy == SCHED_IDLE) &&
+	    likely(p->policy != SCHED_IDLE))
+		goto preempt;
+
 	/*
-	 * Batch and idle tasks do not preempt (their preemption is driven by
-	 * the tick):
+	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
+	 * is driven by the tick):
 	 */
 	if (unlikely(p->policy != SCHED_NORMAL))
 		return;
 
-	/* Idle tasks are by definition preempted by everybody. */
-	if (unlikely(curr->policy == SCHED_IDLE))
-		goto preempt;
 
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
-- 
cgit v0.10.2


From c02aa73b1d18e43cfd79c2f193b225e84ca497c8 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhart@linux.intel.com>
Date: Thu, 17 Feb 2011 15:37:07 -0800
Subject: sched: Allow users with sufficient RLIMIT_NICE to change from
 SCHED_IDLE policy

The current scheduler implementation returns -EPERM when trying to
change from SCHED_IDLE to SCHED_OTHER or SCHED_BATCH. Since SCHED_IDLE
is considered to be a nice 20 on steroids, changing to another policy
should be allowed provided the RLIMIT_NICE is accounted for.

This patch allows the following test-case to pass with RLIMIT_NICE=40,
but still fail with RLIMIT_NICE=10 when the calling process is run
from a typical shell (nice 0, or 20 in rlimit terms).

int main()
{
	int ret;
	struct sched_param sp;
	sp.sched_priority = 0;

	/* switch to SCHED_IDLE */
	ret = sched_setscheduler(0, SCHED_IDLE, &sp);
	printf("setscheduler IDLE: %d\n", ret);
	if (ret) return ret;

	/* switch back to SCHED_OTHER */
	ret = sched_setscheduler(0, SCHED_OTHER, &sp);
	printf("setscheduler OTHER: %d\n", ret);

	return ret;
}

 $ ulimit -e
 40
 $ ./test
 setscheduler IDLE: 0
 setscheduler OTHER: 0

 $ ulimit -e 10
 $ ulimit -e
 10
 $ ./test
 setscheduler IDLE: 0
 setscheduler OTHER: -1

Signed-off-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Richard Purdie <richard.purdie@linuxfoundation.org>
LKML-Reference: <4D657BEE.4040608@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 0c87126..f303070 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4981,12 +4981,15 @@ recheck:
 			    param->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
+
 		/*
-		 * Like positive nice levels, dont allow tasks to
-		 * move out of SCHED_IDLE either:
+		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
+		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
-		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
-			return -EPERM;
+		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+			if (!can_nice(p, TASK_NICE(p)))
+				return -EPERM;
+		}
 
 		/* can't change other user's priorities */
 		if (!check_same_owner(p))
-- 
cgit v0.10.2


From 6d1cafd8b56ea726c10a5a104de57cc3ed8fa953 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 1 Mar 2011 16:28:21 -0800
Subject: sched: Resched proper CPU on yield_to()

yield_to_task_fair() has code to resched the CPU of yielding task when the
intention is to resched the CPU of the task that is being yielded to.

Change here fixes the problem and also makes the resched conditional on
rq != p_rq.

Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1299025701-22168-1-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index f303070..61452e8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5522,8 +5522,15 @@ again:
 		goto out;
 
 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
-	if (yielded)
+	if (yielded) {
 		schedstat_inc(rq, yld_count);
+		/*
+		 * Make p's CPU reschedule; pick_next_entity takes care of
+		 * fairness.
+		 */
+		if (preempt && rq != p_rq)
+			resched_task(p_rq->curr);
+	}
 
 out:
 	double_rq_unlock(rq, p_rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1438e13..3f7ec9e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1987,10 +1987,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 	/* Tell the scheduler that we'd really like pse to run next. */
 	set_next_buddy(se);
 
-	/* Make p's CPU reschedule; pick_next_entity takes care of fairness. */
-	if (preempt)
-		resched_task(rq->curr);
-
 	yield_task_fair(rq);
 
 	return true;
-- 
cgit v0.10.2


From 940c5b2971de443df22eed0441bc74fb0116e9f5 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Sun, 27 Feb 2011 21:13:31 +0800
Subject: perf: Fix the missing event initialization when pmu is found in idr

Currently, the event is not initialized if pmu is found in idr. This
never causes bug just because now no pmu is associated with the idr
id.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1298812411.2699.9.camel@localhost>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 64a018e..821ce82 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -6098,17 +6098,22 @@ struct pmu *perf_init_event(struct perf_event *event)
 {
 	struct pmu *pmu = NULL;
 	int idx;
+	int ret;
 
 	idx = srcu_read_lock(&pmus_srcu);
 
 	rcu_read_lock();
 	pmu = idr_find(&pmu_idr, event->attr.type);
 	rcu_read_unlock();
-	if (pmu)
+	if (pmu) {
+		ret = pmu->event_init(event);
+		if (ret)
+			pmu = ERR_PTR(ret);
 		goto unlock;
+	}
 
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		int ret = pmu->event_init(event);
+		ret = pmu->event_init(event);
 		if (!ret)
 			goto unlock;
 
-- 
cgit v0.10.2


From 3db272c0494900fcb905a201180a78cae3addd6e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 3 Mar 2011 14:25:37 +0800
Subject: perf cgroup: Fix leak of file reference count

In perf_cgroup_connect(), fput_light() is missing in a failure path.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4D6F3461.6060406@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 821ce82..7c999e8 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -404,8 +404,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 		return -EBADF;
 
 	css = cgroup_css_from_dir(file, perf_subsys_id);
-	if (IS_ERR(css))
-		return PTR_ERR(css);
+	if (IS_ERR(css)) {
+		ret = PTR_ERR(css);
+		goto out;
+	}
 
 	cgrp = container_of(css, struct perf_cgroup, css);
 	event->cgrp = cgrp;
@@ -422,6 +424,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 		/* must be done before we fput() the file */
 		perf_get_cgroup(event);
 	}
+out:
 	fput_light(file, fput_needed);
 	return ret;
 }
-- 
cgit v0.10.2


From f75e18cb9627b1d3d752b83a0b5563da0042c50a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 3 Mar 2011 14:25:50 +0800
Subject: perf cgroup: Fix unmatched call to perf_detach_cgroup()

In the failure path, we call perf_detach_cgroup(), but we didn't
call perf_get_cgroup() prio to it.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4D6F346E.9070606@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7c999e8..b002095 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -412,6 +412,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 	cgrp = container_of(css, struct perf_cgroup, css);
 	event->cgrp = cgrp;
 
+	/* must be done before we fput() the file */
+	perf_get_cgroup(event);
+
 	/*
 	 * all events in a group must monitor
 	 * the same cgroup because a task belongs
@@ -420,9 +423,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 	if (group_leader && group_leader->cgrp != cgrp) {
 		perf_detach_cgroup(event);
 		ret = -EINVAL;
-	} else {
-		/* must be done before we fput() the file */
-		perf_get_cgroup(event);
 	}
 out:
 	fput_light(file, fput_needed);
-- 
cgit v0.10.2


From 1b15d0558e82df9b3659804ceb44187b98eda354 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 3 Mar 2011 14:26:06 +0800
Subject: perf cgroup: Clean up perf_cgroup_create()

- Use kzalloc() to replace kmalloc() + memset().

- Remove redundant initialization, since alloc_percpu() returns
  zero-filled percpu memory.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4D6F347E.2010806@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b002095..193b190 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -7346,26 +7346,17 @@ static struct cgroup_subsys_state *perf_cgroup_create(
 	struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct perf_cgroup *jc;
-	struct perf_cgroup_info *t;
-	int c;
 
-	jc = kmalloc(sizeof(*jc), GFP_KERNEL);
+	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
 	if (!jc)
 		return ERR_PTR(-ENOMEM);
 
-	memset(jc, 0, sizeof(*jc));
-
 	jc->info = alloc_percpu(struct perf_cgroup_info);
 	if (!jc->info) {
 		kfree(jc);
 		return ERR_PTR(-ENOMEM);
 	}
 
-	for_each_possible_cpu(c) {
-		t = per_cpu_ptr(jc->info, c);
-		t->time = 0;
-		t->timestamp = 0;
-	}
 	return &jc->css;
 }
 
-- 
cgit v0.10.2


From 2d0f25201ee210a0666ec9c41538ba05a07f8bc6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 3 Mar 2011 14:26:20 +0800
Subject: perf cgroup: Fix a typo in kernel config

s/specificied/specified

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4D6F348C.2050804@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/init/Kconfig b/init/Kconfig
index 20d6bd9..4c4edf2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -688,7 +688,7 @@ config CGROUP_PERF
 	depends on PERF_EVENTS && CGROUPS
 	help
 	  This option extends the per-cpu mode to restrict monitoring to
-	  threads which belong to the cgroup specificied and run on the
+	  threads which belong to the cgroup specified and run on the
 	  designated cpu.
 
 	  Say N if unsure.
-- 
cgit v0.10.2


From 08309379b7083a9ceec0f9bb96a629058fb623c4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 3 Mar 2011 11:31:20 +0100
Subject: perf: Fix cgroup vs jump_label problem

Li Zefan reported that the jump label code sleeps and we're calling it
under a spinlock, *fail* ;-)

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 193b190..ed253aa 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -820,16 +820,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
-	if (is_cgroup_event(event)) {
+	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
-		/*
-		 * one more event:
-		 * - that has cgroup constraint on event->cpu
-		 * - that may need work on context switch
-		 */
-		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
-		jump_label_inc(&perf_sched_events);
-	}
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
@@ -957,11 +949,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-	if (is_cgroup_event(event)) {
+	if (is_cgroup_event(event))
 		ctx->nr_cgroups--;
-		atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
-		jump_label_dec(&perf_sched_events);
-	}
 
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
@@ -2903,6 +2892,10 @@ static void free_event(struct perf_event *event)
 			atomic_dec(&nr_task_events);
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
 			put_callchain_buffers();
+		if (is_cgroup_event(event)) {
+			atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+			jump_label_dec(&perf_sched_events);
+		}
 	}
 
 	if (event->buffer) {
@@ -6478,6 +6471,13 @@ SYSCALL_DEFINE5(perf_event_open,
 		err = perf_cgroup_connect(pid, event, &attr, group_leader);
 		if (err)
 			goto err_alloc;
+		/*
+		 * one more event:
+		 * - that has cgroup constraint on event->cpu
+		 * - that may need work on context switch
+		 */
+		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_inc(&perf_sched_events);
 	}
 
 	/*
-- 
cgit v0.10.2


From 17e3162972cbb9796035fff1e2fd30669b0eef65 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 2 Mar 2011 17:05:01 +0200
Subject: perf_events: Update PEBS event constraints

This patch updates PEBS event constraints for Intel Atom, Nehalem, Westmere.

This patch also reorganizes the PEBS format/constraint detection code. It is
now based on processor model and not PEBS format. Two processors may use the
same PEBS format without have the same list of PEBS events.

In this second version, we simplified the initialization of the PEBS
constraints by leveraging the existing switch() statement in perf_event_intel.c.
We also renamed the constraint tables to be more consistent with regular
constraints.

In this 3rd version, we drop BR_INST_RETIRED.MISPRED from Intel Atom as it does
not seem to work. Use MISPREDICTED_BRANCH_RETIRED instead. Also add FP_ASSIST.*
o both Intel Nehalem and Westmere. I misssed those in the earlier patches.
Events were tested using libpfm4 perf_examples.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d6e6b02.815bdf0a.637b.07a7@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ba8aad1..c3ce053 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1137,6 +1137,7 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_core();
 
 		x86_pmu.event_constraints = intel_core2_event_constraints;
+		x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
 		pr_cont("Core2 events, ");
 		break;
 
@@ -1149,6 +1150,7 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
+		x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		pr_cont("Nehalem events, ");
 		break;
@@ -1160,6 +1162,7 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_atom();
 
 		x86_pmu.event_constraints = intel_gen_event_constraints;
+		x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
 		pr_cont("Atom events, ");
 		break;
 
@@ -1172,6 +1175,7 @@ static __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+		x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
 		pr_cont("Westmere events, ");
 		break;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 8251998..b95c66a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -361,30 +361,50 @@ static int intel_pmu_drain_bts_buffer(void)
 /*
  * PEBS
  */
-
-static struct event_constraint intel_core_pebs_events[] = {
-	PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
+static struct event_constraint intel_core2_pebs_event_constraints[] = {
+	PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
 	PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
 	PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
 	PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
-	PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
-	PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
-	PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
-	PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
-	PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
+	INTEL_EVENT_CONSTRAINT(0xcb, 0x1),  /* MEM_LOAD_RETIRED.* */
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint intel_nehalem_pebs_events[] = {
-	PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
-	PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
-	PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
-	PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */
-	PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
-	PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
-	PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
-	PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
-	PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
+static struct event_constraint intel_atom_pebs_event_constraints[] = {
+	PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
+	INTEL_EVENT_CONSTRAINT(0xcb, 0x1),  /* MEM_LOAD_RETIRED.* */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
+	INTEL_EVENT_CONSTRAINT(0x0b, 0xf),  /* MEM_INST_RETIRED.* */
+	INTEL_EVENT_CONSTRAINT(0x0f, 0xf),  /* MEM_UNCORE_RETIRED.* */
+	PEBS_EVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+	INTEL_EVENT_CONSTRAINT(0xc0, 0xf),  /* INST_RETIRED.ANY */
+	INTEL_EVENT_CONSTRAINT(0xc2, 0xf),  /* UOPS_RETIRED.* */
+	INTEL_EVENT_CONSTRAINT(0xc4, 0xf),  /* BR_INST_RETIRED.* */
+	PEBS_EVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
+	INTEL_EVENT_CONSTRAINT(0xc7, 0xf),  /* SSEX_UOPS_RETIRED.* */
+	PEBS_EVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+	INTEL_EVENT_CONSTRAINT(0xcb, 0xf),  /* MEM_LOAD_RETIRED.* */
+	INTEL_EVENT_CONSTRAINT(0xf7, 0xf),  /* FP_ASSIST.* */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_westmere_pebs_event_constraints[] = {
+	INTEL_EVENT_CONSTRAINT(0x0b, 0xf),  /* MEM_INST_RETIRED.* */
+	INTEL_EVENT_CONSTRAINT(0x0f, 0xf),  /* MEM_UNCORE_RETIRED.* */
+	PEBS_EVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+	INTEL_EVENT_CONSTRAINT(0xc0, 0xf),  /* INSTR_RETIRED.* */
+	INTEL_EVENT_CONSTRAINT(0xc2, 0xf),  /* UOPS_RETIRED.* */
+
+	INTEL_EVENT_CONSTRAINT(0xc4, 0xf),  /* BR_INST_RETIRED.* */
+	INTEL_EVENT_CONSTRAINT(0xc5, 0xf),  /* BR_MISP_RETIRED.* */
+	INTEL_EVENT_CONSTRAINT(0xc7, 0xf),  /* SSEX_UOPS_RETIRED.* */
+	PEBS_EVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+	INTEL_EVENT_CONSTRAINT(0xcb, 0xf),  /* MEM_LOAD_RETIRED.* */
+	INTEL_EVENT_CONSTRAINT(0xf7, 0xf),  /* FP_ASSIST.* */
 	EVENT_CONSTRAINT_END
 };
 
@@ -733,20 +753,17 @@ static void intel_ds_init(void)
 			printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
 			x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
-			x86_pmu.pebs_constraints = intel_core_pebs_events;
 			break;
 
 		case 1:
 			printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
 			x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-			x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
 			break;
 
 		default:
 			printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
 			x86_pmu.pebs = 0;
-			break;
 		}
 	}
 }
-- 
cgit v0.10.2


From a7e3ed1e470116c9d12c2f778431a481a6be8ab6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 3 Mar 2011 10:34:47 +0800
Subject: perf: Add support for supplementary event registers

Change logs against Andi's original version:

- Extends perf_event_attr:config to config{,1,2} (Peter Zijlstra)
- Fixed a major event scheduling issue. There cannot be a ref++ on an
  event that has already done ref++ once and without calling
  put_constraint() in between. (Stephane Eranian)
- Use thread_cpumask for percore allocation. (Lin Ming)
- Use MSR names in the extra reg lists. (Lin Ming)
- Remove redundant "c = NULL" in intel_percore_constraints
- Fix comment of perf_event_attr::config1

Intel Nehalem/Westmere have a special OFFCORE_RESPONSE event
that can be used to monitor any offcore accesses from a core.
This is a very useful event for various tunings, and it's
also needed to implement the generic LLC-* events correctly.

Unfortunately this event requires programming a mask in a separate
register. And worse this separate register is per core, not per
CPU thread.

This patch:

- Teaches perf_events that OFFCORE_RESPONSE needs extra parameters.
  The extra parameters are passed by user space in the
  perf_event_attr::config1 field.

- Adds support to the Intel perf_event core to schedule per
  core resources. This adds fairly generic infrastructure that
  can be also used for other per core resources.
  The basic code has is patterned after the similar AMD northbridge
  constraints code.

Thanks to Stephane Eranian who pointed out some problems
in the original version and suggested improvements.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1299119690-13991-2-git-send-email-ming.m.lin@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4d0dfa0..d25e74c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -47,6 +47,9 @@
 #define MSR_IA32_MCG_STATUS		0x0000017a
 #define MSR_IA32_MCG_CTL		0x0000017b
 
+#define MSR_OFFCORE_RSP_0		0x000001a6
+#define MSR_OFFCORE_RSP_1		0x000001a7
+
 #define MSR_IA32_PEBS_ENABLE		0x000003f1
 #define MSR_IA32_DS_AREA		0x00000600
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ea03c72..ec6a6db073 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -93,6 +93,8 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 
+struct intel_percore;
+
 #define MAX_LBR_ENTRIES		16
 
 struct cpu_hw_events {
@@ -128,6 +130,13 @@ struct cpu_hw_events {
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
 
 	/*
+	 * Intel percore register state.
+	 * Coordinate shared resources between HT threads.
+	 */
+	int				percore_used; /* Used by this CPU? */
+	struct intel_percore		*per_core;
+
+	/*
 	 * AMD specific bits
 	 */
 	struct amd_nb		*amd_nb;
@@ -177,6 +186,28 @@ struct cpu_hw_events {
 #define for_each_event_constraint(e, c)	\
 	for ((e) = (c); (e)->weight; (e)++)
 
+/*
+ * Extra registers for specific events.
+ * Some events need large masks and require external MSRs.
+ * Define a mapping to these extra registers.
+ */
+struct extra_reg {
+	unsigned int		event;
+	unsigned int		msr;
+	u64			config_mask;
+	u64			valid_mask;
+};
+
+#define EVENT_EXTRA_REG(e, ms, m, vm) {	\
+	.event = (e),		\
+	.msr = (ms),		\
+	.config_mask = (m),	\
+	.valid_mask = (vm),	\
+	}
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm)	\
+	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
+
 union perf_capabilities {
 	struct {
 		u64	lbr_format    : 6;
@@ -221,6 +252,7 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 	struct event_constraint *event_constraints;
+	struct event_constraint *percore_constraints;
 	void		(*quirks)(void);
 	int		perfctr_second_write;
 
@@ -249,6 +281,11 @@ struct x86_pmu {
 	 */
 	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
 	int		lbr_nr;			   /* hardware stack size */
+
+	/*
+	 * Extra registers for events
+	 */
+	struct extra_reg *extra_regs;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -341,6 +378,31 @@ static inline unsigned int x86_pmu_event_addr(int index)
 	return x86_pmu.perfctr + x86_pmu_addr_offset(index);
 }
 
+/*
+ * Find and validate any extra registers to set up.
+ */
+static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
+{
+	struct extra_reg *er;
+
+	event->hw.extra_reg = 0;
+	event->hw.extra_config = 0;
+
+	if (!x86_pmu.extra_regs)
+		return 0;
+
+	for (er = x86_pmu.extra_regs; er->msr; er++) {
+		if (er->event != (config & er->config_mask))
+			continue;
+		if (event->attr.config1 & ~er->valid_mask)
+			return -EINVAL;
+		event->hw.extra_reg = er->msr;
+		event->hw.extra_config = event->attr.config1;
+		break;
+	}
+	return 0;
+}
+
 static atomic_t active_events;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
@@ -665,6 +727,8 @@ static void x86_pmu_disable(struct pmu *pmu)
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
 {
+	if (hwc->extra_reg)
+		wrmsrl(hwc->extra_reg, hwc->extra_config);
 	wrmsrl(hwc->config_base, hwc->config | enable_mask);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c3ce053..13cb6cf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,5 +1,27 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
+#define MAX_EXTRA_REGS 2
+
+/*
+ * Per register state.
+ */
+struct er_account {
+	int			ref;		/* reference count */
+	unsigned int		extra_reg;	/* extra MSR number */
+	u64			extra_config;	/* extra MSR config */
+};
+
+/*
+ * Per core state
+ * This used to coordinate shared registers for HT threads.
+ */
+struct intel_percore {
+	raw_spinlock_t		lock;		/* protect structure */
+	struct er_account	regs[MAX_EXTRA_REGS];
+	int			refcnt;		/* number of threads */
+	unsigned		core_id;
+};
+
 /*
  * Intel PerfMon, used on Core and later.
  */
@@ -64,6 +86,18 @@ static struct event_constraint intel_nehalem_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
+static struct extra_reg intel_nehalem_extra_regs[] =
+{
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
+	EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_nehalem_percore_constraints[] =
+{
+	INTEL_EVENT_CONSTRAINT(0xb7, 0),
+	EVENT_CONSTRAINT_END
+};
+
 static struct event_constraint intel_westmere_event_constraints[] =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -89,6 +123,20 @@ static struct event_constraint intel_snb_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
+static struct extra_reg intel_westmere_extra_regs[] =
+{
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
+	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
+	EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_westmere_percore_constraints[] =
+{
+	INTEL_EVENT_CONSTRAINT(0xb7, 0),
+	INTEL_EVENT_CONSTRAINT(0xbb, 0),
+	EVENT_CONSTRAINT_END
+};
+
 static struct event_constraint intel_gen_event_constraints[] =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -907,6 +955,67 @@ intel_bts_constraints(struct perf_event *event)
 }
 
 static struct event_constraint *
+intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
+	struct event_constraint *c;
+	struct intel_percore *pc;
+	struct er_account *era;
+	int i;
+	int free_slot;
+	int found;
+
+	if (!x86_pmu.percore_constraints || hwc->extra_alloc)
+		return NULL;
+
+	for (c = x86_pmu.percore_constraints; c->cmask; c++) {
+		if (e != c->code)
+			continue;
+
+		/*
+		 * Allocate resource per core.
+		 */
+		pc = cpuc->per_core;
+		if (!pc)
+			break;
+		c = &emptyconstraint;
+		raw_spin_lock(&pc->lock);
+		free_slot = -1;
+		found = 0;
+		for (i = 0; i < MAX_EXTRA_REGS; i++) {
+			era = &pc->regs[i];
+			if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
+				/* Allow sharing same config */
+				if (hwc->extra_config == era->extra_config) {
+					era->ref++;
+					cpuc->percore_used = 1;
+					hwc->extra_alloc = 1;
+					c = NULL;
+				}
+				/* else conflict */
+				found = 1;
+				break;
+			} else if (era->ref == 0 && free_slot == -1)
+				free_slot = i;
+		}
+		if (!found && free_slot != -1) {
+			era = &pc->regs[free_slot];
+			era->ref = 1;
+			era->extra_reg = hwc->extra_reg;
+			era->extra_config = hwc->extra_config;
+			cpuc->percore_used = 1;
+			hwc->extra_alloc = 1;
+			c = NULL;
+		}
+		raw_spin_unlock(&pc->lock);
+		return c;
+	}
+
+	return NULL;
+}
+
+static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 {
 	struct event_constraint *c;
@@ -919,9 +1028,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	if (c)
 		return c;
 
+	c = intel_percore_constraints(cpuc, event);
+	if (c)
+		return c;
+
 	return x86_get_event_constraints(cpuc, event);
 }
 
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+					struct perf_event *event)
+{
+	struct extra_reg *er;
+	struct intel_percore *pc;
+	struct er_account *era;
+	struct hw_perf_event *hwc = &event->hw;
+	int i, allref;
+
+	if (!cpuc->percore_used)
+		return;
+
+	for (er = x86_pmu.extra_regs; er->msr; er++) {
+		if (er->event != (hwc->config & er->config_mask))
+			continue;
+
+		pc = cpuc->per_core;
+		raw_spin_lock(&pc->lock);
+		for (i = 0; i < MAX_EXTRA_REGS; i++) {
+			era = &pc->regs[i];
+			if (era->ref > 0 &&
+			    era->extra_config == hwc->extra_config &&
+			    era->extra_reg == er->msr) {
+				era->ref--;
+				hwc->extra_alloc = 0;
+				break;
+			}
+		}
+		allref = 0;
+		for (i = 0; i < MAX_EXTRA_REGS; i++)
+			allref += pc->regs[i].ref;
+		if (allref == 0)
+			cpuc->percore_used = 0;
+		raw_spin_unlock(&pc->lock);
+		break;
+	}
+}
+
 static int intel_pmu_hw_config(struct perf_event *event)
 {
 	int ret = x86_pmu_hw_config(event);
@@ -993,11 +1144,43 @@ static __initconst const struct x86_pmu core_pmu = {
 	 */
 	.max_period		= (1ULL << 31) - 1,
 	.get_event_constraints	= intel_get_event_constraints,
+	.put_event_constraints	= intel_put_event_constraints,
 	.event_constraints	= intel_core_event_constraints,
 };
 
+static int intel_pmu_cpu_prepare(int cpu)
+{
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+	cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
+				      GFP_KERNEL, cpu_to_node(cpu));
+	if (!cpuc->per_core)
+		return NOTIFY_BAD;
+
+	raw_spin_lock_init(&cpuc->per_core->lock);
+	cpuc->per_core->core_id = -1;
+	return NOTIFY_OK;
+}
+
 static void intel_pmu_cpu_starting(int cpu)
 {
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	int core_id = topology_core_id(cpu);
+	int i;
+
+	for_each_cpu(i, topology_thread_cpumask(cpu)) {
+		struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
+
+		if (pc && pc->core_id == core_id) {
+			kfree(cpuc->per_core);
+			cpuc->per_core = pc;
+			break;
+		}
+	}
+
+	cpuc->per_core->core_id = core_id;
+	cpuc->per_core->refcnt++;
+
 	init_debug_store_on_cpu(cpu);
 	/*
 	 * Deal with CPUs that don't clear their LBRs on power-up.
@@ -1007,6 +1190,15 @@ static void intel_pmu_cpu_starting(int cpu)
 
 static void intel_pmu_cpu_dying(int cpu)
 {
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	struct intel_percore *pc = cpuc->per_core;
+
+	if (pc) {
+		if (pc->core_id == -1 || --pc->refcnt == 0)
+			kfree(pc);
+		cpuc->per_core = NULL;
+	}
+
 	fini_debug_store_on_cpu(cpu);
 }
 
@@ -1031,7 +1223,9 @@ static __initconst const struct x86_pmu intel_pmu = {
 	 */
 	.max_period		= (1ULL << 31) - 1,
 	.get_event_constraints	= intel_get_event_constraints,
+	.put_event_constraints	= intel_put_event_constraints,
 
+	.cpu_prepare		= intel_pmu_cpu_prepare,
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
 };
@@ -1151,7 +1345,9 @@ static __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
 		x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
+		x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+		x86_pmu.extra_regs = intel_nehalem_extra_regs;
 		pr_cont("Nehalem events, ");
 		break;
 
@@ -1174,8 +1370,10 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
+		x86_pmu.percore_constraints = intel_westmere_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_westmere_extra_regs;
 		pr_cont("Westmere events, ");
 		break;
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8ceb5a6..614615b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -225,8 +225,14 @@ struct perf_event_attr {
 	};
 
 	__u32			bp_type;
-	__u64			bp_addr;
-	__u64			bp_len;
+	union {
+		__u64		bp_addr;
+		__u64		config1; /* extension of config */
+	};
+	union {
+		__u64		bp_len;
+		__u64		config2; /* extension of config1 */
+	};
 };
 
 /*
@@ -541,6 +547,9 @@ struct hw_perf_event {
 			unsigned long	event_base;
 			int		idx;
 			int		last_cpu;
+			unsigned int	extra_reg;
+			u64		extra_config;
+			int		extra_alloc;
 		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
-- 
cgit v0.10.2


From e994d7d23a0bae34cd28834e85522ed4e782faf7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 3 Mar 2011 10:34:48 +0800
Subject: perf: Fix LLC-* events on Intel Nehalem/Westmere

On Intel Nehalem and Westmere CPUs the generic perf LLC-* events count the
L2 caches, not the real L3 LLC - this was inconsistent with behavior on
other CPUs.

Fixing this requires the use of the special OFFCORE_RESPONSE
events which need a separate mask register.

This has been implemented by the previous patch, now use this infrastructure
to set correct events for the LLC-* on Nehalem and Westmere.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1299119690-13991-3-git-send-email-ming.m.lin@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ec6a6db073..4d6ce5d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -310,6 +310,10 @@ static u64 __read_mostly hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX];
+static u64 __read_mostly hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX];
 
 /*
  * Propagate event elapsed time into the generic event.
@@ -524,8 +528,9 @@ static inline int x86_pmu_initialized(void)
 }
 
 static inline int
-set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 {
+	struct perf_event_attr *attr = &event->attr;
 	unsigned int cache_type, cache_op, cache_result;
 	u64 config, val;
 
@@ -552,8 +557,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 		return -EINVAL;
 
 	hwc->config |= val;
-
-	return 0;
+	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+	return x86_pmu_extra_regs(val, event);
 }
 
 static int x86_setup_perfctr(struct perf_event *event)
@@ -578,10 +583,10 @@ static int x86_setup_perfctr(struct perf_event *event)
 	}
 
 	if (attr->type == PERF_TYPE_RAW)
-		return 0;
+		return x86_pmu_extra_regs(event->attr.config, event);
 
 	if (attr->type == PERF_TYPE_HW_CACHE)
-		return set_ext_hw_attr(hwc, attr);
+		return set_ext_hw_attr(hwc, event);
 
 	if (attr->config >= x86_pmu.max_events)
 		return -EINVAL;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 13cb6cf..6e9b676 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -285,16 +285,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
-		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+		/* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		/* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01bb,
 	},
+	/*
+	 * Use RFO, not WRITEBACK, because a write miss would typically occur
+	 * on RFO.
+	 */
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
-		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+		/* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01bb,
+		/* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
-		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
+		/* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		/* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01bb,
 	},
  },
  [ C(DTLB) ] = {
@@ -341,6 +351,39 @@ static __initconst const u64 westmere_hw_cache_event_ids
  },
 };
 
+/*
+ * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3
+ */
+
+#define DMND_DATA_RD     (1 << 0)
+#define DMND_RFO         (1 << 1)
+#define DMND_WB          (1 << 3)
+#define PF_DATA_RD       (1 << 4)
+#define PF_DATA_RFO      (1 << 5)
+#define RESP_UNCORE_HIT  (1 << 8)
+#define RESP_MISS        (0xf600) /* non uncore hit */
+
+static __initconst const u64 nehalem_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT,
+		[ C(RESULT_MISS)   ] = DMND_DATA_RD|RESP_MISS,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT,
+		[ C(RESULT_MISS)   ] = DMND_RFO|DMND_WB|RESP_MISS,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT,
+		[ C(RESULT_MISS)   ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS,
+	},
+ }
+};
+
 static __initconst const u64 nehalem_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -376,16 +419,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
-		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+		/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
+	/*
+	 * Use RFO, not WRITEBACK, because a write miss would typically occur
+	 * on RFO.
+	 */
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
-		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+		/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
-		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
+		/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
  },
  [ C(DTLB) ] = {
@@ -1340,6 +1393,8 @@ static __init int intel_pmu_init(void)
 	case 46: /* 45 nm nehalem-ex, "Beckton" */
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+		       sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_nhm();
 
@@ -1366,6 +1421,8 @@ static __init int intel_pmu_init(void)
 	case 44: /* 32 nm nehalem, "Gulftown" */
 		memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+		       sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_nhm();
 
-- 
cgit v0.10.2


From 51b361b4009f4e19ae68d2bcbb35e254e91b6054 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 4 Mar 2011 14:49:28 +0100
Subject: x86-64, NUMA: Fix numa_emulation code with node0 without RAM

On one system that does not have RAM on node0.

When numa_emulation is compiled in, and
1. boot system without numa=fake...
2. or boot system with numa=fake=128 to make emulation fail

will get:

[    0.092026] ------------[ cut here ]------------
[    0.096005] kernel BUG at arch/x86/mm/numa_emulation.c:439!
[    0.096005] invalid opcode: 0000 [#1] SMP
[    0.096005] last sysfs file:
[    0.096005] CPU 0
[    0.096005] Modules linked in:
[    0.096005]
[    0.096005] Pid: 0, comm: swapper Not tainted 2.6.38-rc6-tip-yh-03869-gcb0491d-dirty #684 Sun Microsystems     Sun Fire X4240/Sun Fire X4240
[    0.096005] RIP: 0010:[<ffffffff81cdc65b>]  [<ffffffff81cdc65b>] numa_add_cpu+0x56/0xcf
[    0.096005] RSP: 0000:ffffffff82437ed8  EFLAGS: 00010246
...
[    0.096005] Call Trace:
[    0.096005]  [<ffffffff81cd7931>] identify_cpu+0x2d7/0x2df
[    0.096005]  [<ffffffff827e54fa>] identify_boot_cpu+0x10/0x30
[    0.096005]  [<ffffffff827e5704>] check_bugs+0x9/0x2d
[    0.096005]  [<ffffffff827dceda>] start_kernel+0x3d7/0x3f1
[    0.096005]  [<ffffffff827dc2cc>] x86_64_start_reservations+0x9c/0xa0
[    0.096005]  [<ffffffff827dc4ad>] x86_64_start_kernel+0x1dd/0x1e8
[    0.096005] Code: 74 06 48 8d 04 90 eb 0f 48 c7 c0 30 d9 00 00 48 03 04 d5 90 0f 60 82 8b 00 83 f8 ff 74 0d 0f a3 05 8b 7e 92 00 19 d2 85 d2 75 02 <0f> 0b 48 98 be 00 01 00 00 48 c7 c7 e0 44 60 82 44 8b 2c 85 e0
[    0.096005] RIP  [<ffffffff81cdc65b>] numa_add_cpu+0x56/0xcf
[    0.096005]  RSP <ffffffff82437ed8>
[    0.096026] ---[ end trace a7919e7f17c0a725 ]---

We need to use early_cpu_to_node() directly, because numa_cpu_node()
will return node0 that is not onlined.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index aeecea9..75b31dc 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -417,9 +417,7 @@ void __cpuinit numa_add_cpu(int cpu)
 {
 	int physnid, nid;
 
-	nid = numa_cpu_node(cpu);
-	if (nid == NUMA_NO_NODE)
-		nid = early_cpu_to_node(cpu);
+	nid = early_cpu_to_node(cpu);
 	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
 
 	physnid = emu_nid_to_phys[nid];
-- 
cgit v0.10.2


From c09cedf4f75f1e47ea17f55e18e9cfb81bec8575 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Fri, 4 Mar 2011 15:17:21 +0100
Subject: x86-64, NUMA: Clean up initmem_init()

This patch cleans initmem_init() so that it is more readable and doesn't
use an unnecessary array of function pointers to convolute the flow of
the code.  It also makes it obvious that dummy_numa_init() will always
succeed (and documents that requirement) so that the existing BUG() is
never actually reached.

No functional change.

-tj: Updated comment for dummy_numa_init() slightly.

Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 86491ba..9ec0f20 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -562,6 +562,15 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 	return 0;
 }
 
+/**
+ * dummy_numma_init - Fallback dummy NUMA init
+ *
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
+ *
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory.  This function must not fail.
+ */
 static int __init dummy_numa_init(void)
 {
 	printk(KERN_INFO "%s\n",
@@ -575,57 +584,64 @@ static int __init dummy_numa_init(void)
 	return 0;
 }
 
-void __init initmem_init(void)
+static int __init numa_init(int (*init_func)(void))
 {
-	int (*numa_init[])(void) = { [2] = dummy_numa_init };
-	int i, j;
-
-	if (!numa_off) {
-#ifdef CONFIG_ACPI_NUMA
-		numa_init[0] = x86_acpi_numa_init;
-#endif
-#ifdef CONFIG_AMD_NUMA
-		numa_init[1] = amd_numa_init;
-#endif
-	}
+	int i;
+	int ret;
 
-	for (i = 0; i < ARRAY_SIZE(numa_init); i++) {
-		if (!numa_init[i])
-			continue;
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		set_apicid_to_node(i, NUMA_NO_NODE);
 
-		for (j = 0; j < MAX_LOCAL_APIC; j++)
-			set_apicid_to_node(j, NUMA_NO_NODE);
+	nodes_clear(numa_nodes_parsed);
+	nodes_clear(node_possible_map);
+	nodes_clear(node_online_map);
+	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+	remove_all_active_ranges();
+	numa_reset_distance();
 
-		nodes_clear(numa_nodes_parsed);
-		nodes_clear(node_possible_map);
-		nodes_clear(node_online_map);
-		memset(&numa_meminfo, 0, sizeof(numa_meminfo));
-		remove_all_active_ranges();
-		numa_reset_distance();
+	ret = init_func();
+	if (ret < 0)
+		return ret;
+	ret = numa_cleanup_meminfo(&numa_meminfo);
+	if (ret < 0)
+		return ret;
 
-		if (numa_init[i]() < 0)
-			continue;
+	numa_emulation(&numa_meminfo, numa_distance_cnt);
 
-		if (numa_cleanup_meminfo(&numa_meminfo) < 0)
-			continue;
+	ret = numa_register_memblks(&numa_meminfo);
+	if (ret < 0)
+		return ret;
 
-		numa_emulation(&numa_meminfo, numa_distance_cnt);
+	for (i = 0; i < nr_cpu_ids; i++) {
+		int nid = early_cpu_to_node(i);
 
-		if (numa_register_memblks(&numa_meminfo) < 0)
+		if (nid == NUMA_NO_NODE)
 			continue;
+		if (!node_online(nid))
+			numa_clear_node(i);
+	}
+	numa_init_array();
+	return 0;
+}
 
-		for (j = 0; j < nr_cpu_ids; j++) {
-			int nid = early_cpu_to_node(j);
+void __init initmem_init(void)
+{
+	int ret;
 
-			if (nid == NUMA_NO_NODE)
-				continue;
-			if (!node_online(nid))
-				numa_clear_node(j);
-		}
-		numa_init_array();
-		return;
+	if (!numa_off) {
+#ifdef CONFIG_ACPI_NUMA
+		ret = numa_init(x86_acpi_numa_init);
+		if (!ret)
+			return;
+#endif
+#ifdef CONFIG_AMD_NUMA
+		ret = numa_init(amd_numa_init);
+		if (!ret)
+			return;
+#endif
 	}
-	BUG();
+
+	numa_init(dummy_numa_init);
 }
 
 unsigned long __init numa_free_all_bootmem(void)
-- 
cgit v0.10.2


From 078a198906c796981f93ff100c210506e91aade5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 4 Mar 2011 16:32:02 +0100
Subject: x86-64, NUMA: Don't assume phys node 0 is always online in
 numa_emulation()

Undetermined entries in emu_nid_to_phys[] are filled with zero
assuming that physical node 0 is always online; however, this might
not be true depending on hardware configuration.  Find a physical node
which is actually online and use it instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1103020628210.31626@chino.kir.corp.google.com>

diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 75b31dc..3696be0 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -301,6 +301,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	const u64 max_addr = max_pfn << PAGE_SHIFT;
 	u8 *phys_dist = NULL;
 	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
+	int dfl_phys_nid;
 	int i, j, ret;
 
 	if (!emu_cmdline)
@@ -357,6 +358,19 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 					node_distance(i, j);
 	}
 
+	/* determine the default phys nid to use for unmapped nodes */
+	dfl_phys_nid = NUMA_NO_NODE;
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+			dfl_phys_nid = emu_nid_to_phys[i];
+			break;
+		}
+	}
+	if (dfl_phys_nid == NUMA_NO_NODE) {
+		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
+		goto no_emu;
+	}
+
 	/* commit */
 	*numa_meminfo = ei;
 
@@ -377,7 +391,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	/* make sure all emulated nodes are mapped to a physical node */
 	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
 		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
-			emu_nid_to_phys[i] = 0;
+			emu_nid_to_phys[i] = dfl_phys_nid;
 
 	/*
 	 * Transform distance table.  numa_set_distance() ignores all
-- 
cgit v0.10.2


From ba74f4d7e5125d04d453b4af69c53c533e6feb80 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 9 Jan 2011 18:09:51 -0800
Subject: rcu: call __rcu_read_unlock() in exit_rcu for tiny RCU

Using __rcu_read_lock() in place of rcu_read_lock() leaves any debug
state as it really should be, namely with the lock still held.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abae..3cb8e36 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -852,7 +852,7 @@ void exit_rcu(void)
 	if (t->rcu_read_lock_nesting == 0)
 		return;
 	t->rcu_read_lock_nesting = 1;
-	rcu_read_unlock();
+	__rcu_read_unlock();
 }
 
 #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
-- 
cgit v0.10.2


From 37743de384951ef97c040e69039820c987849501 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Mon, 10 Jan 2011 21:43:13 +0100
Subject: rcutorture: Get rid of duplicate sched.h include

linux/sched.h is included twice in kernel/rcutorture.c - once is enough.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f9..c224da4 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <asm/byteorder.h>
-#include <linux/sched.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
-- 
cgit v0.10.2


From fea651267e52a88e7b81e01b6717d968254b6ddb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 23 Jan 2011 22:35:45 -0800
Subject: rcu: add documentation saying which RCU flavor to choose

Reported-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index cfaac34..6ef6926 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -849,6 +849,37 @@ All:  lockdep-checked RCU-protected pointer access
 See the comment headers in the source code (or the docbook generated
 from them) for more information.
 
+However, given that there are no fewer than four families of RCU APIs
+in the Linux kernel, how do you choose which one to use?  The following
+list can be helpful:
+
+a.	Will readers need to block?  If so, you need SRCU.
+
+b.	What about the -rt patchset?  If readers would need to block
+	in an non-rt kernel, you need SRCU.  If readers would block
+	in a -rt kernel, but not in a non-rt kernel, SRCU is not
+	necessary.
+
+c.	Do you need to treat NMI handlers, hardirq handlers,
+	and code segments with preemption disabled (whether
+	via preempt_disable(), local_irq_save(), local_bh_disable(),
+	or some other mechanism) as if they were explicit RCU readers?
+	If so, you need RCU-sched.
+
+d.	Do you need RCU grace periods to complete even in the face
+	of softirq monopolization of one or more of the CPUs?  For
+	example, is your code subject to network-based denial-of-service
+	attacks?  If so, you need RCU-bh.
+
+e.	Is your workload too update-intensive for normal use of
+	RCU, but inappropriate for other synchronization mechanisms?
+	If so, consider SLAB_DESTROY_BY_RCU.  But please be careful!
+
+f.	Otherwise, use RCU.
+
+Of course, this all assumes that you have determined that RCU is in fact
+the right tool for your job.
+
 
 8.  ANSWERS TO QUICK QUIZZES
 
-- 
cgit v0.10.2


From fe8e64071a4ff5925ced4f33cb32ba090a04649c Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Sun, 30 Jan 2011 18:23:08 +0800
Subject: rcupdate: remove dead code

DEBUG_OBJECTS_RCU_HEAD depends on PREEMPT, so #ifndef CONFIG_PREEMPT
is totally useless in kernel/rcupdate.c.

Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a..afd21d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -215,10 +215,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
 		 * If we detect that we are nested in a RCU read-side critical
 		 * section, we should simply fail, otherwise we would deadlock.
 		 */
-#ifndef CONFIG_PREEMPT
-		WARN_ON(1);
-		return 0;
-#else
 		if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
 		    irqs_disabled()) {
 			WARN_ON(1);
@@ -229,7 +225,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
 		rcu_barrier_bh();
 		debug_object_free(head, &rcuhead_debug_descr);
 		return 1;
-#endif
 	default:
 		return 0;
 	}
-- 
cgit v0.10.2


From e611eecd6f9f91d74beda8cfbb3b5e02abdeb5a1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 30 Jan 2011 23:56:46 -0800
Subject: rcu: add comment saying why DEBUG_OBJECTS_RCU_HEAD depends on
 PREEMPT.

The build will break if you change the Kconfig to allow
DEBUG_OBJECTS_RCU_HEAD and !PREEMPT, so document the reasoning
near where the breakage would occur.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index afd21d1..f3240e9 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -214,6 +214,11 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
 		 * Ensure that queued callbacks are all executed.
 		 * If we detect that we are nested in a RCU read-side critical
 		 * section, we should simply fail, otherwise we would deadlock.
+		 * Note that the machinery to reliably determine whether
+		 * or not we are in an RCU read-side critical section
+		 * exists only in the preemptible RCU implementations
+		 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
+		 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
 		 */
 		if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
 		    irqs_disabled()) {
-- 
cgit v0.10.2


From 241e6663b5151733294d1a230a3fd8a4d32e187f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 10 Feb 2011 16:54:50 -0800
Subject: smp: Document transitivity for memory barriers.

Transitivity is guaranteed only for full memory barriers (smp_mb()).

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 631ad2f..f0d3a80 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -21,6 +21,7 @@ Contents:
      - SMP barrier pairing.
      - Examples of memory barrier sequences.
      - Read memory barriers vs load speculation.
+     - Transitivity
 
  (*) Explicit kernel barriers.
 
@@ -959,6 +960,63 @@ the speculation will be cancelled and the value reloaded:
 	retrieved                               :       :       +-------+
 
 
+TRANSITIVITY
+------------
+
+Transitivity is a deeply intuitive notion about ordering that is not
+always provided by real computer systems.  The following example
+demonstrates transitivity (also called "cumulativity"):
+
+	CPU 1			CPU 2			CPU 3
+	=======================	=======================	=======================
+		{ X = 0, Y = 0 }
+	STORE X=1		LOAD X			STORE Y=1
+				<general barrier>	<general barrier>
+				LOAD Y			LOAD X
+
+Suppose that CPU 2's load from X returns 1 and its load from Y returns 0.
+This indicates that CPU 2's load from X in some sense follows CPU 1's
+store to X and that CPU 2's load from Y in some sense preceded CPU 3's
+store to Y.  The question is then "Can CPU 3's load from X return 0?"
+
+Because CPU 2's load from X in some sense came after CPU 1's store, it
+is natural to expect that CPU 3's load from X must therefore return 1.
+This expectation is an example of transitivity: if a load executing on
+CPU A follows a load from the same variable executing on CPU B, then
+CPU A's load must either return the same value that CPU B's load did,
+or must return some later value.
+
+In the Linux kernel, use of general memory barriers guarantees
+transitivity.  Therefore, in the above example, if CPU 2's load from X
+returns 1 and its load from Y returns 0, then CPU 3's load from X must
+also return 1.
+
+However, transitivity is -not- guaranteed for read or write barriers.
+For example, suppose that CPU 2's general barrier in the above example
+is changed to a read barrier as shown below:
+
+	CPU 1			CPU 2			CPU 3
+	=======================	=======================	=======================
+		{ X = 0, Y = 0 }
+	STORE X=1		LOAD X			STORE Y=1
+				<read barrier>		<general barrier>
+				LOAD Y			LOAD X
+
+This substitution destroys transitivity: in this example, it is perfectly
+legal for CPU 2's load from X to return 1, its load from Y to return 0,
+and CPU 3's load from X to return 0.
+
+The key point is that although CPU 2's read barrier orders its pair
+of loads, it does not guarantee to order CPU 1's store.  Therefore, if
+this example runs on a system where CPUs 1 and 2 share a store buffer
+or a level of cache, CPU 2 might have early access to CPU 1's writes.
+General barriers are therefore required to ensure that all CPUs agree
+on the combined order of CPU 1's and CPU 2's accesses.
+
+To reiterate, if your code requires transitivity, use general barriers
+throughout.
+
+
 ========================
 EXPLICIT KERNEL BARRIERS
 ========================
-- 
cgit v0.10.2


From 6909262429b70a162e9e7053672cfd8024c9275d Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Thu, 3 Mar 2011 10:34:50 +0800
Subject: perf: Avoid the percore allocations if the CPU is not HT capable

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1299119690-13991-5-git-send-email-ming.m.lin@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1f46951..c1bbfa8 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -17,10 +17,20 @@
 #endif
 #include <asm/thread_info.h>
 #include <asm/cpumask.h>
+#include <asm/cpufeature.h>
 
 extern int smp_num_siblings;
 extern unsigned int num_processors;
 
+static inline bool cpu_has_ht_siblings(void)
+{
+	bool has_siblings = false;
+#ifdef CONFIG_SMP
+	has_siblings = cpu_has_ht && smp_num_siblings > 1;
+#endif
+	return has_siblings;
+}
+
 DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
 DECLARE_PER_CPU(u16, cpu_llc_id);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4d6ce5d..2660418 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -30,6 +30,7 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 #include <asm/compat.h>
+#include <asm/smp.h>
 
 #if 0
 #undef wrmsrl
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 6e9b676..8fc2b2c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1205,6 +1205,9 @@ static int intel_pmu_cpu_prepare(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
+	if (!cpu_has_ht_siblings())
+		return NOTIFY_OK;
+
 	cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
 				      GFP_KERNEL, cpu_to_node(cpu));
 	if (!cpuc->per_core)
@@ -1221,6 +1224,15 @@ static void intel_pmu_cpu_starting(int cpu)
 	int core_id = topology_core_id(cpu);
 	int i;
 
+	init_debug_store_on_cpu(cpu);
+	/*
+	 * Deal with CPUs that don't clear their LBRs on power-up.
+	 */
+	intel_pmu_lbr_reset();
+
+	if (!cpu_has_ht_siblings())
+		return;
+
 	for_each_cpu(i, topology_thread_cpumask(cpu)) {
 		struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
 
@@ -1233,12 +1245,6 @@ static void intel_pmu_cpu_starting(int cpu)
 
 	cpuc->per_core->core_id = core_id;
 	cpuc->per_core->refcnt++;
-
-	init_debug_store_on_cpu(cpu);
-	/*
-	 * Deal with CPUs that don't clear their LBRs on power-up.
-	 */
-	intel_pmu_lbr_reset();
 }
 
 static void intel_pmu_cpu_dying(int cpu)
-- 
cgit v0.10.2


From ac23f25355ef53f3d14352fcff3c6817527a9749 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Fri, 4 Mar 2011 15:52:35 +0000
Subject: x86: Really print supported CPUs if PROCESSOR_SELECT=y

I'm sure it was a mere oversight that the CONFIG_ prefixes are
missing.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Dave Jones <davej@redhat.com>
LKML-Reference: <4D7118D30200007800034F79@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1d59834..5d98c46 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -675,7 +675,7 @@ void __init early_cpu_init(void)
 	const struct cpu_dev *const *cdev;
 	int count = 0;
 
-#ifdef PROCESSOR_SELECT
+#ifdef CONFIG_PROCESSOR_SELECT
 	printk(KERN_INFO "KERNEL supported cpus:\n");
 #endif
 
@@ -687,7 +687,7 @@ void __init early_cpu_init(void)
 		cpu_devs[count] = cpudev;
 		count++;
 
-#ifdef PROCESSOR_SELECT
+#ifdef CONFIG_PROCESSOR_SELECT
 		{
 			unsigned int j;
 
-- 
cgit v0.10.2


From a03f35ceeb3d279da35c5a914ac01a4b1effb0a1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 3 Mar 2011 16:43:03 -0300
Subject: perf report tui: Fix multi event switching

TAB/UNTAB were not hotkeys, so didn't exit hists__browse back to
hists__tui_browse_tree, allowing just the first event to be browsed.

Reported-by: William Cohen <wcohen@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: William Cohen <wcohen@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index 497b3c4..c27ab35 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -292,7 +292,8 @@ static int hist_browser__run(struct hist_browser *self, const char *title)
 {
 	int key;
 	int exit_keys[] = { 'a', '?', 'h', 'C', 'd', 'D', 'E', 't',
-			    NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT, 0, };
+			    NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT,
+			    NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0, };
 
 	self->b.entries = &self->hists->entries;
 	self->b.nr_entries = self->hists->nr_entries;
@@ -859,6 +860,7 @@ int hists__browse(struct hists *self, const char *helpline,
 					"E         Expand all callchains\n"
 					"d         Zoom into current DSO\n"
 					"t         Zoom into current Thread\n"
+					"TAB/UNTAB Switch events\n"
 					"q/CTRL+C  Exit browser");
 			continue;
 		case NEWT_KEY_ENTER:
@@ -997,6 +999,7 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx)
 			if (nd == first)
 				continue;
 			nd = rb_prev(nd);
+			break;
 		default:
 			return key;
 		}
-- 
cgit v0.10.2


From d7603d5122d9700fb8f36fa08b04f4e900fef059 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 4 Mar 2011 14:51:33 -0300
Subject: perf hists: Remove needless global col lenght calcs

To support multiple events we need to do these calcs per 'struct hists'
instance, and it turns out we already do that at:

	__hists__add_entry
		hists__inc_nr_entries
			hists__calc_col_len

for all the unfiltered hist_entry instances we stash in the rb tree, so
trow away the dead code.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index fbf5754..2b15c36 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -424,33 +424,6 @@ int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
 	return err;
 }
 
-static void thread__comm_adjust(struct thread *self, struct hists *hists)
-{
-	char *comm = self->comm;
-
-	if (!symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
-	    (!symbol_conf.comm_list ||
-	     strlist__has_entry(symbol_conf.comm_list, comm))) {
-		u16 slen = strlen(comm);
-
-		if (hists__new_col_len(hists, HISTC_COMM, slen))
-			hists__set_col_len(hists, HISTC_THREAD, slen + 6);
-	}
-}
-
-static int thread__set_comm_adjust(struct thread *self, const char *comm,
-				   struct hists *hists)
-{
-	int ret = thread__set_comm(self, comm);
-
-	if (ret)
-		return ret;
-
-	thread__comm_adjust(self, hists);
-
-	return 0;
-}
-
 int perf_event__process_comm(union perf_event *event,
 			     struct perf_sample *sample __used,
 			     struct perf_session *session)
@@ -459,8 +432,7 @@ int perf_event__process_comm(union perf_event *event,
 
 	dump_printf(": %s:%d\n", event->comm.comm, event->comm.tid);
 
-	if (thread == NULL || thread__set_comm_adjust(thread, event->comm.comm,
-						      &session->hists)) {
+	if (thread == NULL || thread__set_comm(thread, event->comm.comm)) {
 		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
 		return -1;
 	}
@@ -760,18 +732,6 @@ void thread__find_addr_location(struct thread *self,
 		al->sym = NULL;
 }
 
-static void dso__calc_col_width(struct dso *self, struct hists *hists)
-{
-	if (!symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
-	    (!symbol_conf.dso_list ||
-	     strlist__has_entry(symbol_conf.dso_list, self->name))) {
-		u16 slen = dso__name_len(self);
-		hists__new_col_len(hists, HISTC_DSO, slen);
-	}
-
-	self->slen_calculated = 1;
-}
-
 int perf_event__preprocess_sample(const union perf_event *event,
 				  struct perf_session *session,
 				  struct addr_location *al,
@@ -817,23 +777,8 @@ int perf_event__preprocess_sample(const union perf_event *event,
 			strlist__has_entry(symbol_conf.dso_list,
 					   al->map->dso->long_name)))))
 			goto out_filtered;
-		/*
-		 * We have to do this here as we may have a dso with no symbol
-		 * hit that has a name longer than the ones with symbols
-		 * sampled.
-		 */
-		if (!sort_dso.elide && !al->map->dso->slen_calculated)
-			dso__calc_col_width(al->map->dso, &session->hists);
 
 		al->sym = map__find_symbol(al->map, al->addr, filter);
-	} else {
-		const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
-
-		if (hists__col_len(&session->hists, HISTC_DSO) < unresolved_col_width &&
-		    !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
-		    !symbol_conf.dso_list)
-			hists__set_col_len(&session->hists, HISTC_DSO,
-					   unresolved_col_width);
 	}
 
 	if (symbol_conf.sym_list && al->sym &&
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index da2899e8..f7ad6bd 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -50,6 +50,15 @@ static void hists__calc_col_len(struct hists *self, struct hist_entry *h)
 
 	if (h->ms.sym)
 		hists__new_col_len(self, HISTC_SYMBOL, h->ms.sym->namelen);
+	else {
+		const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
+
+		if (hists__col_len(self, HISTC_DSO) < unresolved_col_width &&
+		    !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
+		    !symbol_conf.dso_list)
+			hists__set_col_len(self, HISTC_DSO,
+					   unresolved_col_width);
+	}
 
 	len = thread__comm_len(h->thread);
 	if (hists__new_col_len(self, HISTC_COMM, len))
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index ba6d489..00014e3 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -207,7 +207,6 @@ struct dso *dso__new(const char *name)
 		dso__set_short_name(self, self->name);
 		for (i = 0; i < MAP__NR_TYPES; ++i)
 			self->symbols[i] = self->symbol_names[i] = RB_ROOT;
-		self->slen_calculated = 0;
 		self->origin = DSO__ORIG_NOT_FOUND;
 		self->loaded = 0;
 		self->sorted_by_name = 0;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 670cd1c..4d7ed09 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -132,7 +132,6 @@ struct dso {
 	struct rb_root	 symbol_names[MAP__NR_TYPES];
 	enum dso_kernel_type	kernel;
 	u8		 adjust_symbols:1;
-	u8		 slen_calculated:1;
 	u8		 has_build_id:1;
 	u8		 hit:1;
 	u8		 annotate_warned:1;
-- 
cgit v0.10.2


From 60098917c06d154d06ce030c125266eab9e60768 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 4 Mar 2011 21:19:21 -0300
Subject: perf hists browser: Handle browsing empty hists tree

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index c27ab35..c98e6f8 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -639,6 +639,9 @@ static void ui_browser__hists_seek(struct ui_browser *self,
 	struct rb_node *nd;
 	bool first = true;
 
+	if (self->nr_entries == 0)
+		return;
+
 	switch (whence) {
 	case SEEK_SET:
 		nd = hists__filter_entries(rb_first(self->entries));
@@ -820,8 +823,8 @@ int hists__browse(struct hists *self, const char *helpline,
 	hists__browser_title(self, msg, sizeof(msg), ev_name,
 			     dso_filter, thread_filter);
 	while (1) {
-		const struct thread *thread;
-		const struct dso *dso;
+		const struct thread *thread = NULL;
+		const struct dso *dso = NULL;
 		char *options[16];
 		int nr_options = 0, choice = 0, i,
 		    annotate = -2, zoom_dso = -2, zoom_thread = -2,
@@ -829,8 +832,10 @@ int hists__browse(struct hists *self, const char *helpline,
 
 		key = hist_browser__run(browser, msg);
 
-		thread = hist_browser__selected_thread(browser);
-		dso = browser->selection->map ? browser->selection->map->dso : NULL;
+		if (browser->he_selection != NULL) {
+			thread = hist_browser__selected_thread(browser);
+			dso = browser->selection->map ? browser->selection->map->dso : NULL;
+		}
 
 		switch (key) {
 		case NEWT_KEY_TAB:
@@ -841,7 +846,8 @@ int hists__browse(struct hists *self, const char *helpline,
 			 */
 			goto out_free_stack;
 		case 'a':
-			if (browser->selection->map == NULL &&
+			if (browser->selection == NULL ||
+			    browser->selection->map == NULL ||
 			    browser->selection->map->dso->annotate_warned)
 				continue;
 			goto do_annotate;
@@ -887,7 +893,8 @@ int hists__browse(struct hists *self, const char *helpline,
 			goto out_free_stack;
 		}
 
-		if (browser->selection->sym != NULL &&
+		if (browser->selection != NULL &&
+		    browser->selection->sym != NULL &&
 		    !browser->selection->map->dso->annotate_warned &&
 		    asprintf(&options[nr_options], "Annotate %s",
 			     browser->selection->sym->name) > 0)
@@ -906,7 +913,8 @@ int hists__browse(struct hists *self, const char *helpline,
 			     (dso->kernel ? "the Kernel" : dso->short_name)) > 0)
 			zoom_dso = nr_options++;
 
-		if (browser->selection->map != NULL &&
+		if (browser->selection != NULL &&
+		    browser->selection->map != NULL &&
 		    asprintf(&options[nr_options], "Browse map details") > 0)
 			browse_map = nr_options++;
 
-- 
cgit v0.10.2


From 3d3b5e95997208067c963923db90ed1517565d14 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 4 Mar 2011 22:29:39 -0300
Subject: perf evlist: Split perf_evlist__id_hash

The previous situation was to receive an fd from where to read the event
ID.

Spin off a routine for when we have the ID handy, not having to read it
from some fd.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 030ae7f..190c64c 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -106,12 +106,24 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd)
 	evlist->nr_fds++;
 }
 
-static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
-			       int cpu, int thread, int fd)
+void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
+			  int cpu, int thread, u64 id)
+{
+	int hash;
+	struct perf_sample_id *sid = SID(evsel, cpu, thread);
+
+	sid->id = id;
+	sid->evsel = evsel;
+	hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
+	hlist_add_head(&sid->node, &evlist->heads[hash]);
+}
+
+static int perf_evlist__id_hash_fd(struct perf_evlist *evlist,
+				   struct perf_evsel *evsel,
+				   int cpu, int thread, int fd)
 {
-	struct perf_sample_id *sid;
 	u64 read_data[4] = { 0, };
-	int hash, id_idx = 1; /* The first entry is the counter value */
+	int id_idx = 1; /* The first entry is the counter value */
 
 	if (!(evsel->attr.read_format & PERF_FORMAT_ID) ||
 	    read(fd, &read_data, sizeof(read_data)) == -1)
@@ -122,11 +134,7 @@ static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *e
 	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		++id_idx;
 
-	sid = SID(evsel, cpu, thread);
-	sid->id = read_data[id_idx];
-	sid->evsel = evsel;
-	hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
-	hlist_add_head(&sid->node, &evlist->heads[hash]);
+	perf_evlist__id_hash(evlist, evsel, cpu, thread, read_data[id_idx]);
 	return 0;
 }
 
@@ -300,7 +308,7 @@ int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite)
 					goto out_unmap;
 
 				if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
-				    perf_evlist__id_hash(evlist, evsel, cpu, thread, fd) < 0)
+				    perf_evlist__id_hash_fd(evlist, evsel, cpu, thread, fd) < 0)
 					goto out_unmap;
 			}
 		}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index b75805a..078d512 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -38,6 +38,9 @@ void perf_evlist__delete(struct perf_evlist *evlist);
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
 int perf_evlist__add_default(struct perf_evlist *evlist);
 
+void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
+			  int cpu, int thread, u64 id);
+
 int perf_evlist__alloc_pollfd(struct perf_evlist *evlist);
 void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
-- 
cgit v0.10.2


From e248de331a452f8771eda6ed4bb30d92c82df28b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 5 Mar 2011 21:40:06 -0300
Subject: perf tools: Improve support for sessions with multiple events

By creating an perf_evlist out of the attributes in the perf.data file
header, so that we can use evlists and evsels when reading recorded
sessions in addition to when we record sessions.

More work is needed to allow tools to allow the user to select which
events are wanted when browsing sessions, be it just one or a subset of
them, aggregated or showed at the same time but with different
indications on the UI to allow seeing workloads thru different views at
the same time.

But the overall goal/trend is to more uniformly use evsels and evlists.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 4271829..695de4b 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -19,6 +19,8 @@
 #include "perf.h"
 #include "util/debug.h"
 
+#include "util/evlist.h"
+#include "util/evsel.h"
 #include "util/annotate.h"
 #include "util/event.h"
 #include "util/parse-options.h"
@@ -38,9 +40,13 @@ static bool		print_line;
 
 static const char *sym_hist_filter;
 
-static int hists__add_entry(struct hists *self, struct addr_location *al)
+static int perf_evlist__add_sample(struct perf_evlist *evlist,
+				   struct perf_sample *sample,
+				   struct addr_location *al)
 {
+	struct perf_evsel *evsel;
 	struct hist_entry *he;
+	int ret;
 
 	if (sym_hist_filter != NULL &&
 	    (al->sym == NULL || strcmp(sym_hist_filter, al->sym->name) != 0)) {
@@ -53,23 +59,35 @@ static int hists__add_entry(struct hists *self, struct addr_location *al)
 		return 0;
 	}
 
-	he = __hists__add_entry(self, al, NULL, 1);
+	evsel = perf_evlist__id2evsel(evlist, sample->id);
+	if (evsel == NULL) {
+		/*
+		 * FIXME: Propagate this back, but at least we're in a builtin,
+		 * where exit() is allowed. ;-)
+		 */
+		ui__warning("Invalid %s file, contains samples with id not in "
+			    "its header!\n", input_name);
+		exit_browser(0);
+		exit(1);
+	}
+
+	he = __hists__add_entry(&evsel->hists, al, NULL, 1);
 	if (he == NULL)
 		return -ENOMEM;
 
+	ret = 0;
 	if (he->ms.sym != NULL) {
-		/*
-		 * All aggregated on the first sym_hist.
-		 */
 		struct annotation *notes = symbol__annotation(he->ms.sym);
 		if (notes->src == NULL &&
-		    symbol__alloc_hist(he->ms.sym, 1) < 0)
+		    symbol__alloc_hist(he->ms.sym, evlist->nr_entries) < 0)
 			return -ENOMEM;
 
-		return hist_entry__inc_addr_samples(he, 0, al->addr);
+		ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
 	}
 
-	return 0;
+	evsel->hists.stats.total_period += sample->period;
+	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
+	return ret;
 }
 
 static int process_sample_event(union perf_event *event,
@@ -85,7 +103,7 @@ static int process_sample_event(union perf_event *event,
 		return -1;
 	}
 
-	if (!al.filtered && hists__add_entry(&session->hists, &al)) {
+	if (!al.filtered && perf_evlist__add_sample(session->evlist, sample, &al)) {
 		pr_warning("problem incrementing symbol count, "
 			   "skipping event\n");
 		return -1;
@@ -100,7 +118,7 @@ static int hist_entry__tty_annotate(struct hist_entry *he, int evidx)
 				    print_line, full_paths, 0, 0);
 }
 
-static void hists__find_annotations(struct hists *self)
+static void hists__find_annotations(struct hists *self, int evidx)
 {
 	struct rb_node *nd = rb_first(&self->entries), *next;
 	int key = KEY_RIGHT;
@@ -123,8 +141,7 @@ find_next:
 		}
 
 		if (use_browser > 0) {
-			/* For now all is aggregated on the first */
-			key = hist_entry__tui_annotate(he, 0);
+			key = hist_entry__tui_annotate(he, evidx);
 			switch (key) {
 			case KEY_RIGHT:
 				next = rb_next(nd);
@@ -139,8 +156,7 @@ find_next:
 			if (next != NULL)
 				nd = next;
 		} else {
-			/* For now all is aggregated on the first */
-			hist_entry__tty_annotate(he, 0);
+			hist_entry__tty_annotate(he, evidx);
 			nd = rb_next(nd);
 			/*
 			 * Since we have a hist_entry per IP for the same
@@ -166,6 +182,8 @@ static int __cmd_annotate(void)
 {
 	int ret;
 	struct perf_session *session;
+	struct perf_evsel *pos;
+	u64 total_nr_samples;
 
 	session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops);
 	if (session == NULL)
@@ -186,12 +204,36 @@ static int __cmd_annotate(void)
 	if (verbose > 2)
 		perf_session__fprintf_dsos(session, stdout);
 
-	hists__collapse_resort(&session->hists);
-	hists__output_resort(&session->hists);
-	hists__find_annotations(&session->hists);
-out_delete:
-	perf_session__delete(session);
+	total_nr_samples = 0;
+	list_for_each_entry(pos, &session->evlist->entries, node) {
+		struct hists *hists = &pos->hists;
+		u32 nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+
+		if (nr_samples > 0) {
+			total_nr_samples += nr_samples;
+			hists__collapse_resort(hists);
+			hists__output_resort(hists);
+			hists__find_annotations(hists, pos->idx);
+		}
+	}
 
+	if (total_nr_samples == 0) {
+		ui__warning("The %s file has no samples!\n", input_name);
+		goto out_delete;
+	}
+out_delete:
+	/*
+	 * Speed up the exit process, for large files this can
+	 * take quite a while.
+	 *
+	 * XXX Enable this when using valgrind or if we ever
+	 * librarize this command.
+	 *
+	 * Also experiment with obstacks to see how much speed
+	 * up we'll get here.
+	 *
+	 * perf_session__delete(session);
+	 */
 	return ret;
 }
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index dddcc7e..1c399ea 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -21,6 +21,8 @@
 
 #include "perf.h"
 #include "util/debug.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
 #include "util/header.h"
 #include "util/session.h"
 
@@ -46,39 +48,6 @@ static const char	*pretty_printing_style = default_pretty_printing_style;
 static char		callchain_default_opt[] = "fractal,0.5";
 static symbol_filter_t	annotate_init;
 
-static struct hists *perf_session__hists_findnew(struct perf_session *self,
-						 u64 event_stream, u32 type,
-						 u64 config)
-{
-	struct rb_node **p = &self->hists_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct hists *iter, *new;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct hists, rb_node);
-		if (iter->config == config)
-			return iter;
-
-
-		if (config > iter->config)
-			p = &(*p)->rb_right;
-		else
-			p = &(*p)->rb_left;
-	}
-
-	new = malloc(sizeof(struct hists));
-	if (new == NULL)
-		return NULL;
-	memset(new, 0, sizeof(struct hists));
-	new->event_stream = event_stream;
-	new->config = config;
-	new->type = type;
-	rb_link_node(&new->rb_node, parent, p);
-	rb_insert_color(&new->rb_node, &self->hists_tree);
-	return new;
-}
-
 static int perf_session__add_hist_entry(struct perf_session *session,
 					struct addr_location *al,
 					struct perf_sample *sample)
@@ -86,8 +55,7 @@ static int perf_session__add_hist_entry(struct perf_session *session,
 	struct symbol *parent = NULL;
 	int err = 0;
 	struct hist_entry *he;
-	struct hists *hists;
-	struct perf_event_attr *attr;
+	struct perf_evsel *evsel;
 
 	if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
 		err = perf_session__resolve_callchain(session, al->thread,
@@ -96,15 +64,19 @@ static int perf_session__add_hist_entry(struct perf_session *session,
 			return err;
 	}
 
-	attr = perf_header__find_attr(sample->id, &session->header);
-	if (attr)
-		hists = perf_session__hists_findnew(session, sample->id, attr->type, attr->config);
-	else
-		hists = perf_session__hists_findnew(session, sample->id, 0, 0);
-	if (hists == NULL)
-		return -ENOMEM;
+	evsel = perf_evlist__id2evsel(session->evlist, sample->id);
+	if (evsel == NULL) {
+		/*
+		 * FIXME: Propagate this back, but at least we're in a builtin,
+		 * where exit() is allowed. ;-)
+		 */
+		ui__warning("Invalid %s file, contains samples with id not in "
+			    "its header!\n", input_name);
+		exit_browser(0);
+		exit(1);
+	}
 
-	he = __hists__add_entry(hists, al, parent, sample->period);
+	he = __hists__add_entry(&evsel->hists, al, parent, sample->period);
 	if (he == NULL)
 		return -ENOMEM;
 
@@ -120,52 +92,30 @@ static int perf_session__add_hist_entry(struct perf_session *session,
 	 * code will not use it.
 	 */
 	if (al->sym != NULL && use_browser > 0) {
-		/*
-		 * All aggregated on the first sym_hist.
-		 */
 		struct annotation *notes = symbol__annotation(he->ms.sym);
+
+		assert(evsel != NULL);
+
+		err = -ENOMEM;
 		if (notes->src == NULL &&
-		    symbol__alloc_hist(he->ms.sym, 1) < 0)
-			err = -ENOMEM;
-		else
-			err = hist_entry__inc_addr_samples(he, 0, al->addr);
+		    symbol__alloc_hist(he->ms.sym, session->evlist->nr_entries) < 0)
+			goto out;
+
+		err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
 	}
 
+	evsel->hists.stats.total_period += sample->period;
+	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
+out:
 	return err;
 }
 
-static int add_event_total(struct perf_session *session,
-			   struct perf_sample *sample,
-			   struct perf_event_attr *attr)
-{
-	struct hists *hists;
-
-	if (attr)
-		hists = perf_session__hists_findnew(session, sample->id,
-						    attr->type, attr->config);
-	else
-		hists = perf_session__hists_findnew(session, sample->id, 0, 0);
-
-	if (!hists)
-		return -ENOMEM;
-
-	hists->stats.total_period += sample->period;
-	/*
-	 * FIXME: add_event_total should be moved from here to
-	 * perf_session__process_event so that the proper hist is passed to
-	 * the event_op methods.
-	 */
-	hists__inc_nr_events(hists, PERF_RECORD_SAMPLE);
-	session->hists.stats.total_period += sample->period;
-	return 0;
-}
 
 static int process_sample_event(union perf_event *event,
 				struct perf_sample *sample,
 				struct perf_session *session)
 {
 	struct addr_location al;
-	struct perf_event_attr *attr;
 
 	if (perf_event__preprocess_sample(event, session, &al, sample,
 					  annotate_init) < 0) {
@@ -182,27 +132,17 @@ static int process_sample_event(union perf_event *event,
 		return -1;
 	}
 
-	attr = perf_header__find_attr(sample->id, &session->header);
-
-	if (add_event_total(session, sample, attr)) {
-		pr_debug("problem adding event period\n");
-		return -1;
-	}
-
 	return 0;
 }
 
 static int process_read_event(union perf_event *event,
 			      struct perf_sample *sample __used,
-			      struct perf_session *session __used)
+			      struct perf_session *session)
 {
-	struct perf_event_attr *attr;
-
-	attr = perf_header__find_attr(event->read.id, &session->header);
-
+	struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist,
+							 event->read.id);
 	if (show_threads) {
-		const char *name = attr ? __event_name(attr->type, attr->config)
-				   : "unknown";
+		const char *name = evsel ? event_name(evsel) : "unknown";
 		perf_read_values_add_value(&show_threads_values,
 					   event->read.pid, event->read.tid,
 					   event->read.id,
@@ -211,7 +151,7 @@ static int process_read_event(union perf_event *event,
 	}
 
 	dump_printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid,
-		    attr ? __event_name(attr->type, attr->config) : "FAIL",
+		    evsel ? event_name(evsel) : "FAIL",
 		    event->read.value);
 
 	return 0;
@@ -282,21 +222,20 @@ static size_t hists__fprintf_nr_sample_events(struct hists *self,
 	return ret + fprintf(fp, "\n#\n");
 }
 
-static int hists__tty_browse_tree(struct rb_root *tree, const char *help)
+static int hists__tty_browse_tree(struct perf_evlist *evlist, const char *help)
 {
-	struct rb_node *next = rb_first(tree);
+	struct perf_evsel *pos;
 
-	while (next) {
-		struct hists *hists = rb_entry(next, struct hists, rb_node);
+	list_for_each_entry(pos, &evlist->entries, node) {
+		struct hists *hists = &pos->hists;
 		const char *evname = NULL;
 
 		if (rb_first(&hists->entries) != rb_last(&hists->entries))
-			evname = __event_name(hists->type, hists->config);
+			evname = event_name(pos);
 
 		hists__fprintf_nr_sample_events(hists, evname, stdout);
 		hists__fprintf(hists, NULL, false, stdout);
 		fprintf(stdout, "\n\n");
-		next = rb_next(&hists->rb_node);
 	}
 
 	if (sort_order == default_sort_order &&
@@ -317,8 +256,9 @@ static int hists__tty_browse_tree(struct rb_root *tree, const char *help)
 static int __cmd_report(void)
 {
 	int ret = -EINVAL;
+	u64 nr_samples;
 	struct perf_session *session;
-	struct rb_node *next;
+	struct perf_evsel *pos;
 	const char *help = "For a higher level overview, try: perf report --sort comm,dso";
 
 	signal(SIGINT, sig_handler);
@@ -349,26 +289,24 @@ static int __cmd_report(void)
 	if (verbose > 2)
 		perf_session__fprintf_dsos(session, stdout);
 
-	next = rb_first(&session->hists_tree);
-
-	if (next == NULL) {
-		ui__warning("The %s file has no samples!\n", input_name);
-		goto out_delete;
-	}
-
-	while (next) {
-		struct hists *hists;
+	nr_samples = 0;
+	list_for_each_entry(pos, &session->evlist->entries, node) {
+		struct hists *hists = &pos->hists;
 
-		hists = rb_entry(next, struct hists, rb_node);
 		hists__collapse_resort(hists);
 		hists__output_resort(hists);
-		next = rb_next(&hists->rb_node);
+		nr_samples += hists->stats.nr_events[PERF_RECORD_SAMPLE];
+	}
+
+	if (nr_samples == 0) {
+		ui__warning("The %s file has no samples!\n", input_name);
+		goto out_delete;
 	}
 
 	if (use_browser > 0)
-		hists__tui_browse_tree(&session->hists_tree, help, 0);
+		hists__tui_browse_tree(session->evlist, help);
 	else
-		hists__tty_browse_tree(&session->hists_tree, help);
+		hists__tty_browse_tree(session->evlist, help);
 
 out_delete:
 	/*
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index f6fc8f6..281b60e 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -7,6 +7,7 @@
 #include "types.h"
 #include "xyarray.h"
 #include "cgroup.h"
+#include "hist.h"
  
 struct perf_counts_values {
 	union {
@@ -51,6 +52,7 @@ struct perf_evsel {
 	struct xyarray		*id;
 	struct perf_counts	*counts;
 	int			idx;
+	struct hists		hists;
 	char			*name;
 	void			*priv;
 	struct cgroup_sel	*cgrp;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 72c124d..108b0db 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -969,37 +969,6 @@ bool perf_header__sample_id_all(const struct perf_header *header)
 	return value;
 }
 
-struct perf_event_attr *
-perf_header__find_attr(u64 id, struct perf_header *header)
-{
-	int i;
-
-	/*
-	 * We set id to -1 if the data file doesn't contain sample
-	 * ids. This can happen when the data file contains one type
-	 * of event and in that case, the header can still store the
-	 * event attribute information. Check for this and avoid
-	 * walking through the entire list of ids which may be large.
-	 */
-	if (id == -1ULL) {
-		if (header->attrs > 0)
-			return &header->attr[0]->attr;
-		return NULL;
-	}
-
-	for (i = 0; i < header->attrs; i++) {
-		struct perf_header_attr *attr = header->attr[i];
-		int j;
-
-		for (j = 0; j < attr->ids; j++) {
-			if (attr->id[j] == id)
-				return &attr->attr;
-		}
-	}
-
-	return NULL;
-}
-
 int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
 				perf_event__handler_t process,
 				struct perf_session *session)
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index f042ceb..2fab133 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -85,8 +85,6 @@ int perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
 
 u64 perf_header__sample_type(struct perf_header *header);
 bool perf_header__sample_id_all(const struct perf_header *header);
-struct perf_event_attr *
-perf_header__find_attr(u64 id, struct perf_header *header);
 void perf_header__set_feat(struct perf_header *self, int feat);
 void perf_header__clear_feat(struct perf_header *self, int feat);
 bool perf_header__has_feat(const struct perf_header *self, int feat);
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index f7ad6bd..627a02e 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -984,8 +984,12 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp)
 	size_t ret = 0;
 
 	for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
-		const char *name = perf_event__name(i);
+		const char *name;
 
+		if (self->stats.nr_events[i] == 0)
+			continue;
+
+		name = perf_event__name(i);
 		if (!strcmp(name, "UNKNOWN"))
 			continue;
 
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 37c7908..0d38b43 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -42,13 +42,10 @@ enum hist_column {
 };
 
 struct hists {
-	struct rb_node		rb_node;
 	struct rb_root		entries;
 	u64			nr_entries;
 	struct events_stats	stats;
-	u64			config;
 	u64			event_stream;
-	u32			type;
 	u16			col_len[HISTC_NR_COLS];
 	/* Best would be to reuse the session callchain cursor */
 	struct callchain_cursor	callchain_cursor;
@@ -87,6 +84,8 @@ u16 hists__col_len(struct hists *self, enum hist_column col);
 void hists__set_col_len(struct hists *self, enum hist_column col, u16 len);
 bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len);
 
+struct perf_evlist;
+
 #ifdef NO_NEWT_SUPPORT
 static inline int hists__browse(struct hists *self __used,
 				const char *helpline __used,
@@ -95,9 +94,8 @@ static inline int hists__browse(struct hists *self __used,
 	return 0;
 }
 
-static inline int hists__tui_browse_tree(struct rb_root *self __used,
-					 const char *help __used,
-					 int evidx __used)
+static inline int hists__tui_browse_tree(struct perf_evlist *evlist __used,
+					 const char *help __used)
 {
 	return 0;
 }
@@ -118,7 +116,7 @@ int hist_entry__tui_annotate(struct hist_entry *self, int evidx);
 #define KEY_LEFT NEWT_KEY_LEFT
 #define KEY_RIGHT NEWT_KEY_RIGHT
 
-int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx);
+int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help);
 #endif
 
 unsigned int hists__sort_list_width(struct hists *self);
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index a3a871f..0d41419 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -7,10 +7,52 @@
 #include <sys/types.h>
 #include <sys/mman.h>
 
+#include "evlist.h"
+#include "evsel.h"
 #include "session.h"
 #include "sort.h"
 #include "util.h"
 
+static int perf_session__read_evlist(struct perf_session *session)
+{
+	int i, j;
+
+	session->evlist = perf_evlist__new(NULL, NULL);
+	if (session->evlist == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < session->header.attrs; ++i) {
+		struct perf_header_attr *hattr = session->header.attr[i];
+		struct perf_evsel *evsel = perf_evsel__new(&hattr->attr, i);
+
+		if (evsel == NULL)
+			goto out_delete_evlist;
+		/*
+		 * Do it before so that if perf_evsel__alloc_id fails, this
+		 * entry gets purged too at perf_evlist__delete().
+		 */
+		perf_evlist__add(session->evlist, evsel);
+		/*
+		 * We don't have the cpu and thread maps on the header, so
+		 * for allocating the perf_sample_id table we fake 1 cpu and
+		 * hattr->ids threads.
+		 */
+		if (perf_evsel__alloc_id(evsel, 1, hattr->ids))
+			goto out_delete_evlist;
+
+		for (j = 0; j < hattr->ids; ++j)
+			perf_evlist__id_hash(session->evlist, evsel, 0, j,
+					     hattr->id[j]);
+	}
+
+	return 0;
+
+out_delete_evlist:
+	perf_evlist__delete(session->evlist);
+	session->evlist = NULL;
+	return -ENOMEM;
+}
+
 static int perf_session__open(struct perf_session *self, bool force)
 {
 	struct stat input_stat;
@@ -56,6 +98,11 @@ static int perf_session__open(struct perf_session *self, bool force)
 		goto out_close;
 	}
 
+	if (perf_session__read_evlist(self) < 0) {
+		pr_err("Not enough memory to read the event selector list\n");
+		goto out_close;
+	}
+
 	self->size = input_stat.st_size;
 	return 0;
 
@@ -141,7 +188,6 @@ struct perf_session *perf_session__new(const char *filename, int mode,
 	memcpy(self->filename, filename, len);
 	self->threads = RB_ROOT;
 	INIT_LIST_HEAD(&self->dead_threads);
-	self->hists_tree = RB_ROOT;
 	self->last_match = NULL;
 	/*
 	 * On 64bit we can mmap the data file in one go. No need for tiny mmap
@@ -1137,3 +1183,18 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp,
 	size_t ret = machine__fprintf_dsos_buildid(&self->host_machine, fp, with_hits);
 	return ret + machines__fprintf_dsos_buildid(&self->machines, fp, with_hits);
 }
+
+size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
+{
+	struct perf_evsel *pos;
+	size_t ret = fprintf(fp, "Aggregated stats:\n");
+
+	ret += hists__fprintf_nr_events(&session->hists, fp);
+
+	list_for_each_entry(pos, &session->evlist->entries, node) {
+		ret += fprintf(fp, "%s stats:\n", event_name(pos));
+		ret += hists__fprintf_nr_events(&pos->hists, fp);
+	}
+
+	return ret;
+}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 977b3a1..05dd7bc 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -34,12 +34,12 @@ struct perf_session {
 	struct thread		*last_match;
 	struct machine		host_machine;
 	struct rb_root		machines;
-	struct rb_root		hists_tree;
+	struct perf_evlist	*evlist;
 	/*
-	 * FIXME: should point to the first entry in hists_tree and
-	 *        be a hists instance. Right now its only 'report'
-	 *        that is using ->hists_tree while all the rest use
-	 *        ->hists.
+	 * FIXME: Need to split this up further, we need global
+	 *	  stats + per event stats. 'perf diff' also needs
+	 *	  to properly support multiple events in a single
+	 *	  perf.data file.
 	 */
 	struct hists		hists;
 	u64			sample_type;
@@ -151,11 +151,7 @@ size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp);
 size_t perf_session__fprintf_dsos_buildid(struct perf_session *self,
 					  FILE *fp, bool with_hits);
 
-static inline
-size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp)
-{
-	return hists__fprintf_nr_events(&self->hists, fp);
-}
+size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp);
 
 static inline int perf_session__parse_sample(struct perf_session *session,
 					     const union perf_event *event,
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index c98e6f8..f3af4fe 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -7,6 +7,8 @@
 #include <newt.h>
 #include <linux/rbtree.h>
 
+#include "../../evsel.h"
+#include "../../evlist.h"
 #include "../../hist.h"
 #include "../../pstack.h"
 #include "../../sort.h"
@@ -987,31 +989,33 @@ out:
 	return key;
 }
 
-int hists__tui_browse_tree(struct rb_root *self, const char *help, int evidx)
+int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help)
 {
-	struct rb_node *first = rb_first(self), *nd = first, *next;
-	int key = 0;
+	struct perf_evsel *pos;
 
-	while (nd) {
-		struct hists *hists = rb_entry(nd, struct hists, rb_node);
-		const char *ev_name = __event_name(hists->type, hists->config);
+	pos = list_entry(evlist->entries.next, struct perf_evsel, node);
+	while (pos) {
+		struct hists *hists = &pos->hists;
+		const char *ev_name = event_name(pos);
+		int key = hists__browse(hists, help, ev_name, pos->idx);
 
-		key = hists__browse(hists, help, ev_name, evidx);
 		switch (key) {
 		case NEWT_KEY_TAB:
-			next = rb_next(nd);
-			if (next)
-				nd = next;
+			if (pos->node.next == &evlist->entries)
+				pos = list_entry(evlist->entries.next, struct perf_evsel, node);
+			else
+				pos = list_entry(pos->node.next, struct perf_evsel, node);
 			break;
 		case NEWT_KEY_UNTAB:
-			if (nd == first)
-				continue;
-			nd = rb_prev(nd);
+			if (pos->node.prev == &evlist->entries)
+				pos = list_entry(evlist->entries.prev, struct perf_evsel, node);
+			else
+				pos = list_entry(pos->node.prev, struct perf_evsel, node);
 			break;
 		default:
 			return key;
 		}
 	}
 
-	return key;
+	return 0;
 }
-- 
cgit v0.10.2


From 7f0030b211579939461468f25b80c73e293c46e0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 6 Mar 2011 13:07:30 -0300
Subject: perf report tui: Improve multi event session support

When multiple events were used in 'perf record', allow the user to
choose which one is wanted before showing the per event histograms.

Annotations will be performed on the chosen event.

Allow going back and forth from event to event quickly using just the
arrow keys and enter.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: William Cohen <wcohen@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1c399ea..e9b5d51 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -222,7 +222,8 @@ static size_t hists__fprintf_nr_sample_events(struct hists *self,
 	return ret + fprintf(fp, "\n#\n");
 }
 
-static int hists__tty_browse_tree(struct perf_evlist *evlist, const char *help)
+static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
+					 const char *help)
 {
 	struct perf_evsel *pos;
 
@@ -304,9 +305,9 @@ static int __cmd_report(void)
 	}
 
 	if (use_browser > 0)
-		hists__tui_browse_tree(session->evlist, help);
+		perf_evlist__tui_browse_hists(session->evlist, help);
 	else
-		hists__tty_browse_tree(session->evlist, help);
+		perf_evlist__tty_browse_hists(session->evlist, help);
 
 out_delete:
 	/*
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 0d38b43..cb6858a 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -87,15 +87,9 @@ bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len);
 struct perf_evlist;
 
 #ifdef NO_NEWT_SUPPORT
-static inline int hists__browse(struct hists *self __used,
-				const char *helpline __used,
-				const char *ev_name __used, int evidx __used)
-{
-	return 0;
-}
-
-static inline int hists__tui_browse_tree(struct perf_evlist *evlist __used,
-					 const char *help __used)
+static inline
+int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __used,
+				  const char *help __used)
 {
 	return 0;
 }
@@ -109,14 +103,12 @@ static inline int hist_entry__tui_annotate(struct hist_entry *self __used,
 #define KEY_RIGHT -2
 #else
 #include <newt.h>
-int hists__browse(struct hists *self, const char *helpline,
-		  const char *ev_name, int evidx);
 int hist_entry__tui_annotate(struct hist_entry *self, int evidx);
 
 #define KEY_LEFT NEWT_KEY_LEFT
 #define KEY_RIGHT NEWT_KEY_RIGHT
 
-int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help);
+int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help);
 #endif
 
 unsigned int hists__sort_list_width(struct hists *self);
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index f3af4fe..798efdc 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -803,9 +803,11 @@ static int hists__browser_title(struct hists *self, char *bf, size_t size,
 	return printed;
 }
 
-int hists__browse(struct hists *self, const char *helpline,
-		  const char *ev_name, int evidx)
+static int perf_evsel__hists_browse(struct perf_evsel *evsel,
+				    const char *helpline, const char *ev_name,
+				    bool left_exits)
 {
+	struct hists *self = &evsel->hists;
 	struct hist_browser *browser = hist_browser__new(self);
 	struct pstack *fstack;
 	const struct thread *thread_filter = NULL;
@@ -878,8 +880,14 @@ int hists__browse(struct hists *self, const char *helpline,
 		case NEWT_KEY_LEFT: {
 			const void *top;
 
-			if (pstack__empty(fstack))
+			if (pstack__empty(fstack)) {
+				/*
+				 * Go back to the perf_evsel_menu__run or other user
+				 */
+				if (left_exits)
+					goto out_free_stack;
 				continue;
+			}
 			top = pstack__pop(fstack);
 			if (top == &dso_filter)
 				goto zoom_out_dso;
@@ -888,7 +896,8 @@ int hists__browse(struct hists *self, const char *helpline,
 			continue;
 		}
 		case NEWT_KEY_ESCAPE:
-			if (!ui__dialog_yesno("Do you really want to exit?"))
+			if (!left_exits &&
+			    !ui__dialog_yesno("Do you really want to exit?"))
 				continue;
 			/* Fall thru */
 		default:
@@ -940,7 +949,7 @@ do_annotate:
 			if (he == NULL)
 				continue;
 
-			hist_entry__tui_annotate(he, evidx);
+			hist_entry__tui_annotate(he, evsel->idx);
 		} else if (choice == browse_map)
 			map__browse(browser->selection->map);
 		else if (choice == zoom_dso) {
@@ -989,15 +998,71 @@ out:
 	return key;
 }
 
-int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help)
+struct perf_evsel_menu {
+	struct ui_browser b;
+	struct perf_evsel *selection;
+};
+
+static void perf_evsel_menu__write(struct ui_browser *browser,
+				   void *entry, int row)
+{
+	struct perf_evsel_menu *menu = container_of(browser,
+						    struct perf_evsel_menu, b);
+	struct perf_evsel *evsel = list_entry(entry, struct perf_evsel, node);
+	bool current_entry = ui_browser__is_current_entry(browser, row);
+	unsigned long nr_events = evsel->hists.stats.nr_events[PERF_RECORD_SAMPLE];
+	const char *ev_name = event_name(evsel);
+	char bf[256], unit;
+
+	ui_browser__set_color(browser, current_entry ? HE_COLORSET_SELECTED :
+						       HE_COLORSET_NORMAL);
+
+	nr_events = convert_unit(nr_events, &unit);
+	snprintf(bf, sizeof(bf), "%lu%c%s%s", nr_events,
+		 unit, unit == ' ' ? "" : " ", ev_name);
+	slsmg_write_nstring(bf, browser->width);
+
+	if (current_entry)
+		menu->selection = evsel;
+}
+
+static int perf_evsel_menu__run(struct perf_evsel_menu *menu, const char *help)
 {
+	int exit_keys[] = { NEWT_KEY_ENTER, NEWT_KEY_RIGHT, 0, };
+	struct perf_evlist *evlist = menu->b.priv;
 	struct perf_evsel *pos;
+	const char *ev_name, *title = "Available samples";
+	int key;
 
-	pos = list_entry(evlist->entries.next, struct perf_evsel, node);
-	while (pos) {
-		struct hists *hists = &pos->hists;
-		const char *ev_name = event_name(pos);
-		int key = hists__browse(hists, help, ev_name, pos->idx);
+	if (ui_browser__show(&menu->b, title,
+			     "ESC: exit, ENTER|->: Browse histograms") < 0)
+		return -1;
+
+	ui_browser__add_exit_keys(&menu->b, exit_keys);
+
+	while (1) {
+		key = ui_browser__run(&menu->b);
+
+		switch (key) {
+		case NEWT_KEY_RIGHT:
+		case NEWT_KEY_ENTER:
+			if (!menu->selection)
+				continue;
+			pos = menu->selection;
+browse_hists:
+			ev_name = event_name(pos);
+			key = perf_evsel__hists_browse(pos, help, ev_name, true);
+			ui_browser__show_title(&menu->b, title);
+			break;
+		case NEWT_KEY_LEFT:
+			continue;
+		case NEWT_KEY_ESCAPE:
+			if (!ui__dialog_yesno("Do you really want to exit?"))
+				continue;
+			/* Fall thru */
+		default:
+			goto out;
+		}
 
 		switch (key) {
 		case NEWT_KEY_TAB:
@@ -1005,17 +1070,69 @@ int hists__tui_browse_tree(struct perf_evlist *evlist, const char *help)
 				pos = list_entry(evlist->entries.next, struct perf_evsel, node);
 			else
 				pos = list_entry(pos->node.next, struct perf_evsel, node);
-			break;
+			goto browse_hists;
 		case NEWT_KEY_UNTAB:
 			if (pos->node.prev == &evlist->entries)
 				pos = list_entry(evlist->entries.prev, struct perf_evsel, node);
 			else
 				pos = list_entry(pos->node.prev, struct perf_evsel, node);
-			break;
+			goto browse_hists;
+		case 'q':
+		case CTRL('c'):
+			goto out;
 		default:
-			return key;
+			break;
 		}
 	}
 
-	return 0;
+out:
+	ui_browser__hide(&menu->b);
+	return key;
+}
+
+static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist,
+					   const char *help)
+{
+	struct perf_evsel *pos;
+	struct perf_evsel_menu menu = {
+		.b = {
+			.entries    = &evlist->entries,
+			.refresh    = ui_browser__list_head_refresh,
+			.seek	    = ui_browser__list_head_seek,
+			.write	    = perf_evsel_menu__write,
+			.nr_entries = evlist->nr_entries,
+			.priv	    = evlist,
+		},
+	};
+
+	ui_helpline__push("Press ESC to exit");
+
+	list_for_each_entry(pos, &evlist->entries, node) {
+		const char *ev_name = event_name(pos);
+		size_t line_len = strlen(ev_name) + 7;
+
+		if (menu.b.width < line_len)
+			menu.b.width = line_len;
+		/*
+		 * Cache the evsel name, tracepoints have a _high_ cost per
+		 * event_name() call.
+		 */
+		if (pos->name == NULL)
+			pos->name = strdup(ev_name);
+	}
+
+	return perf_evsel_menu__run(&menu, help);
+}
+
+int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help)
+{
+
+	if (evlist->nr_entries == 1) {
+		struct perf_evsel *first = list_entry(evlist->entries.next,
+						      struct perf_evsel, node);
+		const char *ev_name = event_name(first);
+		return perf_evsel__hists_browse(first, help, ev_name, false);
+	}
+
+	return __perf_evlist__tui_browse_hists(evlist, help);
 }
-- 
cgit v0.10.2


From 997772884036e6e121de39322179989154437d9f Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Mon, 7 Mar 2011 09:58:33 +0100
Subject: debugobjects: Add hint for better object identification

In complex subsystems like mac80211 structures can contain several
timers and work structs, so identifying a specific instance from the
call trace and object type output of debugobjects can be hard.

Allow the subsystems which support debugobjects to provide a hint
function. This function returns a pointer to a kernel address
(preferrably the objects callback function) which is printed along
with the debugobjects type.

Add hint methods for timer_list, work_struct and hrtimer.

[ tglx: Massaged changelog, made it compile ]

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <20110307085809.GA9334@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 597692f..65970b8 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -34,7 +34,10 @@ struct debug_obj {
 
 /**
  * struct debug_obj_descr - object type specific debug description structure
+ *
  * @name:		name of the object typee
+ * @debug_hint:		function returning address, which have associated
+ *			kernel symbol, to allow identify the object
  * @fixup_init:		fixup function, which is called when the init check
  *			fails
  * @fixup_activate:	fixup function, which is called when the activate check
@@ -46,7 +49,7 @@ struct debug_obj {
  */
 struct debug_obj_descr {
 	const char		*name;
-
+	void *(*debug_hint)	(void *addr);
 	int (*fixup_init)	(void *addr, enum debug_obj_state state);
 	int (*fixup_activate)	(void *addr, enum debug_obj_state state);
 	int (*fixup_destroy)	(void *addr, enum debug_obj_state state);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0c8d7c0..e38f5a0 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -334,6 +334,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
 
 static struct debug_obj_descr hrtimer_debug_descr;
 
+static void *hrtimer_debug_hint(void *addr)
+{
+	return ((struct hrtimer *) addr)->function;
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -393,6 +398,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
 
 static struct debug_obj_descr hrtimer_debug_descr = {
 	.name		= "hrtimer",
+	.debug_hint	= hrtimer_debug_hint,
 	.fixup_init	= hrtimer_fixup_init,
 	.fixup_activate	= hrtimer_fixup_activate,
 	.fixup_free	= hrtimer_fixup_free,
diff --git a/kernel/timer.c b/kernel/timer.c
index d645992..33a6792 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
 
 static struct debug_obj_descr timer_debug_descr;
 
+static void *timer_debug_hint(void *addr)
+{
+	return ((struct timer_list *) addr)->function;
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
 
 static struct debug_obj_descr timer_debug_descr = {
 	.name		= "timer_list",
+	.debug_hint	= timer_debug_hint,
 	.fixup_init	= timer_fixup_init,
 	.fixup_activate	= timer_fixup_activate,
 	.fixup_free	= timer_fixup_free,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee6578b..b5fe4c0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -316,6 +316,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
 
 static struct debug_obj_descr work_debug_descr;
 
+static void *work_debug_hint(void *addr)
+{
+	return ((struct work_struct *) addr)->func;
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -387,6 +392,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
 
 static struct debug_obj_descr work_debug_descr = {
 	.name		= "work_struct",
+	.debug_hint	= work_debug_hint,
 	.fixup_init	= work_fixup_init,
 	.fixup_activate	= work_fixup_activate,
 	.fixup_free	= work_fixup_free,
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index deebcc5..9d86e45 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -249,14 +249,17 @@ static struct debug_bucket *get_bucket(unsigned long addr)
 
 static void debug_print_object(struct debug_obj *obj, char *msg)
 {
+	struct debug_obj_descr *descr = obj->descr;
 	static int limit;
 
-	if (limit < 5 && obj->descr != descr_test) {
+	if (limit < 5 && descr != descr_test) {
+		void *hint = descr->debug_hint ?
+			descr->debug_hint(obj->object) : NULL;
 		limit++;
 		WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) "
-				 "object type: %s\n",
+				 "object type: %s hint: %pS\n",
 			msg, obj_states[obj->state], obj->astate,
-			obj->descr->name);
+			descr->name, hint);
 	}
 	debug_objects_warnings++;
 }
-- 
cgit v0.10.2


From ea7145477a461e09d8d194cac4b996dc4f449107 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 7 Mar 2011 19:10:39 +0100
Subject: x86: Separate out entry text section

Put x86 entry code into a separate link section: .entry.text.

Separating the entry text section seems to have performance
benefits - caused by more efficient instruction cache usage.

Running hackbench with perf stat --repeat showed that the change
compresses the icache footprint. The icache load miss rate went
down by about 15%:

 before patch:
         19417627  L1-icache-load-misses      ( +-   0.147% )

 after patch:
         16490788  L1-icache-load-misses      ( +-   0.180% )

The motivation of the patch was to fix a particular kprobes
bug that relates to the entry text section, the performance
advantage was discovered accidentally.

Whole perf output follows:

 - results for current tip tree:

  Performance counter stats for './hackbench/hackbench 10' (500 runs):

         19417627  L1-icache-load-misses      ( +-   0.147% )
       2676914223  instructions             #      0.497 IPC     ( +- 0.079% )
       5389516026  cycles                     ( +-   0.144% )

      0.206267711  seconds time elapsed   ( +-   0.138% )

 - results for current tip tree with the patch applied:

  Performance counter stats for './hackbench/hackbench 10' (500 runs):

         16490788  L1-icache-load-misses      ( +-   0.180% )
       2717734941  instructions             #      0.502 IPC     ( +- 0.079% )
       5414756975  cycles                     ( +-   0.148% )

      0.206747566  seconds time elapsed   ( +-   0.137% )

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: masami.hiramatsu.pt@hitachi.com
Cc: ananth@in.ibm.com
Cc: davem@davemloft.net
Cc: 2nddept-manager@sdl.hitachi.co.jp
LKML-Reference: <20110307181039.GB15197@jolsa.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99..f729b2e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
 #define sysretl_audit ia32_ret_from_sys_call
 #endif
 
+	.section .entry.text, "ax"
+
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 
 	.macro IA32_ARG_FIXUP noebp=0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efa..f5accf8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
 #define sysexit_audit	syscall_exit_work
 #endif
 
+	.section .entry.text, "ax"
+
 /*
  * We use macros for low-level operations which need to be overridden
  * for paravirtualization.  The following will never clobber any registers:
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
  */
 .section .init.rodata,"a"
 ENTRY(interrupt)
-.text
+.section .entry.text, "ax"
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.long 1b
-      .text
+      .section .entry.text, "ax"
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffb..0a0ed79 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -61,6 +61,8 @@
 #define __AUDIT_ARCH_LE	   0x40000000
 
 	.code64
+	.section .entry.text, "ax"
+
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
  */
 	.section .init.rodata,"a"
 ENTRY(interrupt)
-	.text
+	.section .entry.text
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.quad 1b
-      .text
+      .section .entry.text
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf47007..6d4341d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ENTRY_TEXT
 		IRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b3bfabc..c1a1216 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
 extern char _end[];
 extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
+extern char __entry_text_start[], __entry_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fe77e33..906c3ce 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -424,6 +424,12 @@
 		*(.kprobes.text)					\
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
+#define ENTRY_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__entry_text_start) = .;			\
+		*(.entry.text)						\
+		VMLINUX_SYMBOL(__entry_text_end) = .;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #define IRQENTRY_TEXT							\
 		ALIGN_FUNCTION();					\
-- 
cgit v0.10.2


From 2a8247a2600c3e087a568fc68a6ec4eedac27ef1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 21 Feb 2011 15:25:13 +0100
Subject: kprobes: Disabling optimized kprobes for entry text section

You can crash the kernel (with root/admin privileges) using kprobe tracer by running:

 echo "p system_call_after_swapgs" > ./kprobe_events
 echo 1 > ./events/kprobes/enable

The reason is that at the system_call_after_swapgs label, the
kernel stack is not set up. If optimized kprobes are enabled,
the user space stack is being used in this case (see optimized
kprobe template) and this might result in a crash.

There are several places like this over the entry code
(entry_$BIT). As it seems there's no any reasonable/maintainable
way to disable only those places where the stack is not ready, I
switched off the whole entry code from kprobe optimizing.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: acme@redhat.com
Cc: fweisbec@gmail.com
Cc: ananth@in.ibm.com
Cc: davem@davemloft.net
Cc: a.p.zijlstra@chello.nl
Cc: eric.dumazet@gmail.com
Cc: 2nddept-manager@sdl.hitachi.co.jp
LKML-Reference: <1298298313-5980-3-git-send-email-jolsa@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477..c969fd9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
 	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
 		return 0;
 
+	/*
+	 * Do not optimize in the entry code due to the unstable
+	 * stack handling.
+	 */
+	if ((paddr >= (unsigned long )__entry_text_start) &&
+	    (paddr <  (unsigned long )__entry_text_end))
+		return 0;
+
 	/* Check there is enough space for a relative jump. */
 	if (size - offset < RELATIVEJUMP_SIZE)
 		return 0;
-- 
cgit v0.10.2


From c68fd4f3ca90de7d18c567e70b2c164078aefadf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 8 Mar 2011 19:52:55 +0100
Subject: genirq: Add comments to Kconfig switches

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Sam Ravnborg <sam@ravnborg.org>

diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 144db9d..09bef82 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,3 +1,4 @@
+# Select this to activate the generic irq options below
 config HAVE_GENERIC_HARDIRQS
 	bool
 
@@ -17,27 +18,36 @@ config GENERIC_HARDIRQS_NO_COMPAT
        bool
 
 # Options selectable by the architecture code
+
+# Make sparse irq Kconfig switch below available
 config HAVE_SPARSE_IRQ
        bool
 
+# Enable the generic irq autoprobe mechanism
 config GENERIC_IRQ_PROBE
 	bool
 
+# Use the generic /proc/interrupts implementation
 config GENERIC_IRQ_SHOW
        bool
 
+# Support for delayed migration from interrupt context
 config GENERIC_PENDING_IRQ
 	bool
 
+# Alpha specific irq affinity mechanism
 config AUTO_IRQ_AFFINITY
        bool
 
+# Tasklet based software resend for pending interrupts on enable_irq()
 config HARDIRQS_SW_RESEND
        bool
 
+# Preflow handler support for fasteoi (sparc64)
 config IRQ_PREFLOW_FASTEOI
        bool
 
+# Support forced irq threading
 config IRQ_FORCED_THREADING
        bool
 
-- 
cgit v0.10.2


From 51de69523ffe1c17994dc2f260369f29dfdce71c Mon Sep 17 00:00:00 2001
From: Owen Smith <owen.smith@citrix.com>
Date: Wed, 22 Dec 2010 15:05:00 +0000
Subject: xen: Union the blkif_request request specific fields

Prepare for extending the block device ring to allow request
specific fields, by moving the request specific fields for
reads, writes and barrier requests to a union member.

Acked-by: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Owen Smith <owen.smith@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d7aa39e..cc4514c 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -281,7 +281,7 @@ static int blkif_queue_request(struct request *req)
 	info->shadow[id].request = req;
 
 	ring_req->id = id;
-	ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
+	ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
 	ring_req->handle = info->handle;
 
 	ring_req->operation = rq_data_dir(req) ?
@@ -317,7 +317,7 @@ static int blkif_queue_request(struct request *req)
 				rq_data_dir(req) );
 
 		info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
-		ring_req->seg[i] =
+		ring_req->u.rw.seg[i] =
 				(struct blkif_request_segment) {
 					.gref       = ref,
 					.first_sect = fsect,
@@ -615,7 +615,7 @@ static void blkif_completion(struct blk_shadow *s)
 {
 	int i;
 	for (i = 0; i < s->req.nr_segments; i++)
-		gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+		gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -932,7 +932,7 @@ static int blkif_recover(struct blkfront_info *info)
 		/* Rewrite any grant references invalidated by susp/resume. */
 		for (j = 0; j < req->nr_segments; j++)
 			gnttab_grant_foreign_access_ref(
-				req->seg[j].gref,
+				req->u.rw.seg[j].gref,
 				info->xbdev->otherend_id,
 				pfn_to_mfn(info->shadow[req->id].frame[j]),
 				rq_data_dir(info->shadow[req->id].request));
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c2d1fa4..e4f743c 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -51,11 +51,7 @@ typedef uint64_t blkif_sector_t;
  */
 #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
 
-struct blkif_request {
-	uint8_t        operation;    /* BLKIF_OP_???                         */
-	uint8_t        nr_segments;  /* number of segments                   */
-	blkif_vdev_t   handle;       /* only for read/write requests         */
-	uint64_t       id;           /* private guest value, echoed in resp  */
+struct blkif_request_rw {
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
 	struct blkif_request_segment {
 		grant_ref_t gref;        /* reference to I/O buffer frame        */
@@ -65,6 +61,16 @@ struct blkif_request {
 	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 };
 
+struct blkif_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	union {
+		struct blkif_request_rw rw;
+	} u;
+};
+
 struct blkif_response {
 	uint64_t        id;              /* copied from request */
 	uint8_t         operation;       /* copied from request */
-- 
cgit v0.10.2


From c49aa5bd1376939b40759a6da5ba6cf701702721 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Tue, 8 Mar 2011 09:24:26 +0000
Subject: x86: Remove dead config option X86_CPU

This isn't being referenced anywhere, and the selects done from
it can be easily done together with all the other X86 ones.

 v2: Also adjust UML's Kconfig.x86.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4D7603DA02000078000351C1@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/um/Kconfig.x86 b/arch/um/Kconfig.x86
index 62f2115..02fb017 100644
--- a/arch/um/Kconfig.x86
+++ b/arch/um/Kconfig.x86
@@ -10,6 +10,8 @@ endmenu
 
 config UML_X86
 	def_bool y
+	select GENERIC_FIND_FIRST_BIT
+	select GENERIC_FIND_NEXT_BIT
 
 config 64BIT
 	bool
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d..43214e3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -64,6 +64,8 @@ config X86
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_SPARSE_IRQ
+	select GENERIC_FIND_FIRST_BIT
+	select GENERIC_FIND_NEXT_BIT
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
 	select USE_GENERIC_SMP_HELPERS if SMP
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 283c5a6..ed47e6e 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -294,11 +294,6 @@ config X86_GENERIC
 
 endif
 
-config X86_CPU
-	def_bool y
-	select GENERIC_FIND_FIRST_BIT
-	select GENERIC_FIND_NEXT_BIT
-
 #
 # Define implied options from the CPU selection here
 config X86_INTERNODE_CACHE_SHIFT
-- 
cgit v0.10.2


From 750912fa366312e9c5bc83eab352898a26750401 Mon Sep 17 00:00:00 2001
From: David Sharp <dhsharp@google.com>
Date: Wed, 8 Dec 2010 13:46:47 -0800
Subject: tracing: Add an 'overwrite' trace_option.

Add an "overwrite" trace_option for ftrace to control whether the buffer should
be overwritten on overflow or not. The default remains to overwrite old events
when the buffer is full. This patch adds the option to instead discard newest
events when the buffer is full. This is useful to get a snapshot of traces just
after enabling traces. Dropping the current event is also a simpler code path.

Signed-off-by: David Sharp <dhsharp@google.com>
LKML-Reference: <1291844807-15481-1-git-send-email-dhsharp@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 67f1cc4..1ebc24c 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -454,6 +454,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
                    latencies, as described in "Latency
                    trace format".
 
+  overwrite - This controls what happens when the trace buffer is
+              full. If "1" (default), the oldest events are
+              discarded and overwritten. If "0", then the newest
+              events are discarded.
+
 ftrace_enabled
 --------------
 
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 8d3a248..ab38ac8 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -100,6 +100,8 @@ void ring_buffer_free(struct ring_buffer *buffer);
 
 int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
 
+void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val);
+
 struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer,
 						   unsigned long length);
 int ring_buffer_unlock_commit(struct ring_buffer *buffer,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a..269db80 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1429,6 +1429,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_resize);
 
+void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
+{
+	mutex_lock(&buffer->mutex);
+	if (val)
+		buffer->flags |= RB_FL_OVERWRITE;
+	else
+		buffer->flags &= ~RB_FL_OVERWRITE;
+	mutex_unlock(&buffer->mutex);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
+
 static inline void *
 __rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8dc8da6..85e3ee1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
 #include "trace.h"
 #include "trace_output.h"
 
-#define TRACE_BUFFER_FLAGS	(RB_FL_OVERWRITE)
-
 /*
  * On boot up, the ring buffer is set to the minimum size, so that
  * we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
 	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
-	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
+	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
 
 static int trace_stop_count;
 static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
 	"sleep-time",
 	"graph-time",
 	"record-cmd",
+	"overwrite",
 	NULL
 };
 
@@ -2529,6 +2528,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
 
 	if (mask == TRACE_ITER_RECORD_CMD)
 		trace_event_enable_cmd_record(enabled);
+
+	if (mask == TRACE_ITER_OVERWRITE)
+		ring_buffer_change_overwrite(global_trace.buffer, enabled);
 }
 
 static ssize_t
@@ -4555,9 +4557,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 __init static int tracer_alloc_buffers(void)
 {
 	int ring_buf_size;
+	enum ring_buffer_flags rb_flags;
 	int i;
 	int ret = -ENOMEM;
 
+
 	if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
 		goto out;
 
@@ -4570,12 +4574,13 @@ __init static int tracer_alloc_buffers(void)
 	else
 		ring_buf_size = 1;
 
+	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+
 	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
 	cpumask_copy(tracing_cpumask, cpu_all_mask);
 
 	/* TODO: make the number of buffers hot pluggable with CPUS */
-	global_trace.buffer = ring_buffer_alloc(ring_buf_size,
-						   TRACE_BUFFER_FLAGS);
+	global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
 	if (!global_trace.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
 		WARN_ON(1);
@@ -4585,7 +4590,7 @@ __init static int tracer_alloc_buffers(void)
 
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
+	max_tr.buffer = ring_buffer_alloc(1, rb_flags);
 	if (!max_tr.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
 		WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 856e73c..951d0b7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_SLEEP_TIME		= 0x40000,
 	TRACE_ITER_GRAPH_TIME		= 0x80000,
 	TRACE_ITER_RECORD_CMD		= 0x100000,
+	TRACE_ITER_OVERWRITE		= 0x200000,
 };
 
 /*
-- 
cgit v0.10.2


From de29be5e712dc8b7eef2bef9417af3bb6a88e47a Mon Sep 17 00:00:00 2001
From: David Sharp <dhsharp@google.com>
Date: Fri, 3 Dec 2010 16:13:16 -0800
Subject: ring-buffer: Remove unused #include <linux/trace_irq.h>

Signed-off-by: David Sharp <dhsharp@google.com>
LKML-Reference: <1291421609-14665-3-git-send-email-dhsharp@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 269db80..3237d96 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
  */
 #include <linux/ring_buffer.h>
 #include <linux/trace_clock.h>
-#include <linux/ftrace_irq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
-- 
cgit v0.10.2


From f44f7f96a20af16f6f12e1c995576d6becf5f57b Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Mon, 21 Feb 2011 22:58:51 -0800
Subject: RTC: Initialize kernel state from RTC

Mark Brown pointed out a corner case: that RTC alarms should
be allowed to be persistent across reboots if the hardware
supported it.

The rework of the generic layer to virtualize the RTC alarm
virtualized much of the alarm handling, and removed the
code used to read the alarm time from the hardware.

Mark noted if we want the alarm to be persistent across
reboots, we need to re-read the alarm value into the
virtualized generic layer at boot up, so that the generic
layer properly exposes that value.

This patch restores much of the earlier removed
rtc_read_alarm code and wires it in so that we
set the kernel's alarm value to what we find in the
hardware at boot time.

NOTE: Not all hardware supports persistent RTC alarm state across
system reset. rtc-cmos for example will keep the alarm time, but
disables the AIE mode irq. Applications should not expect the RTC
alarm to be valid after a system reset. We will preserve what
we can, to represent the hardware state at boot, but its not
guarenteed.

Further, in the future, with multiplexed RTC alarms, the
soonest alarm to fire may not be the one set via the /dev/rt
ioctls. So an application may set the alarm with RTC_ALM_SET,
but after a reset find that RTC_ALM_READ returns an earlier
time. Again, we preserve what we can, but applications should
not expect the RTC alarm state to persist across a system reset.

Big thanks to Mark for pointing out the issue!
Thanks also to Marcelo for helping think through the solution.

CC: Mark Brown <broonie@opensource.wolfsonmicro.com>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: rtc-linux@googlegroups.com
Reported-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index c404b61..09b4437 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -117,6 +117,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
 					struct module *owner)
 {
 	struct rtc_device *rtc;
+	struct rtc_wkalrm alrm;
 	int id, err;
 
 	if (idr_pre_get(&rtc_idr, GFP_KERNEL) == 0) {
@@ -166,6 +167,12 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
 	rtc->pie_timer.function = rtc_pie_update_irq;
 	rtc->pie_enabled = 0;
 
+	/* Check to see if there is an ALARM already set in hw */
+	err = __rtc_read_alarm(rtc, &alrm);
+
+	if (!err && !rtc_valid_tm(&alrm.time))
+		rtc_set_alarm(rtc, &alrm);
+
 	strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE);
 	dev_set_name(&rtc->dev, "rtc%d", id);
 
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index cb2f072..8ec6b06 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -116,6 +116,186 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs)
 }
 EXPORT_SYMBOL_GPL(rtc_set_mmss);
 
+static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
+{
+	int err;
+
+	err = mutex_lock_interruptible(&rtc->ops_lock);
+	if (err)
+		return err;
+
+	if (rtc->ops == NULL)
+		err = -ENODEV;
+	else if (!rtc->ops->read_alarm)
+		err = -EINVAL;
+	else {
+		memset(alarm, 0, sizeof(struct rtc_wkalrm));
+		err = rtc->ops->read_alarm(rtc->dev.parent, alarm);
+	}
+
+	mutex_unlock(&rtc->ops_lock);
+	return err;
+}
+
+int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
+{
+	int err;
+	struct rtc_time before, now;
+	int first_time = 1;
+	unsigned long t_now, t_alm;
+	enum { none, day, month, year } missing = none;
+	unsigned days;
+
+	/* The lower level RTC driver may return -1 in some fields,
+	 * creating invalid alarm->time values, for reasons like:
+	 *
+	 *   - The hardware may not be capable of filling them in;
+	 *     many alarms match only on time-of-day fields, not
+	 *     day/month/year calendar data.
+	 *
+	 *   - Some hardware uses illegal values as "wildcard" match
+	 *     values, which non-Linux firmware (like a BIOS) may try
+	 *     to set up as e.g. "alarm 15 minutes after each hour".
+	 *     Linux uses only oneshot alarms.
+	 *
+	 * When we see that here, we deal with it by using values from
+	 * a current RTC timestamp for any missing (-1) values.  The
+	 * RTC driver prevents "periodic alarm" modes.
+	 *
+	 * But this can be racey, because some fields of the RTC timestamp
+	 * may have wrapped in the interval since we read the RTC alarm,
+	 * which would lead to us inserting inconsistent values in place
+	 * of the -1 fields.
+	 *
+	 * Reading the alarm and timestamp in the reverse sequence
+	 * would have the same race condition, and not solve the issue.
+	 *
+	 * So, we must first read the RTC timestamp,
+	 * then read the RTC alarm value,
+	 * and then read a second RTC timestamp.
+	 *
+	 * If any fields of the second timestamp have changed
+	 * when compared with the first timestamp, then we know
+	 * our timestamp may be inconsistent with that used by
+	 * the low-level rtc_read_alarm_internal() function.
+	 *
+	 * So, when the two timestamps disagree, we just loop and do
+	 * the process again to get a fully consistent set of values.
+	 *
+	 * This could all instead be done in the lower level driver,
+	 * but since more than one lower level RTC implementation needs it,
+	 * then it's probably best best to do it here instead of there..
+	 */
+
+	/* Get the "before" timestamp */
+	err = rtc_read_time(rtc, &before);
+	if (err < 0)
+		return err;
+	do {
+		if (!first_time)
+			memcpy(&before, &now, sizeof(struct rtc_time));
+		first_time = 0;
+
+		/* get the RTC alarm values, which may be incomplete */
+		err = rtc_read_alarm_internal(rtc, alarm);
+		if (err)
+			return err;
+
+		/* full-function RTCs won't have such missing fields */
+		if (rtc_valid_tm(&alarm->time) == 0)
+			return 0;
+
+		/* get the "after" timestamp, to detect wrapped fields */
+		err = rtc_read_time(rtc, &now);
+		if (err < 0)
+			return err;
+
+		/* note that tm_sec is a "don't care" value here: */
+	} while (   before.tm_min   != now.tm_min
+		 || before.tm_hour  != now.tm_hour
+		 || before.tm_mon   != now.tm_mon
+		 || before.tm_year  != now.tm_year);
+
+	/* Fill in the missing alarm fields using the timestamp; we
+	 * know there's at least one since alarm->time is invalid.
+	 */
+	if (alarm->time.tm_sec == -1)
+		alarm->time.tm_sec = now.tm_sec;
+	if (alarm->time.tm_min == -1)
+		alarm->time.tm_min = now.tm_min;
+	if (alarm->time.tm_hour == -1)
+		alarm->time.tm_hour = now.tm_hour;
+
+	/* For simplicity, only support date rollover for now */
+	if (alarm->time.tm_mday == -1) {
+		alarm->time.tm_mday = now.tm_mday;
+		missing = day;
+	}
+	if (alarm->time.tm_mon == -1) {
+		alarm->time.tm_mon = now.tm_mon;
+		if (missing == none)
+			missing = month;
+	}
+	if (alarm->time.tm_year == -1) {
+		alarm->time.tm_year = now.tm_year;
+		if (missing == none)
+			missing = year;
+	}
+
+	/* with luck, no rollover is needed */
+	rtc_tm_to_time(&now, &t_now);
+	rtc_tm_to_time(&alarm->time, &t_alm);
+	if (t_now < t_alm)
+		goto done;
+
+	switch (missing) {
+
+	/* 24 hour rollover ... if it's now 10am Monday, an alarm that
+	 * that will trigger at 5am will do so at 5am Tuesday, which
+	 * could also be in the next month or year.  This is a common
+	 * case, especially for PCs.
+	 */
+	case day:
+		dev_dbg(&rtc->dev, "alarm rollover: %s\n", "day");
+		t_alm += 24 * 60 * 60;
+		rtc_time_to_tm(t_alm, &alarm->time);
+		break;
+
+	/* Month rollover ... if it's the 31th, an alarm on the 3rd will
+	 * be next month.  An alarm matching on the 30th, 29th, or 28th
+	 * may end up in the month after that!  Many newer PCs support
+	 * this type of alarm.
+	 */
+	case month:
+		dev_dbg(&rtc->dev, "alarm rollover: %s\n", "month");
+		do {
+			if (alarm->time.tm_mon < 11)
+				alarm->time.tm_mon++;
+			else {
+				alarm->time.tm_mon = 0;
+				alarm->time.tm_year++;
+			}
+			days = rtc_month_days(alarm->time.tm_mon,
+					alarm->time.tm_year);
+		} while (days < alarm->time.tm_mday);
+		break;
+
+	/* Year rollover ... easy except for leap years! */
+	case year:
+		dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year");
+		do {
+			alarm->time.tm_year++;
+		} while (rtc_valid_tm(&alarm->time) != 0);
+		break;
+
+	default:
+		dev_warn(&rtc->dev, "alarm rollover not handled\n");
+	}
+
+done:
+	return 0;
+}
+
 int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
 {
 	int err;
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 89c3e51..db3832d 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -227,6 +227,7 @@ extern void rtc_device_unregister(struct rtc_device *rtc);
 extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm);
 extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm);
 extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs);
+int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm);
 extern int rtc_read_alarm(struct rtc_device *rtc,
 			struct rtc_wkalrm *alrm);
 extern int rtc_set_alarm(struct rtc_device *rtc,
-- 
cgit v0.10.2


From 80d4bb515b78f38738f3378fd1be6039063ab040 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Thu, 3 Feb 2011 11:34:50 -0800
Subject: RTC: Cleanup rtc_class_ops->irq_set_state

With PIE mode interrupts now emulated in generic code via an hrtimer,
no one calls rtc_class_ops->irq_set_state(), so this patch removes it
along with driver implementations.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: rtc-linux@googlegroups.com
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index c7ff8df..de632e7 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -400,25 +400,6 @@ static int cmos_irq_set_freq(struct device *dev, int freq)
 	return 0;
 }
 
-static int cmos_irq_set_state(struct device *dev, int enabled)
-{
-	struct cmos_rtc	*cmos = dev_get_drvdata(dev);
-	unsigned long	flags;
-
-	if (!is_valid_irq(cmos->irq))
-		return -ENXIO;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-
-	if (enabled)
-		cmos_irq_enable(cmos, RTC_PIE);
-	else
-		cmos_irq_disable(cmos, RTC_PIE);
-
-	spin_unlock_irqrestore(&rtc_lock, flags);
-	return 0;
-}
-
 static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	struct cmos_rtc	*cmos = dev_get_drvdata(dev);
@@ -502,7 +483,6 @@ static const struct rtc_class_ops cmos_rtc_ops = {
 	.set_alarm		= cmos_set_alarm,
 	.proc			= cmos_procfs,
 	.irq_set_freq		= cmos_irq_set_freq,
-	.irq_set_state		= cmos_irq_set_state,
 	.alarm_irq_enable	= cmos_alarm_irq_enable,
 	.update_irq_enable	= cmos_update_irq_enable,
 };
diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c
index 34647fc..92da73d 100644
--- a/drivers/rtc/rtc-davinci.c
+++ b/drivers/rtc/rtc-davinci.c
@@ -473,39 +473,6 @@ static int davinci_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 	return 0;
 }
 
-static int davinci_rtc_irq_set_state(struct device *dev, int enabled)
-{
-	struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev);
-	unsigned long flags;
-	u8 rtc_ctrl;
-
-	spin_lock_irqsave(&davinci_rtc_lock, flags);
-
-	rtc_ctrl = rtcss_read(davinci_rtc, PRTCSS_RTC_CTRL);
-
-	if (enabled) {
-		while (rtcss_read(davinci_rtc, PRTCSS_RTC_CTRL)
-		       & PRTCSS_RTC_CTRL_WDTBUS)
-			cpu_relax();
-
-		rtc_ctrl |= PRTCSS_RTC_CTRL_TE;
-		rtcss_write(davinci_rtc, rtc_ctrl, PRTCSS_RTC_CTRL);
-
-		rtcss_write(davinci_rtc, 0x0, PRTCSS_RTC_CLKC_CNT);
-
-		rtc_ctrl |= PRTCSS_RTC_CTRL_TIEN |
-			    PRTCSS_RTC_CTRL_TMMD |
-			    PRTCSS_RTC_CTRL_TMRFLG;
-	} else
-		rtc_ctrl &= ~PRTCSS_RTC_CTRL_TIEN;
-
-	rtcss_write(davinci_rtc, rtc_ctrl, PRTCSS_RTC_CTRL);
-
-	spin_unlock_irqrestore(&davinci_rtc_lock, flags);
-
-	return 0;
-}
-
 static int davinci_rtc_irq_set_freq(struct device *dev, int freq)
 {
 	struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev);
@@ -529,7 +496,6 @@ static struct rtc_class_ops davinci_rtc_ops = {
 	.alarm_irq_enable	= davinci_rtc_alarm_irq_enable,
 	.read_alarm		= davinci_rtc_read_alarm,
 	.set_alarm		= davinci_rtc_set_alarm,
-	.irq_set_state		= davinci_rtc_irq_set_state,
 	.irq_set_freq		= davinci_rtc_irq_set_freq,
 };
 
diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c
index 1db62db..4db96fa 100644
--- a/drivers/rtc/rtc-mrst.c
+++ b/drivers/rtc/rtc-mrst.c
@@ -236,25 +236,6 @@ static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 	return 0;
 }
 
-static int mrst_irq_set_state(struct device *dev, int enabled)
-{
-	struct mrst_rtc	*mrst = dev_get_drvdata(dev);
-	unsigned long	flags;
-
-	if (!mrst->irq)
-		return -ENXIO;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-
-	if (enabled)
-		mrst_irq_enable(mrst, RTC_PIE);
-	else
-		mrst_irq_disable(mrst, RTC_PIE);
-
-	spin_unlock_irqrestore(&rtc_lock, flags);
-	return 0;
-}
-
 /* Currently, the vRTC doesn't support UIE ON/OFF */
 static int mrst_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
@@ -301,7 +282,6 @@ static const struct rtc_class_ops mrst_rtc_ops = {
 	.read_alarm	= mrst_read_alarm,
 	.set_alarm	= mrst_set_alarm,
 	.proc		= mrst_procfs,
-	.irq_set_state	= mrst_irq_set_state,
 	.alarm_irq_enable = mrst_rtc_alarm_irq_enable,
 };
 
diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c
index b7a6690..0e7c15b 100644
--- a/drivers/rtc/rtc-pl031.c
+++ b/drivers/rtc/rtc-pl031.c
@@ -293,38 +293,6 @@ static int pl031_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 	return ret;
 }
 
-/* Periodic interrupt is only available in ST variants. */
-static int pl031_irq_set_state(struct device *dev, int enabled)
-{
-	struct pl031_local *ldata = dev_get_drvdata(dev);
-
-	if (enabled == 1) {
-		/* Clear any pending timer interrupt. */
-		writel(RTC_BIT_PI, ldata->base + RTC_ICR);
-
-		writel(readl(ldata->base + RTC_IMSC) | RTC_BIT_PI,
-			ldata->base + RTC_IMSC);
-
-		/* Now start the timer */
-		writel(readl(ldata->base + RTC_TCR) | RTC_TCR_EN,
-			ldata->base + RTC_TCR);
-
-	} else {
-		writel(readl(ldata->base + RTC_IMSC) & (~RTC_BIT_PI),
-			ldata->base + RTC_IMSC);
-
-		/* Also stop the timer */
-		writel(readl(ldata->base + RTC_TCR) & (~RTC_TCR_EN),
-			ldata->base + RTC_TCR);
-	}
-	/* Wait at least 1 RTC32 clock cycle to ensure next access
-	 * to RTC_TCR will succeed.
-	 */
-	udelay(40);
-
-	return 0;
-}
-
 static int pl031_irq_set_freq(struct device *dev, int freq)
 {
 	struct pl031_local *ldata = dev_get_drvdata(dev);
@@ -440,7 +408,6 @@ static struct rtc_class_ops stv1_pl031_ops = {
 	.read_alarm = pl031_read_alarm,
 	.set_alarm = pl031_set_alarm,
 	.alarm_irq_enable = pl031_alarm_irq_enable,
-	.irq_set_state = pl031_irq_set_state,
 	.irq_set_freq = pl031_irq_set_freq,
 };
 
@@ -451,7 +418,6 @@ static struct rtc_class_ops stv2_pl031_ops = {
 	.read_alarm = pl031_stv2_read_alarm,
 	.set_alarm = pl031_stv2_set_alarm,
 	.alarm_irq_enable = pl031_alarm_irq_enable,
-	.irq_set_state = pl031_irq_set_state,
 	.irq_set_freq = pl031_irq_set_freq,
 };
 
diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c
index 29e867a..b216ae5 100644
--- a/drivers/rtc/rtc-pxa.c
+++ b/drivers/rtc/rtc-pxa.c
@@ -223,18 +223,6 @@ static int pxa_periodic_irq_set_freq(struct device *dev, int freq)
 	return 0;
 }
 
-static int pxa_periodic_irq_set_state(struct device *dev, int enabled)
-{
-	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
-
-	if (enabled)
-		rtsr_set_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE);
-	else
-		rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE);
-
-	return 0;
-}
-
 static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
@@ -348,7 +336,6 @@ static const struct rtc_class_ops pxa_rtc_ops = {
 	.alarm_irq_enable = pxa_alarm_irq_enable,
 	.update_irq_enable = pxa_update_irq_enable,
 	.proc = pxa_rtc_proc,
-	.irq_set_state = pxa_periodic_irq_set_state,
 	.irq_set_freq = pxa_periodic_irq_set_freq,
 };
 
diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c
index af32a62..fde172f 100644
--- a/drivers/rtc/rtc-rx8025.c
+++ b/drivers/rtc/rtc-rx8025.c
@@ -424,37 +424,12 @@ static int rx8025_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int rx8025_irq_set_state(struct device *dev, int enabled)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct rx8025_data *rx8025 = i2c_get_clientdata(client);
-	int ctrl1;
-	int err;
-
-	if (client->irq <= 0)
-		return -ENXIO;
-
-	ctrl1 = rx8025->ctrl1 & ~RX8025_BIT_CTRL1_CT;
-	if (enabled)
-		ctrl1 |= RX8025_BIT_CTRL1_CT_1HZ;
-	if (ctrl1 != rx8025->ctrl1) {
-		rx8025->ctrl1 = ctrl1;
-		err = rx8025_write_reg(rx8025->client, RX8025_REG_CTRL1,
-				       rx8025->ctrl1);
-		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
 static struct rtc_class_ops rx8025_rtc_ops = {
 	.read_time = rx8025_get_time,
 	.set_time = rx8025_set_time,
 	.read_alarm = rx8025_read_alarm,
 	.set_alarm = rx8025_set_alarm,
 	.alarm_irq_enable = rx8025_alarm_irq_enable,
-	.irq_set_state  = rx8025_irq_set_state,
 };
 
 /*
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index b80fa28..80fb7e7 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -93,37 +93,6 @@ static int s3c_rtc_setaie(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int s3c_rtc_setpie(struct device *dev, int enabled)
-{
-	unsigned int tmp;
-
-	pr_debug("%s: pie=%d\n", __func__, enabled);
-
-	spin_lock_irq(&s3c_rtc_pie_lock);
-
-	if (s3c_rtc_cpu_type == TYPE_S3C64XX) {
-		tmp = readw(s3c_rtc_base + S3C2410_RTCCON);
-		tmp &= ~S3C64XX_RTCCON_TICEN;
-
-		if (enabled)
-			tmp |= S3C64XX_RTCCON_TICEN;
-
-		writew(tmp, s3c_rtc_base + S3C2410_RTCCON);
-	} else {
-		tmp = readb(s3c_rtc_base + S3C2410_TICNT);
-		tmp &= ~S3C2410_TICNT_ENABLE;
-
-		if (enabled)
-			tmp |= S3C2410_TICNT_ENABLE;
-
-		writeb(tmp, s3c_rtc_base + S3C2410_TICNT);
-	}
-
-	spin_unlock_irq(&s3c_rtc_pie_lock);
-
-	return 0;
-}
-
 static int s3c_rtc_setfreq(struct device *dev, int freq)
 {
 	struct platform_device *pdev = to_platform_device(dev);
@@ -380,7 +349,6 @@ static const struct rtc_class_ops s3c_rtcops = {
 	.read_alarm	= s3c_rtc_getalarm,
 	.set_alarm	= s3c_rtc_setalarm,
 	.irq_set_freq	= s3c_rtc_setfreq,
-	.irq_set_state	= s3c_rtc_setpie,
 	.proc		= s3c_rtc_proc,
 	.alarm_irq_enable = s3c_rtc_setaie,
 };
diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index 5dfe5ff..d47b3fc 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -171,23 +171,6 @@ static int sa1100_irq_set_freq(struct device *dev, int freq)
 
 static int rtc_timer1_count;
 
-static int sa1100_irq_set_state(struct device *dev, int enabled)
-{
-	spin_lock_irq(&sa1100_rtc_lock);
-	if (enabled) {
-		struct rtc_device *rtc = (struct rtc_device *)dev;
-
-		OSMR1 = timer_freq / rtc->irq_freq + OSCR;
-		OIER |= OIER_E1;
-		rtc_timer1_count = 1;
-	} else {
-		OIER &= ~OIER_E1;
-	}
-	spin_unlock_irq(&sa1100_rtc_lock);
-
-	return 0;
-}
-
 static inline int sa1100_timer1_retrigger(struct rtc_device *rtc)
 {
 	unsigned long diff;
@@ -410,7 +393,6 @@ static const struct rtc_class_ops sa1100_rtc_ops = {
 	.set_alarm = sa1100_rtc_set_alarm,
 	.proc = sa1100_rtc_proc,
 	.irq_set_freq = sa1100_irq_set_freq,
-	.irq_set_state = sa1100_irq_set_state,
 	.alarm_irq_enable = sa1100_rtc_alarm_irq_enable,
 };
 
diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c
index 93314a9..ff50a8b 100644
--- a/drivers/rtc/rtc-sh.c
+++ b/drivers/rtc/rtc-sh.c
@@ -603,7 +603,6 @@ static struct rtc_class_ops sh_rtc_ops = {
 	.set_time	= sh_rtc_set_time,
 	.read_alarm	= sh_rtc_read_alarm,
 	.set_alarm	= sh_rtc_set_alarm,
-	.irq_set_state	= sh_rtc_irq_set_state,
 	.irq_set_freq	= sh_rtc_irq_set_freq,
 	.proc		= sh_rtc_proc,
 	.alarm_irq_enable = sh_rtc_alarm_irq_enable,
diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c
index 769190a..86f1490 100644
--- a/drivers/rtc/rtc-vr41xx.c
+++ b/drivers/rtc/rtc-vr41xx.c
@@ -227,16 +227,6 @@ static int vr41xx_rtc_irq_set_freq(struct device *dev, int freq)
 	return 0;
 }
 
-static int vr41xx_rtc_irq_set_state(struct device *dev, int enabled)
-{
-	if (enabled)
-		enable_irq(pie_irq);
-	else
-		disable_irq(pie_irq);
-
-	return 0;
-}
-
 static int vr41xx_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -309,7 +299,6 @@ static const struct rtc_class_ops vr41xx_rtc_ops = {
 	.read_alarm	= vr41xx_rtc_read_alarm,
 	.set_alarm	= vr41xx_rtc_set_alarm,
 	.irq_set_freq	= vr41xx_rtc_irq_set_freq,
-	.irq_set_state	= vr41xx_rtc_irq_set_state,
 };
 
 static int __devinit rtc_probe(struct platform_device *pdev)
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index db3832d..0e2063a 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -148,7 +148,6 @@ struct rtc_class_ops {
 	int (*set_alarm)(struct device *, struct rtc_wkalrm *);
 	int (*proc)(struct device *, struct seq_file *);
 	int (*set_mmss)(struct device *, unsigned long secs);
-	int (*irq_set_state)(struct device *, int enabled);
 	int (*irq_set_freq)(struct device *, int freq);
 	int (*read_callback)(struct device *, int data);
 	int (*alarm_irq_enable)(struct device *, unsigned int enabled);
-- 
cgit v0.10.2


From 696160fec162601d06940862b5b3aa4460344c1b Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Thu, 3 Feb 2011 12:02:07 -0800
Subject: RTC: Cleanup rtc_class_ops->irq_set_freq()

With the generic rtc code now emulating PIE mode irqs via an
hrtimer, no one calls the rtc_class_ops->irq_set_freq call.

This patch removes the hook and deletes the driver functions
if no one else calls them.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: rtc-linux@googlegroups.com
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index de632e7..bdb1f8e 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -375,31 +375,6 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 	return 0;
 }
 
-static int cmos_irq_set_freq(struct device *dev, int freq)
-{
-	struct cmos_rtc	*cmos = dev_get_drvdata(dev);
-	int		f;
-	unsigned long	flags;
-
-	if (!is_valid_irq(cmos->irq))
-		return -ENXIO;
-
-	if (!is_power_of_2(freq))
-		return -EINVAL;
-	/* 0 = no irqs; 1 = 2^15 Hz ... 15 = 2^0 Hz */
-	f = ffs(freq);
-	if (f-- > 16)
-		return -EINVAL;
-	f = 16 - f;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-	hpet_set_periodic_freq(freq);
-	CMOS_WRITE(RTC_REF_CLCK_32KHZ | f, RTC_FREQ_SELECT);
-	spin_unlock_irqrestore(&rtc_lock, flags);
-
-	return 0;
-}
-
 static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	struct cmos_rtc	*cmos = dev_get_drvdata(dev);
@@ -482,7 +457,6 @@ static const struct rtc_class_ops cmos_rtc_ops = {
 	.read_alarm		= cmos_read_alarm,
 	.set_alarm		= cmos_set_alarm,
 	.proc			= cmos_procfs,
-	.irq_set_freq		= cmos_irq_set_freq,
 	.alarm_irq_enable	= cmos_alarm_irq_enable,
 	.update_irq_enable	= cmos_update_irq_enable,
 };
diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c
index 92da73d..dfd98a2 100644
--- a/drivers/rtc/rtc-davinci.c
+++ b/drivers/rtc/rtc-davinci.c
@@ -473,22 +473,6 @@ static int davinci_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 	return 0;
 }
 
-static int davinci_rtc_irq_set_freq(struct device *dev, int freq)
-{
-	struct davinci_rtc *davinci_rtc = dev_get_drvdata(dev);
-	unsigned long flags;
-	u16 tmr_counter = (0x8000 >> (ffs(freq) - 1));
-
-	spin_lock_irqsave(&davinci_rtc_lock, flags);
-
-	rtcss_write(davinci_rtc, tmr_counter & 0xFF, PRTCSS_RTC_TMR0);
-	rtcss_write(davinci_rtc, (tmr_counter & 0xFF00) >> 8, PRTCSS_RTC_TMR1);
-
-	spin_unlock_irqrestore(&davinci_rtc_lock, flags);
-
-	return 0;
-}
-
 static struct rtc_class_ops davinci_rtc_ops = {
 	.ioctl			= davinci_rtc_ioctl,
 	.read_time		= davinci_rtc_read_time,
@@ -496,7 +480,6 @@ static struct rtc_class_ops davinci_rtc_ops = {
 	.alarm_irq_enable	= davinci_rtc_alarm_irq_enable,
 	.read_alarm		= davinci_rtc_read_alarm,
 	.set_alarm		= davinci_rtc_set_alarm,
-	.irq_set_freq		= davinci_rtc_irq_set_freq,
 };
 
 static int __init davinci_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c
index 0e7c15b..d829ea6 100644
--- a/drivers/rtc/rtc-pl031.c
+++ b/drivers/rtc/rtc-pl031.c
@@ -293,25 +293,6 @@ static int pl031_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 	return ret;
 }
 
-static int pl031_irq_set_freq(struct device *dev, int freq)
-{
-	struct pl031_local *ldata = dev_get_drvdata(dev);
-
-	/* Cant set timer if it is already enabled */
-	if (readl(ldata->base + RTC_TCR) & RTC_TCR_EN) {
-		dev_err(dev, "can't change frequency while timer enabled\n");
-		return -EINVAL;
-	}
-
-	/* If self start bit in RTC_TCR is set timer will start here,
-	 * but we never set that bit. Instead we start the timer when
-	 * set_state is called with enabled == 1.
-	 */
-	writel(RTC_TIMER_FREQ / freq, ldata->base + RTC_TLR);
-
-	return 0;
-}
-
 static int pl031_remove(struct amba_device *adev)
 {
 	struct pl031_local *ldata = dev_get_drvdata(&adev->dev);
@@ -408,7 +389,6 @@ static struct rtc_class_ops stv1_pl031_ops = {
 	.read_alarm = pl031_read_alarm,
 	.set_alarm = pl031_set_alarm,
 	.alarm_irq_enable = pl031_alarm_irq_enable,
-	.irq_set_freq = pl031_irq_set_freq,
 };
 
 /* And the second ST derivative */
@@ -418,7 +398,6 @@ static struct rtc_class_ops stv2_pl031_ops = {
 	.read_alarm = pl031_stv2_read_alarm,
 	.set_alarm = pl031_stv2_set_alarm,
 	.alarm_irq_enable = pl031_alarm_irq_enable,
-	.irq_set_freq = pl031_irq_set_freq,
 };
 
 static struct amba_id pl031_ids[] = {
diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c
index b216ae5..a1fdc80 100644
--- a/drivers/rtc/rtc-pxa.c
+++ b/drivers/rtc/rtc-pxa.c
@@ -209,20 +209,6 @@ static void pxa_rtc_release(struct device *dev)
 	free_irq(pxa_rtc->irq_1Hz, dev);
 }
 
-static int pxa_periodic_irq_set_freq(struct device *dev, int freq)
-{
-	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
-	int period_ms;
-
-	if (freq < 1 || freq > MAXFREQ_PERIODIC)
-		return -EINVAL;
-
-	period_ms = 1000 / freq;
-	rtc_writel(pxa_rtc, PIAR, period_ms);
-
-	return 0;
-}
-
 static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
@@ -336,7 +322,6 @@ static const struct rtc_class_ops pxa_rtc_ops = {
 	.alarm_irq_enable = pxa_alarm_irq_enable,
 	.update_irq_enable = pxa_update_irq_enable,
 	.proc = pxa_rtc_proc,
-	.irq_set_freq = pxa_periodic_irq_set_freq,
 };
 
 static int __init pxa_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index 80fb7e7..7149649 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -348,7 +348,6 @@ static const struct rtc_class_ops s3c_rtcops = {
 	.set_time	= s3c_rtc_settime,
 	.read_alarm	= s3c_rtc_getalarm,
 	.set_alarm	= s3c_rtc_setalarm,
-	.irq_set_freq	= s3c_rtc_setfreq,
 	.proc		= s3c_rtc_proc,
 	.alarm_irq_enable = s3c_rtc_setaie,
 };
diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index d47b3fc..d1a2b0b 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -392,7 +392,6 @@ static const struct rtc_class_ops sa1100_rtc_ops = {
 	.read_alarm = sa1100_rtc_read_alarm,
 	.set_alarm = sa1100_rtc_set_alarm,
 	.proc = sa1100_rtc_proc,
-	.irq_set_freq = sa1100_irq_set_freq,
 	.alarm_irq_enable = sa1100_rtc_alarm_irq_enable,
 };
 
diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c
index ff50a8b..1485449 100644
--- a/drivers/rtc/rtc-sh.c
+++ b/drivers/rtc/rtc-sh.c
@@ -603,7 +603,6 @@ static struct rtc_class_ops sh_rtc_ops = {
 	.set_time	= sh_rtc_set_time,
 	.read_alarm	= sh_rtc_read_alarm,
 	.set_alarm	= sh_rtc_set_alarm,
-	.irq_set_freq	= sh_rtc_irq_set_freq,
 	.proc		= sh_rtc_proc,
 	.alarm_irq_enable = sh_rtc_alarm_irq_enable,
 };
diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c
index 86f1490..c5698cd 100644
--- a/drivers/rtc/rtc-vr41xx.c
+++ b/drivers/rtc/rtc-vr41xx.c
@@ -207,26 +207,6 @@ static int vr41xx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
 	return 0;
 }
 
-static int vr41xx_rtc_irq_set_freq(struct device *dev, int freq)
-{
-	u64 count;
-
-	if (!is_power_of_2(freq))
-		return -EINVAL;
-	count = RTC_FREQUENCY;
-	do_div(count, freq);
-
-	spin_lock_irq(&rtc_lock);
-
-	periodic_count = count;
-	rtc1_write(RTCL1LREG, periodic_count);
-	rtc1_write(RTCL1HREG, periodic_count >> 16);
-
-	spin_unlock_irq(&rtc_lock);
-
-	return 0;
-}
-
 static int vr41xx_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -298,7 +278,6 @@ static const struct rtc_class_ops vr41xx_rtc_ops = {
 	.set_time	= vr41xx_rtc_set_time,
 	.read_alarm	= vr41xx_rtc_read_alarm,
 	.set_alarm	= vr41xx_rtc_set_alarm,
-	.irq_set_freq	= vr41xx_rtc_irq_set_freq,
 };
 
 static int __devinit rtc_probe(struct platform_device *pdev)
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 0e2063a..741a51c 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -133,7 +133,6 @@ extern struct class *rtc_class;
  * The (current) exceptions are mostly filesystem hooks:
  *   - the proc() hook for procfs
  *   - non-ioctl() chardev hooks:  open(), release(), read_callback()
- *   - periodic irq calls:  irq_set_state(), irq_set_freq()
  *
  * REVISIT those periodic irq calls *do* have ops_lock when they're
  * issued through ioctl() ...
@@ -148,7 +147,6 @@ struct rtc_class_ops {
 	int (*set_alarm)(struct device *, struct rtc_wkalrm *);
 	int (*proc)(struct device *, struct seq_file *);
 	int (*set_mmss)(struct device *, unsigned long secs);
-	int (*irq_set_freq)(struct device *, int freq);
 	int (*read_callback)(struct device *, int data);
 	int (*alarm_irq_enable)(struct device *, unsigned int enabled);
 	int (*update_irq_enable)(struct device *, unsigned int enabled);
-- 
cgit v0.10.2


From 51ba60c5bb3b0f71bee26404ddc22d8e4109e88a Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Thu, 3 Feb 2011 12:13:50 -0800
Subject: RTC: Cleanup rtc_class_ops->update_irq_enable()

Now that the generic code handles UIE mode irqs via periodic
alarm interrupts, no one calls the
rtc_class_ops->update_irq_enable() method anymore.

This patch removes the driver hooks and implementations of
update_irq_enable if no one else is calling it.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: rtc-linux@googlegroups.com
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index bdb1f8e..dc2a0ba 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -394,25 +394,6 @@ static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int cmos_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct cmos_rtc	*cmos = dev_get_drvdata(dev);
-	unsigned long	flags;
-
-	if (!is_valid_irq(cmos->irq))
-		return -EINVAL;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-
-	if (enabled)
-		cmos_irq_enable(cmos, RTC_UIE);
-	else
-		cmos_irq_disable(cmos, RTC_UIE);
-
-	spin_unlock_irqrestore(&rtc_lock, flags);
-	return 0;
-}
-
 #if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE)
 
 static int cmos_procfs(struct device *dev, struct seq_file *seq)
@@ -458,7 +439,6 @@ static const struct rtc_class_ops cmos_rtc_ops = {
 	.set_alarm		= cmos_set_alarm,
 	.proc			= cmos_procfs,
 	.alarm_irq_enable	= cmos_alarm_irq_enable,
-	.update_irq_enable	= cmos_update_irq_enable,
 };
 
 /*----------------------------------------------------------------*/
diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c
index 37268e9..3fffd70 100644
--- a/drivers/rtc/rtc-ds1511.c
+++ b/drivers/rtc/rtc-ds1511.c
@@ -397,29 +397,12 @@ static int ds1511_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int ds1511_rtc_update_irq_enable(struct device *dev,
-	unsigned int enabled)
-{
-	struct platform_device *pdev = to_platform_device(dev);
-	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-
-	if (pdata->irq <= 0)
-		return -EINVAL;
-	if (enabled)
-		pdata->irqen |= RTC_UF;
-	else
-		pdata->irqen &= ~RTC_UF;
-	ds1511_rtc_update_alarm(pdata);
-	return 0;
-}
-
 static const struct rtc_class_ops ds1511_rtc_ops = {
 	.read_time		= ds1511_rtc_read_time,
 	.set_time		= ds1511_rtc_set_time,
 	.read_alarm		= ds1511_rtc_read_alarm,
 	.set_alarm		= ds1511_rtc_set_alarm,
 	.alarm_irq_enable	= ds1511_rtc_alarm_irq_enable,
-	.update_irq_enable	= ds1511_rtc_update_irq_enable,
 };
 
  static ssize_t
diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c
index ff432e2..fee41b9 100644
--- a/drivers/rtc/rtc-ds1553.c
+++ b/drivers/rtc/rtc-ds1553.c
@@ -227,29 +227,12 @@ static int ds1553_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int ds1553_rtc_update_irq_enable(struct device *dev,
-	unsigned int enabled)
-{
-	struct platform_device *pdev = to_platform_device(dev);
-	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-
-	if (pdata->irq <= 0)
-		return -EINVAL;
-	if (enabled)
-		pdata->irqen |= RTC_UF;
-	else
-		pdata->irqen &= ~RTC_UF;
-	ds1553_rtc_update_alarm(pdata);
-	return 0;
-}
-
 static const struct rtc_class_ops ds1553_rtc_ops = {
 	.read_time		= ds1553_rtc_read_time,
 	.set_time		= ds1553_rtc_set_time,
 	.read_alarm		= ds1553_rtc_read_alarm,
 	.set_alarm		= ds1553_rtc_set_alarm,
 	.alarm_irq_enable	= ds1553_rtc_alarm_irq_enable,
-	.update_irq_enable	= ds1553_rtc_update_irq_enable,
 };
 
 static ssize_t ds1553_nvram_read(struct file *filp, struct kobject *kobj,
diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c
index 9507354..27b7bf6 100644
--- a/drivers/rtc/rtc-ds3232.c
+++ b/drivers/rtc/rtc-ds3232.c
@@ -339,23 +339,6 @@ static int ds3232_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int ds3232_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct i2c_client *client = to_i2c_client(dev);
-	struct ds3232 *ds3232 = i2c_get_clientdata(client);
-
-	if (client->irq <= 0)
-		return -EINVAL;
-
-	if (enabled)
-		ds3232->rtc->irq_data |= RTC_UF;
-	else
-		ds3232->rtc->irq_data &= ~RTC_UF;
-
-	ds3232_update_alarm(client);
-	return 0;
-}
-
 static irqreturn_t ds3232_irq(int irq, void *dev_id)
 {
 	struct i2c_client *client = dev_id;
@@ -406,7 +389,6 @@ static const struct rtc_class_ops ds3232_rtc_ops = {
 	.read_alarm = ds3232_read_alarm,
 	.set_alarm = ds3232_set_alarm,
 	.alarm_irq_enable = ds3232_alarm_irq_enable,
-	.update_irq_enable = ds3232_update_irq_enable,
 };
 
 static int __devinit ds3232_probe(struct i2c_client *client,
diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c
index 2e16f72..b647363 100644
--- a/drivers/rtc/rtc-jz4740.c
+++ b/drivers/rtc/rtc-jz4740.c
@@ -168,12 +168,6 @@ static int jz4740_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 	return ret;
 }
 
-static int jz4740_rtc_update_irq_enable(struct device *dev, unsigned int enable)
-{
-	struct jz4740_rtc *rtc = dev_get_drvdata(dev);
-	return jz4740_rtc_ctrl_set_bits(rtc, JZ_RTC_CTRL_1HZ_IRQ, enable);
-}
-
 static int jz4740_rtc_alarm_irq_enable(struct device *dev, unsigned int enable)
 {
 	struct jz4740_rtc *rtc = dev_get_drvdata(dev);
@@ -185,7 +179,6 @@ static struct rtc_class_ops jz4740_rtc_ops = {
 	.set_mmss	= jz4740_rtc_set_mmss,
 	.read_alarm	= jz4740_rtc_read_alarm,
 	.set_alarm	= jz4740_rtc_set_alarm,
-	.update_irq_enable = jz4740_rtc_update_irq_enable,
 	.alarm_irq_enable = jz4740_rtc_alarm_irq_enable,
 };
 
diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c
index 5314b15..c420064 100644
--- a/drivers/rtc/rtc-mc13xxx.c
+++ b/drivers/rtc/rtc-mc13xxx.c
@@ -282,12 +282,6 @@ static irqreturn_t mc13xxx_rtc_update_handler(int irq, void *dev)
 	return IRQ_HANDLED;
 }
 
-static int mc13xxx_rtc_update_irq_enable(struct device *dev,
-		unsigned int enabled)
-{
-	return mc13xxx_rtc_irq_enable(dev, enabled, MC13XXX_IRQ_1HZ);
-}
-
 static int mc13xxx_rtc_alarm_irq_enable(struct device *dev,
 		unsigned int enabled)
 {
@@ -300,7 +294,6 @@ static const struct rtc_class_ops mc13xxx_rtc_ops = {
 	.read_alarm = mc13xxx_rtc_read_alarm,
 	.set_alarm = mc13xxx_rtc_set_alarm,
 	.alarm_irq_enable = mc13xxx_rtc_alarm_irq_enable,
-	.update_irq_enable = mc13xxx_rtc_update_irq_enable,
 };
 
 static irqreturn_t mc13xxx_rtc_reset_handler(int irq, void *dev)
diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c
index dfcdf09..b40c1ff 100644
--- a/drivers/rtc/rtc-mpc5121.c
+++ b/drivers/rtc/rtc-mpc5121.c
@@ -240,32 +240,12 @@ static int mpc5121_rtc_alarm_irq_enable(struct device *dev,
 	return 0;
 }
 
-static int mpc5121_rtc_update_irq_enable(struct device *dev,
-					 unsigned int enabled)
-{
-	struct mpc5121_rtc_data *rtc = dev_get_drvdata(dev);
-	struct mpc5121_rtc_regs __iomem *regs = rtc->regs;
-	int val;
-
-	val = in_8(&regs->int_enable);
-
-	if (enabled)
-		val = (val & ~0x8) | 0x1;
-	else
-		val &= ~0x1;
-
-	out_8(&regs->int_enable, val);
-
-	return 0;
-}
-
 static const struct rtc_class_ops mpc5121_rtc_ops = {
 	.read_time = mpc5121_rtc_read_time,
 	.set_time = mpc5121_rtc_set_time,
 	.read_alarm = mpc5121_rtc_read_alarm,
 	.set_alarm = mpc5121_rtc_set_alarm,
 	.alarm_irq_enable = mpc5121_rtc_alarm_irq_enable,
-	.update_irq_enable = mpc5121_rtc_update_irq_enable,
 };
 
 static int __devinit mpc5121_rtc_probe(struct platform_device *op,
diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c
index 0b06c1e..826ab64 100644
--- a/drivers/rtc/rtc-mxc.c
+++ b/drivers/rtc/rtc-mxc.c
@@ -274,12 +274,6 @@ static int mxc_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int mxc_rtc_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	mxc_rtc_irq_enable(dev, RTC_1HZ_BIT, enabled);
-	return 0;
-}
-
 /*
  * This function reads the current RTC time into tm in Gregorian date.
  */
@@ -368,7 +362,6 @@ static struct rtc_class_ops mxc_rtc_ops = {
 	.read_alarm		= mxc_rtc_read_alarm,
 	.set_alarm		= mxc_rtc_set_alarm,
 	.alarm_irq_enable	= mxc_rtc_alarm_irq_enable,
-	.update_irq_enable	= mxc_rtc_update_irq_enable,
 };
 
 static int __init mxc_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c
index ddb0857..781068d 100644
--- a/drivers/rtc/rtc-nuc900.c
+++ b/drivers/rtc/rtc-nuc900.c
@@ -134,20 +134,6 @@ static void nuc900_rtc_bin2bcd(struct device *dev, struct rtc_time *settm,
 	gettm->bcd_hour = bin2bcd(settm->tm_hour) << 16;
 }
 
-static int nuc900_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct nuc900_rtc *rtc = dev_get_drvdata(dev);
-
-	if (enabled)
-		__raw_writel(__raw_readl(rtc->rtc_reg + REG_RTC_RIER)|
-				(TICKINTENB), rtc->rtc_reg + REG_RTC_RIER);
-	else
-		__raw_writel(__raw_readl(rtc->rtc_reg + REG_RTC_RIER)&
-				(~TICKINTENB), rtc->rtc_reg + REG_RTC_RIER);
-
-	return 0;
-}
-
 static int nuc900_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	struct nuc900_rtc *rtc = dev_get_drvdata(dev);
@@ -234,7 +220,6 @@ static struct rtc_class_ops nuc900_rtc_ops = {
 	.read_alarm = nuc900_rtc_read_alarm,
 	.set_alarm = nuc900_rtc_set_alarm,
 	.alarm_irq_enable = nuc900_alarm_irq_enable,
-	.update_irq_enable = nuc900_update_irq_enable,
 };
 
 static int __devinit nuc900_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c
index 25c0b3f..a633abc 100644
--- a/drivers/rtc/rtc-pcap.c
+++ b/drivers/rtc/rtc-pcap.c
@@ -131,18 +131,12 @@ static int pcap_rtc_alarm_irq_enable(struct device *dev, unsigned int en)
 	return pcap_rtc_irq_enable(dev, PCAP_IRQ_TODA, en);
 }
 
-static int pcap_rtc_update_irq_enable(struct device *dev, unsigned int en)
-{
-	return pcap_rtc_irq_enable(dev, PCAP_IRQ_1HZ, en);
-}
-
 static const struct rtc_class_ops pcap_rtc_ops = {
 	.read_time = pcap_rtc_read_time,
 	.read_alarm = pcap_rtc_read_alarm,
 	.set_alarm = pcap_rtc_set_alarm,
 	.set_mmss = pcap_rtc_set_mmss,
 	.alarm_irq_enable = pcap_rtc_alarm_irq_enable,
-	.update_irq_enable = pcap_rtc_update_irq_enable,
 };
 
 static int __devinit pcap_rtc_probe(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-pcf50633.c b/drivers/rtc/rtc-pcf50633.c
index 16edf94..f90c574 100644
--- a/drivers/rtc/rtc-pcf50633.c
+++ b/drivers/rtc/rtc-pcf50633.c
@@ -106,25 +106,6 @@ pcf50633_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int
-pcf50633_rtc_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct pcf50633_rtc *rtc = dev_get_drvdata(dev);
-	int err;
-
-	if (enabled)
-		err = pcf50633_irq_unmask(rtc->pcf, PCF50633_IRQ_SECOND);
-	else
-		err = pcf50633_irq_mask(rtc->pcf, PCF50633_IRQ_SECOND);
-
-	if (err < 0)
-		return err;
-
-	rtc->second_enabled = enabled;
-
-	return 0;
-}
-
 static int pcf50633_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
 	struct pcf50633_rtc *rtc;
@@ -262,8 +243,7 @@ static struct rtc_class_ops pcf50633_rtc_ops = {
 	.set_time		= pcf50633_rtc_set_time,
 	.read_alarm		= pcf50633_rtc_read_alarm,
 	.set_alarm		= pcf50633_rtc_set_alarm,
-	.alarm_irq_enable 	= pcf50633_rtc_alarm_irq_enable,
-	.update_irq_enable 	= pcf50633_rtc_update_irq_enable,
+	.alarm_irq_enable	= pcf50633_rtc_alarm_irq_enable,
 };
 
 static void pcf50633_rtc_irq(int irq, void *data)
diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c
index a1fdc80..fc9f499 100644
--- a/drivers/rtc/rtc-pxa.c
+++ b/drivers/rtc/rtc-pxa.c
@@ -224,21 +224,6 @@ static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int pxa_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
-
-	spin_lock_irq(&pxa_rtc->lock);
-
-	if (enabled)
-		rtsr_set_bits(pxa_rtc, RTSR_HZE);
-	else
-		rtsr_clear_bits(pxa_rtc, RTSR_HZE);
-
-	spin_unlock_irq(&pxa_rtc->lock);
-	return 0;
-}
-
 static int pxa_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
 	struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
@@ -320,7 +305,6 @@ static const struct rtc_class_ops pxa_rtc_ops = {
 	.read_alarm = pxa_rtc_read_alarm,
 	.set_alarm = pxa_rtc_set_alarm,
 	.alarm_irq_enable = pxa_alarm_irq_enable,
-	.update_irq_enable = pxa_update_irq_enable,
 	.proc = pxa_rtc_proc,
 };
 
diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c
index 7e7d0c8..572e953 100644
--- a/drivers/rtc/rtc-stmp3xxx.c
+++ b/drivers/rtc/rtc-stmp3xxx.c
@@ -115,19 +115,6 @@ static int stmp3xxx_alarm_irq_enable(struct device *dev, unsigned int enabled)
 	return 0;
 }
 
-static int stmp3xxx_update_irq_enable(struct device *dev, unsigned int enabled)
-{
-	struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev);
-
-	if (enabled)
-		stmp3xxx_setl(BM_RTC_CTRL_ONEMSEC_IRQ_EN,
-				rtc_data->io + HW_RTC_CTRL);
-	else
-		stmp3xxx_clearl(BM_RTC_CTRL_ONEMSEC_IRQ_EN,
-				rtc_data->io + HW_RTC_CTRL);
-	return 0;
-}
-
 static int stmp3xxx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
 {
 	struct stmp3xxx_rtc_data *rtc_data = dev_get_drvdata(dev);
@@ -149,8 +136,6 @@ static int stmp3xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 static struct rtc_class_ops stmp3xxx_rtc_ops = {
 	.alarm_irq_enable =
 			  stmp3xxx_alarm_irq_enable,
-	.update_irq_enable =
-			  stmp3xxx_update_irq_enable,
 	.read_time	= stmp3xxx_rtc_gettime,
 	.set_mmss	= stmp3xxx_rtc_set_mmss,
 	.read_alarm	= stmp3xxx_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c
index ed1b868..f9a2799 100644
--- a/drivers/rtc/rtc-twl.c
+++ b/drivers/rtc/rtc-twl.c
@@ -213,18 +213,6 @@ static int twl_rtc_alarm_irq_enable(struct device *dev, unsigned enabled)
 	return ret;
 }
 
-static int twl_rtc_update_irq_enable(struct device *dev, unsigned enabled)
-{
-	int ret;
-
-	if (enabled)
-		ret = set_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_TIMER_M);
-	else
-		ret = mask_rtc_irq_bit(BIT_RTC_INTERRUPTS_REG_IT_TIMER_M);
-
-	return ret;
-}
-
 /*
  * Gets current TWL RTC time and date parameters.
  *
@@ -433,7 +421,6 @@ static struct rtc_class_ops twl_rtc_ops = {
 	.read_alarm	= twl_rtc_read_alarm,
 	.set_alarm	= twl_rtc_set_alarm,
 	.alarm_irq_enable = twl_rtc_alarm_irq_enable,
-	.update_irq_enable = twl_rtc_update_irq_enable,
 };
 
 /*----------------------------------------------------------------------*/
diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c
index 82931dc..bdc909b 100644
--- a/drivers/rtc/rtc-wm831x.c
+++ b/drivers/rtc/rtc-wm831x.c
@@ -315,21 +315,6 @@ static int wm831x_rtc_alarm_irq_enable(struct device *dev,
 		return wm831x_rtc_stop_alarm(wm831x_rtc);
 }
 
-static int wm831x_rtc_update_irq_enable(struct device *dev,
-					unsigned int enabled)
-{
-	struct wm831x_rtc *wm831x_rtc = dev_get_drvdata(dev);
-	int val;
-
-	if (enabled)
-		val = 1 << WM831X_RTC_PINT_FREQ_SHIFT;
-	else
-		val = 0;
-
-	return wm831x_set_bits(wm831x_rtc->wm831x, WM831X_RTC_CONTROL,
-			       WM831X_RTC_PINT_FREQ_MASK, val);
-}
-
 static irqreturn_t wm831x_alm_irq(int irq, void *data)
 {
 	struct wm831x_rtc *wm831x_rtc = data;
@@ -354,7 +339,6 @@ static const struct rtc_class_ops wm831x_rtc_ops = {
 	.read_alarm = wm831x_rtc_readalarm,
 	.set_alarm = wm831x_rtc_setalarm,
 	.alarm_irq_enable = wm831x_rtc_alarm_irq_enable,
-	.update_irq_enable = wm831x_rtc_update_irq_enable,
 };
 
 #ifdef CONFIG_PM
diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c
index 3d0dc76..6642142 100644
--- a/drivers/rtc/rtc-wm8350.c
+++ b/drivers/rtc/rtc-wm8350.c
@@ -302,26 +302,6 @@ static int wm8350_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	return ret;
 }
 
-static int wm8350_rtc_update_irq_enable(struct device *dev,
-					unsigned int enabled)
-{
-	struct wm8350 *wm8350 = dev_get_drvdata(dev);
-
-	/* Suppress duplicate changes since genirq nests enable and
-	 * disable calls. */
-	if (enabled == wm8350->rtc.update_enabled)
-		return 0;
-
-	if (enabled)
-		wm8350_unmask_irq(wm8350, WM8350_IRQ_RTC_SEC);
-	else
-		wm8350_mask_irq(wm8350, WM8350_IRQ_RTC_SEC);
-
-	wm8350->rtc.update_enabled = enabled;
-
-	return 0;
-}
-
 static irqreturn_t wm8350_rtc_alarm_handler(int irq, void *data)
 {
 	struct wm8350 *wm8350 = data;
@@ -357,7 +337,6 @@ static const struct rtc_class_ops wm8350_rtc_ops = {
 	.read_alarm = wm8350_rtc_readalarm,
 	.set_alarm = wm8350_rtc_setalarm,
 	.alarm_irq_enable = wm8350_rtc_alarm_irq_enable,
-	.update_irq_enable = wm8350_rtc_update_irq_enable,
 };
 
 #ifdef CONFIG_PM
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 741a51c..2ca7e8a 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -149,7 +149,6 @@ struct rtc_class_ops {
 	int (*set_mmss)(struct device *, unsigned long secs);
 	int (*read_callback)(struct device *, int data);
 	int (*alarm_irq_enable)(struct device *, unsigned int enabled);
-	int (*update_irq_enable)(struct device *, unsigned int enabled);
 };
 
 #define RTC_DEVICE_NAME_SIZE 20
-- 
cgit v0.10.2


From e428c6a2772bcf6b022baf7c8267cca3634c0c3e Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 4 Feb 2011 16:16:12 -0800
Subject: RTC: Clean out UIE icotl implementations

With the generic RTC rework, the UIE mode irqs are handled
in the generic layer, and only hardware specific ioctls
get passed down to the rtc driver layer.

So this patch removes the UIE mode ioctl handling in the rtc
driver layer, which never get used.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: rtc-linux@googlegroups.com
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c
index 26d1cf5..518a76e 100644
--- a/drivers/rtc/rtc-at91rm9200.c
+++ b/drivers/rtc/rtc-at91rm9200.c
@@ -183,33 +183,6 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	return 0;
 }
 
-/*
- * Handle commands from user-space
- */
-static int at91_rtc_ioctl(struct device *dev, unsigned int cmd,
-			unsigned long arg)
-{
-	int ret = 0;
-
-	pr_debug("%s(): cmd=%08x, arg=%08lx.\n", __func__, cmd, arg);
-
-	/* important:  scrub old status before enabling IRQs */
-	switch (cmd) {
-	case RTC_UIE_OFF:	/* update off */
-		at91_sys_write(AT91_RTC_IDR, AT91_RTC_SECEV);
-		break;
-	case RTC_UIE_ON:	/* update on */
-		at91_sys_write(AT91_RTC_SCCR, AT91_RTC_SECEV);
-		at91_sys_write(AT91_RTC_IER, AT91_RTC_SECEV);
-		break;
-	default:
-		ret = -ENOIOCTLCMD;
-		break;
-	}
-
-	return ret;
-}
-
 static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	pr_debug("%s(): cmd=%08x\n", __func__, enabled);
@@ -269,7 +242,6 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *dev_id)
 }
 
 static const struct rtc_class_ops at91_rtc_ops = {
-	.ioctl		= at91_rtc_ioctl,
 	.read_time	= at91_rtc_readtime,
 	.set_time	= at91_rtc_settime,
 	.read_alarm	= at91_rtc_readalarm,
diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c
index 5469c52..a3ad957 100644
--- a/drivers/rtc/rtc-at91sam9.c
+++ b/drivers/rtc/rtc-at91sam9.c
@@ -216,33 +216,6 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	return 0;
 }
 
-/*
- * Handle commands from user-space
- */
-static int at91_rtc_ioctl(struct device *dev, unsigned int cmd,
-			unsigned long arg)
-{
-	struct sam9_rtc *rtc = dev_get_drvdata(dev);
-	int ret = 0;
-	u32 mr = rtt_readl(rtc, MR);
-
-	dev_dbg(dev, "ioctl: cmd=%08x, arg=%08lx, mr %08x\n", cmd, arg, mr);
-
-	switch (cmd) {
-	case RTC_UIE_OFF:		/* update off */
-		rtt_writel(rtc, MR, mr & ~AT91_RTT_RTTINCIEN);
-		break;
-	case RTC_UIE_ON:		/* update on */
-		rtt_writel(rtc, MR, mr | AT91_RTT_RTTINCIEN);
-		break;
-	default:
-		ret = -ENOIOCTLCMD;
-		break;
-	}
-
-	return ret;
-}
-
 static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	struct sam9_rtc *rtc = dev_get_drvdata(dev);
@@ -303,7 +276,6 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *_rtc)
 }
 
 static const struct rtc_class_ops at91_rtc_ops = {
-	.ioctl		= at91_rtc_ioctl,
 	.read_time	= at91_rtc_readtime,
 	.set_time	= at91_rtc_settime,
 	.read_alarm	= at91_rtc_readalarm,
diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c
index 17971d9..ca9cff8 100644
--- a/drivers/rtc/rtc-bfin.c
+++ b/drivers/rtc/rtc-bfin.c
@@ -240,32 +240,6 @@ static void bfin_rtc_int_set_alarm(struct bfin_rtc *rtc)
 	 */
 	bfin_rtc_int_set(rtc->rtc_alarm.tm_yday == -1 ? RTC_ISTAT_ALARM : RTC_ISTAT_ALARM_DAY);
 }
-static int bfin_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
-{
-	struct bfin_rtc *rtc = dev_get_drvdata(dev);
-	int ret = 0;
-
-	dev_dbg_stamp(dev);
-
-	bfin_rtc_sync_pending(dev);
-
-	switch (cmd) {
-	case RTC_UIE_ON:
-		dev_dbg_stamp(dev);
-		bfin_rtc_int_set(RTC_ISTAT_SEC);
-		break;
-	case RTC_UIE_OFF:
-		dev_dbg_stamp(dev);
-		bfin_rtc_int_clear(~RTC_ISTAT_SEC);
-		break;
-
-	default:
-		dev_dbg_stamp(dev);
-		ret = -ENOIOCTLCMD;
-	}
-
-	return ret;
-}
 
 static int bfin_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
@@ -358,7 +332,6 @@ static int bfin_rtc_proc(struct device *dev, struct seq_file *seq)
 }
 
 static struct rtc_class_ops bfin_rtc_ops = {
-	.ioctl         = bfin_rtc_ioctl,
 	.read_time     = bfin_rtc_read_time,
 	.set_time      = bfin_rtc_set_time,
 	.read_alarm    = bfin_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c
index dfd98a2..8d46838 100644
--- a/drivers/rtc/rtc-davinci.c
+++ b/drivers/rtc/rtc-davinci.c
@@ -231,10 +231,6 @@ davinci_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 	case RTC_WIE_OFF:
 		rtc_ctrl &= ~PRTCSS_RTC_CTRL_WEN;
 		break;
-	case RTC_UIE_OFF:
-	case RTC_UIE_ON:
-		ret = -ENOTTY;
-		break;
 	default:
 		ret = -ENOIOCTLCMD;
 	}
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
index b4dbf3a3..de0dd7b 100644
--- a/drivers/rtc/rtc-omap.c
+++ b/drivers/rtc/rtc-omap.c
@@ -135,44 +135,6 @@ static irqreturn_t rtc_irq(int irq, void *rtc)
 	return IRQ_HANDLED;
 }
 
-#ifdef	CONFIG_RTC_INTF_DEV
-
-static int
-omap_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
-{
-	u8 reg;
-
-	switch (cmd) {
-	case RTC_UIE_OFF:
-	case RTC_UIE_ON:
-		break;
-	default:
-		return -ENOIOCTLCMD;
-	}
-
-	local_irq_disable();
-	rtc_wait_not_busy();
-	reg = rtc_read(OMAP_RTC_INTERRUPTS_REG);
-	switch (cmd) {
-	/* UIE = Update Interrupt Enable (1/second) */
-	case RTC_UIE_OFF:
-		reg &= ~OMAP_RTC_INTERRUPTS_IT_TIMER;
-		break;
-	case RTC_UIE_ON:
-		reg |= OMAP_RTC_INTERRUPTS_IT_TIMER;
-		break;
-	}
-	rtc_wait_not_busy();
-	rtc_write(reg, OMAP_RTC_INTERRUPTS_REG);
-	local_irq_enable();
-
-	return 0;
-}
-
-#else
-#define	omap_rtc_ioctl	NULL
-#endif
-
 static int omap_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	u8 reg;
@@ -313,7 +275,6 @@ static int omap_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 }
 
 static struct rtc_class_ops omap_rtc_ops = {
-	.ioctl		= omap_rtc_ioctl,
 	.read_time	= omap_rtc_read_time,
 	.set_time	= omap_rtc_set_time,
 	.read_alarm	= omap_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-pl030.c b/drivers/rtc/rtc-pl030.c
index bbdb2f0..d554368 100644
--- a/drivers/rtc/rtc-pl030.c
+++ b/drivers/rtc/rtc-pl030.c
@@ -35,11 +35,6 @@ static irqreturn_t pl030_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static int pl030_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
-{
-	return -ENOIOCTLCMD;
-}
-
 static int pl030_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
 	struct pl030_rtc *rtc = dev_get_drvdata(dev);
@@ -96,7 +91,6 @@ static int pl030_set_time(struct device *dev, struct rtc_time *tm)
 }
 
 static const struct rtc_class_ops pl030_ops = {
-	.ioctl		= pl030_ioctl,
 	.read_time	= pl030_read_time,
 	.set_time	= pl030_set_time,
 	.read_alarm	= pl030_read_alarm,
diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c
index 6aaa155..85c1b84 100644
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -281,57 +281,6 @@ static int rs5c372_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	return rs5c372_set_datetime(to_i2c_client(dev), tm);
 }
 
-#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE)
-
-static int
-rs5c_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
-{
-	struct i2c_client	*client = to_i2c_client(dev);
-	struct rs5c372		*rs5c = i2c_get_clientdata(client);
-	unsigned char		buf;
-	int			status, addr;
-
-	buf = rs5c->regs[RS5C_REG_CTRL1];
-	switch (cmd) {
-	case RTC_UIE_OFF:
-	case RTC_UIE_ON:
-		/* some 327a modes use a different IRQ pin for 1Hz irqs */
-		if (rs5c->type == rtc_rs5c372a
-				&& (buf & RS5C372A_CTRL1_SL1))
-			return -ENOIOCTLCMD;
-	default:
-		return -ENOIOCTLCMD;
-	}
-
-	status = rs5c_get_regs(rs5c);
-	if (status < 0)
-		return status;
-
-	addr = RS5C_ADDR(RS5C_REG_CTRL1);
-	switch (cmd) {
-	case RTC_UIE_OFF:	/* update off */
-		buf &= ~RS5C_CTRL1_CT_MASK;
-		break;
-	case RTC_UIE_ON:	/* update on */
-		buf &= ~RS5C_CTRL1_CT_MASK;
-		buf |= RS5C_CTRL1_CT4;
-		break;
-	}
-
-	if (i2c_smbus_write_byte_data(client, addr, buf) < 0) {
-		printk(KERN_WARNING "%s: can't update alarm\n",
-			rs5c->rtc->name);
-		status = -EIO;
-	} else
-		rs5c->regs[RS5C_REG_CTRL1] = buf;
-
-	return status;
-}
-
-#else
-#define	rs5c_rtc_ioctl	NULL
-#endif
-
 
 static int rs5c_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
@@ -480,7 +429,6 @@ static int rs5c372_rtc_proc(struct device *dev, struct seq_file *seq)
 
 static const struct rtc_class_ops rs5c372_rtc_ops = {
 	.proc		= rs5c372_rtc_proc,
-	.ioctl		= rs5c_rtc_ioctl,
 	.read_time	= rs5c372_rtc_read_time,
 	.set_time	= rs5c372_rtc_set_time,
 	.read_alarm	= rs5c_read_alarm,
diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index d1a2b0b..a918933 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -293,24 +293,6 @@ static void sa1100_rtc_release(struct device *dev)
 }
 
 
-static int sa1100_rtc_ioctl(struct device *dev, unsigned int cmd,
-		unsigned long arg)
-{
-	switch (cmd) {
-	case RTC_UIE_OFF:
-		spin_lock_irq(&sa1100_rtc_lock);
-		RTSR &= ~RTSR_HZE;
-		spin_unlock_irq(&sa1100_rtc_lock);
-		return 0;
-	case RTC_UIE_ON:
-		spin_lock_irq(&sa1100_rtc_lock);
-		RTSR |= RTSR_HZE;
-		spin_unlock_irq(&sa1100_rtc_lock);
-		return 0;
-	}
-	return -ENOIOCTLCMD;
-}
-
 static int sa1100_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	spin_lock_irq(&sa1100_rtc_lock);
@@ -386,7 +368,6 @@ static const struct rtc_class_ops sa1100_rtc_ops = {
 	.open = sa1100_rtc_open,
 	.read_callback = sa1100_rtc_read_callback,
 	.release = sa1100_rtc_release,
-	.ioctl = sa1100_rtc_ioctl,
 	.read_time = sa1100_rtc_read_time,
 	.set_time = sa1100_rtc_set_time,
 	.read_alarm = sa1100_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c
index 1485449..e55dc1a 100644
--- a/drivers/rtc/rtc-sh.c
+++ b/drivers/rtc/rtc-sh.c
@@ -344,27 +344,6 @@ static inline void sh_rtc_setcie(struct device *dev, unsigned int enable)
 	spin_unlock_irq(&rtc->lock);
 }
 
-static int sh_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
-{
-	struct sh_rtc *rtc = dev_get_drvdata(dev);
-	unsigned int ret = 0;
-
-	switch (cmd) {
-	case RTC_UIE_OFF:
-		rtc->periodic_freq &= ~PF_OXS;
-		sh_rtc_setcie(dev, 0);
-		break;
-	case RTC_UIE_ON:
-		rtc->periodic_freq |= PF_OXS;
-		sh_rtc_setcie(dev, 1);
-		break;
-	default:
-		ret = -ENOIOCTLCMD;
-	}
-
-	return ret;
-}
-
 static int sh_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	sh_rtc_setaie(dev, enabled);
@@ -598,7 +577,6 @@ static int sh_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
 }
 
 static struct rtc_class_ops sh_rtc_ops = {
-	.ioctl		= sh_rtc_ioctl,
 	.read_time	= sh_rtc_read_time,
 	.set_time	= sh_rtc_set_time,
 	.read_alarm	= sh_rtc_read_alarm,
-- 
cgit v0.10.2


From bca8521c551afcd926bdc8f814ebaefcb8215c57 Mon Sep 17 00:00:00 2001
From: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
Date: Fri, 11 Feb 2011 11:50:24 -0200
Subject: RTC: Include information about UIE and PIE in RTC driver proc.

Generic RTC code is always able to provide the necessary information
about update and periodic interrupts. This patch add such information to
the proc interface.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: rtc-linux@googlegroups.com
Signed-off-by: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/drivers/rtc/rtc-proc.c b/drivers/rtc/rtc-proc.c
index 242bbf8..0a59fda 100644
--- a/drivers/rtc/rtc-proc.c
+++ b/drivers/rtc/rtc-proc.c
@@ -69,6 +69,14 @@ static int rtc_proc_show(struct seq_file *seq, void *offset)
 				alrm.enabled ? "yes" : "no");
 		seq_printf(seq, "alrm_pending\t: %s\n",
 				alrm.pending ? "yes" : "no");
+		seq_printf(seq, "update IRQ enabled\t: %s\n",
+			(rtc->uie_rtctimer.enabled) ? "yes" : "no");
+		seq_printf(seq, "periodic IRQ enabled\t: %s\n",
+			(rtc->pie_enabled) ? "yes" : "no");
+		seq_printf(seq, "periodic IRQ frequency\t: %d\n",
+			rtc->irq_freq);
+		seq_printf(seq, "max user IRQ frequency\t: %d\n",
+			rtc->max_user_freq);
 	}
 
 	seq_printf(seq, "24hr\t\t: yes\n");
-- 
cgit v0.10.2


From 4cebe7aadc9ee8e7b44857b7aba3a878870cef65 Mon Sep 17 00:00:00 2001
From: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
Date: Mon, 7 Feb 2011 19:16:06 -0200
Subject: RTC: Remove UIE and PIE information from the sa1100 driver proc.

This patch removes the UIE and PIE information that is now being
supplied directly in the generic RTC code.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: rtc-linux@googlegroups.com
Signed-off-by: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index a918933..41f62ca 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -351,15 +351,8 @@ static int sa1100_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 
 static int sa1100_rtc_proc(struct device *dev, struct seq_file *seq)
 {
-	struct rtc_device *rtc = (struct rtc_device *)dev;
-
-	seq_printf(seq, "trim/divider\t: 0x%08x\n", (u32) RTTR);
-	seq_printf(seq, "update_IRQ\t: %s\n",
-			(RTSR & RTSR_HZE) ? "yes" : "no");
-	seq_printf(seq, "periodic_IRQ\t: %s\n",
-			(OIER & OIER_E1) ? "yes" : "no");
-	seq_printf(seq, "periodic_freq\t: %d\n", rtc->irq_freq);
-	seq_printf(seq, "RTSR\t\t: 0x%08x\n", (u32)RTSR);
+	seq_printf(seq, "trim/divider\t\t: 0x%08x\n", (u32) RTTR);
+	seq_printf(seq, "RTSR\t\t\t: 0x%08x\n", (u32)RTSR);
 
 	return 0;
 }
-- 
cgit v0.10.2


From a417493ef916b8b6d1782a589766a713c553842e Mon Sep 17 00:00:00 2001
From: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
Date: Mon, 7 Feb 2011 19:16:07 -0200
Subject: RTC: Fix the cross interrupt issue on rtc-test.

The rtc-test driver is meant to provide a test/debug code for the RTC
subsystem.

The rtc-test driver simulates specific interrupts by echoing to the
sys interface. Those were the update, alarm and periodic interrupts.

As a side effect of the new implementation, any interrupt generated in
the rtc-test driver would trigger the same code path in the generic
code, and thus the distinction among interrupts gets lost.

This patch preserves the previous behaviour of the rtc-test driver,
where e.g. an update interrupt would not trigger an alarm or periodic
interrupt, and vice-versa. In real world RTC drivers, this is not an
issue, but in the rtc-test driver it may be interesting to distinguish
these interrupts for testing purposes.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: rtc-linux@googlegroups.com
Signed-off-by: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/drivers/rtc/rtc-test.c b/drivers/rtc/rtc-test.c
index a82d6fe..7e96254b 100644
--- a/drivers/rtc/rtc-test.c
+++ b/drivers/rtc/rtc-test.c
@@ -78,11 +78,16 @@ static ssize_t test_irq_store(struct device *dev,
 	struct rtc_device *rtc = platform_get_drvdata(plat_dev);
 
 	retval = count;
-	if (strncmp(buf, "tick", 4) == 0)
+	if (strncmp(buf, "tick", 4) == 0 && rtc->pie_enabled)
 		rtc_update_irq(rtc, 1, RTC_PF | RTC_IRQF);
-	else if (strncmp(buf, "alarm", 5) == 0)
-		rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF);
-	else if (strncmp(buf, "update", 6) == 0)
+	else if (strncmp(buf, "alarm", 5) == 0) {
+		struct rtc_wkalrm alrm;
+		int err = rtc_read_alarm(rtc, &alrm);
+
+		if (!err && alrm.enabled)
+			rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF);
+
+	} else if (strncmp(buf, "update", 6) == 0 && rtc->uie_rtctimer.enabled)
 		rtc_update_irq(rtc, 1, RTC_UF | RTC_IRQF);
 	else
 		retval = -EINVAL;
-- 
cgit v0.10.2


From 416f0e8056f757c119dc3d4fa434a62b65c8272b Mon Sep 17 00:00:00 2001
From: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
Date: Mon, 7 Feb 2011 19:16:08 -0200
Subject: RTC: sa1100: Update the sa1100 RTC driver.

Since PIE interrupts are now emulated, this patch removes the previous
code that used the hardware counters.

The removal of read_callback() also fixes a wrong user space behaviour
of this driver, which was not returning the right value to read().

[john.stultz: Merge fixups]

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: rtc-linux@googlegroups.com
Signed-off-by: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index 41f62ca..0b40bb8 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -43,7 +43,6 @@
 #define RTC_DEF_TRIM		0
 
 static const unsigned long RTC_FREQ = 1024;
-static unsigned long timer_freq;
 static struct rtc_time rtc_alarm;
 static DEFINE_SPINLOCK(sa1100_rtc_lock);
 
@@ -156,97 +155,11 @@ static irqreturn_t sa1100_rtc_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static int sa1100_irq_set_freq(struct device *dev, int freq)
-{
-	if (freq < 1 || freq > timer_freq) {
-		return -EINVAL;
-	} else {
-		struct rtc_device *rtc = (struct rtc_device *)dev;
-
-		rtc->irq_freq = freq;
-
-		return 0;
-	}
-}
-
-static int rtc_timer1_count;
-
-static inline int sa1100_timer1_retrigger(struct rtc_device *rtc)
-{
-	unsigned long diff;
-	unsigned long period = timer_freq / rtc->irq_freq;
-
-	spin_lock_irq(&sa1100_rtc_lock);
-
-	do {
-		OSMR1 += period;
-		diff = OSMR1 - OSCR;
-		/* If OSCR > OSMR1, diff is a very large number (unsigned
-		 * math). This means we have a lost interrupt. */
-	} while (diff > period);
-	OIER |= OIER_E1;
-
-	spin_unlock_irq(&sa1100_rtc_lock);
-
-	return 0;
-}
-
-static irqreturn_t timer1_interrupt(int irq, void *dev_id)
-{
-	struct platform_device *pdev = to_platform_device(dev_id);
-	struct rtc_device *rtc = platform_get_drvdata(pdev);
-
-	/*
-	 * If we match for the first time, rtc_timer1_count will be 1.
-	 * Otherwise, we wrapped around (very unlikely but
-	 * still possible) so compute the amount of missed periods.
-	 * The match reg is updated only when the data is actually retrieved
-	 * to avoid unnecessary interrupts.
-	 */
-	OSSR = OSSR_M1;	/* clear match on timer1 */
-
-	rtc_update_irq(rtc, rtc_timer1_count, RTC_PF | RTC_IRQF);
-
-	if (rtc_timer1_count == 1)
-		rtc_timer1_count =
-			(rtc->irq_freq * ((1 << 30) / (timer_freq >> 2)));
-
-	/* retrigger. */
-	sa1100_timer1_retrigger(rtc);
-
-	return IRQ_HANDLED;
-}
-
-static int sa1100_rtc_read_callback(struct device *dev, int data)
-{
-	if (data & RTC_PF) {
-		struct rtc_device *rtc = (struct rtc_device *)dev;
-
-		/* interpolate missed periods and set match for the next */
-		unsigned long period = timer_freq / rtc->irq_freq;
-		unsigned long oscr = OSCR;
-		unsigned long osmr1 = OSMR1;
-		unsigned long missed = (oscr - osmr1)/period;
-		data += missed << 8;
-		OSSR = OSSR_M1;	/* clear match on timer 1 */
-		OSMR1 = osmr1 + (missed + 1)*period;
-		/* Ensure we didn't miss another match in the mean time.
-		 * Here we compare (match - OSCR) 8 instead of 0 --
-		 * see comment in pxa_timer_interrupt() for explanation.
-		 */
-		while ((signed long)((osmr1 = OSMR1) - OSCR) <= 8) {
-			data += 0x100;
-			OSSR = OSSR_M1;	/* clear match on timer 1 */
-			OSMR1 = osmr1 + period;
-		}
-	}
-	return data;
-}
-
 static int sa1100_rtc_open(struct device *dev)
 {
 	int ret;
-	struct rtc_device *rtc = (struct rtc_device *)dev;
+	struct platform_device *plat_dev = to_platform_device(dev);
+	struct rtc_device *rtc = platform_get_drvdata(plat_dev);
 
 	ret = request_irq(IRQ_RTC1Hz, sa1100_rtc_interrupt, IRQF_DISABLED,
 		"rtc 1Hz", dev);
@@ -260,19 +173,11 @@ static int sa1100_rtc_open(struct device *dev)
 		dev_err(dev, "IRQ %d already in use.\n", IRQ_RTCAlrm);
 		goto fail_ai;
 	}
-	ret = request_irq(IRQ_OST1, timer1_interrupt, IRQF_DISABLED,
-		"rtc timer", dev);
-	if (ret) {
-		dev_err(dev, "IRQ %d already in use.\n", IRQ_OST1);
-		goto fail_pi;
-	}
 	rtc->max_user_freq = RTC_FREQ;
-	sa1100_irq_set_freq(dev, RTC_FREQ);
+	rtc_irq_set_freq(rtc, NULL, RTC_FREQ);
 
 	return 0;
 
- fail_pi:
-	free_irq(IRQ_RTCAlrm, dev);
  fail_ai:
 	free_irq(IRQ_RTC1Hz, dev);
  fail_ui:
@@ -287,12 +192,10 @@ static void sa1100_rtc_release(struct device *dev)
 	OSSR = OSSR_M1;
 	spin_unlock_irq(&sa1100_rtc_lock);
 
-	free_irq(IRQ_OST1, dev);
 	free_irq(IRQ_RTCAlrm, dev);
 	free_irq(IRQ_RTC1Hz, dev);
 }
 
-
 static int sa1100_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
 	spin_lock_irq(&sa1100_rtc_lock);
@@ -359,7 +262,6 @@ static int sa1100_rtc_proc(struct device *dev, struct seq_file *seq)
 
 static const struct rtc_class_ops sa1100_rtc_ops = {
 	.open = sa1100_rtc_open,
-	.read_callback = sa1100_rtc_read_callback,
 	.release = sa1100_rtc_release,
 	.read_time = sa1100_rtc_read_time,
 	.set_time = sa1100_rtc_set_time,
@@ -373,8 +275,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev)
 {
 	struct rtc_device *rtc;
 
-	timer_freq = get_clock_tick_rate();
-
 	/*
 	 * According to the manual we should be able to let RTTR be zero
 	 * and then a default diviser for a 32.768KHz clock is used.
@@ -400,11 +300,6 @@ static int sa1100_rtc_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, rtc);
 
-	/* Set the irq_freq */
-	/*TODO: Find out who is messing with this value after we initialize
-	 * it here.*/
-	rtc->irq_freq = RTC_FREQ;
-
 	/* Fix for a nasty initialization problem the in SA11xx RTSR register.
 	 * See also the comments in sa1100_rtc_interrupt().
 	 *
-- 
cgit v0.10.2


From ea04683f592e6200b52e191b7e2842aedcfd88b6 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Thu, 10 Feb 2011 15:32:59 -0800
Subject: RTC: Fix up rtc.txt documentation to reflect changes to generic rtc
 layer

Now that the genric RTC layer handles much of the RTC functionality,
the rtc.txt documentation needs to be updated to remove outdated information.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Alessandro Zummo <a.zummo@towertech.it>
CC: Marcelo Roberto Jimenez <mroberto@cpti.cetuc.puc-rio.br>
CC: rtc-linux@googlegroups.com
Signed-off-by: John Stultz <john.stultz@linaro.org>

diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt
index 9104c10..2501604 100644
--- a/Documentation/rtc.txt
+++ b/Documentation/rtc.txt
@@ -178,38 +178,29 @@ RTC class framework, but can't be supported by the older driver.
 	setting the longer alarm time and enabling its IRQ using a single
 	request (using the same model as EFI firmware).
 
-    *	RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, it probably
-	also offers update IRQs whenever the "seconds" counter changes.
-	If needed, the RTC framework can emulate this mechanism.
+    *	RTC_UIE_ON, RTC_UIE_OFF ... if the RTC offers IRQs, the RTC framework
+	will emulate this mechanism.
 
-    *	RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... another
-	feature often accessible with an IRQ line is a periodic IRQ, issued
-	at settable frequencies (usually 2^N Hz).
+    *	RTC_PIE_ON, RTC_PIE_OFF, RTC_IRQP_SET, RTC_IRQP_READ ... these icotls
+	are emulated via a kernel hrtimer.
 
 In many cases, the RTC alarm can be a system wake event, used to force
 Linux out of a low power sleep state (or hibernation) back to a fully
 operational state.  For example, a system could enter a deep power saving
 state until it's time to execute some scheduled tasks.
 
-Note that many of these ioctls need not actually be implemented by your
-driver.  The common rtc-dev interface handles many of these nicely if your
-driver returns ENOIOCTLCMD.  Some common examples:
+Note that many of these ioctls are handled by the common rtc-dev interface.
+Some common examples:
 
     *	RTC_RD_TIME, RTC_SET_TIME: the read_time/set_time functions will be
 	called with appropriate values.
 
-    *	RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: the
-	set_alarm/read_alarm functions will be called.
+    *	RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: gets or sets
+	the alarm rtc_timer. May call the set_alarm driver function.
 
-    *	RTC_IRQP_SET, RTC_IRQP_READ: the irq_set_freq function will be called
-	to set the frequency while the framework will handle the read for you
-	since the frequency is stored in the irq_freq member of the rtc_device
-	structure.  Your driver needs to initialize the irq_freq member during
-	init.  Make sure you check the requested frequency is in range of your
-	hardware in the irq_set_freq function.  If it isn't, return -EINVAL.  If
-	you cannot actually change the frequency, do not define irq_set_freq.
+    *	RTC_IRQP_SET, RTC_IRQP_READ: These are emulated by the generic code.
 
-    *	RTC_PIE_ON, RTC_PIE_OFF: the irq_set_state function will be called.
+    *	RTC_PIE_ON, RTC_PIE_OFF: These are also emulated by the generic code.
 
 If all else fails, check out the rtc-test.c driver!
 
-- 
cgit v0.10.2


From b9a46bba88001504235459c8410f17e6a7e38008 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 7 Mar 2011 21:13:40 +0100
Subject: perf top: Fix events overflow in top command

The snprintf function returns number of printed characters even if it
cross the size parameter. So passing enough events via '-e' parameter
will cause segmentation fault.

It's reproduced by following command:

perf top -e `perf list | grep Tracepoint | awk -F'[' '\
{gsub(/[[:space:]]+/,"",$1);array[FNR]=$1}END{outputs=array[1];\
for (i=2;i<=FNR;i++){ outputs=outputs "," array[i];};print outputs}'`

Attached patch is adding SNPRINTF macro that provides the overflow check
and returns actuall number of printed characters.

Reported-by: Han Pingtian <phan@redhat.com>
Cc: Han Pingtian <phan@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1299528821-17521-2-git-send-email-jolsa@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index 70a9c13..4f869da 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -61,6 +61,12 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
 	rb_insert_color(&se->rb_node, tree);
 }
 
+#define SNPRINTF(buf, size, fmt, args...) \
+({ \
+	size_t r = snprintf(buf, size, fmt, ## args); \
+	r > size ?  size : r; \
+})
+
 size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
 {
 	struct perf_evsel *counter;
@@ -70,7 +76,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
 	size_t ret = 0;
 
 	if (!perf_guest) {
-		ret = snprintf(bf, size,
+		ret = SNPRINTF(bf, size,
 			       "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%%"
 			       "  exact: %4.1f%% [", samples_per_sec,
 			       100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) /
@@ -81,7 +87,7 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
 		float guest_kernel_samples_per_sec = top->guest_kernel_samples / top->delay_secs;
 		float guest_us_samples_per_sec = top->guest_us_samples / top->delay_secs;
 
-		ret = snprintf(bf, size,
+		ret = SNPRINTF(bf, size,
 			       "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% us:%4.1f%%"
 			       " guest kernel:%4.1f%% guest us:%4.1f%%"
 			       " exact: %4.1f%% [", samples_per_sec,
@@ -101,38 +107,38 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
 	if (top->evlist->nr_entries == 1 || !top->display_weighted) {
 		struct perf_evsel *first;
 		first = list_entry(top->evlist->entries.next, struct perf_evsel, node);
-		ret += snprintf(bf + ret, size - ret, "%" PRIu64 "%s ",
+		ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ",
 				(uint64_t)first->attr.sample_period,
 				top->freq ? "Hz" : "");
 	}
 
 	if (!top->display_weighted) {
-		ret += snprintf(bf + ret, size - ret, "%s",
+		ret += SNPRINTF(bf + ret, size - ret, "%s",
 				event_name(top->sym_evsel));
 	} else list_for_each_entry(counter, &top->evlist->entries, node) {
-		ret += snprintf(bf + ret, size - ret, "%s%s",
+		ret += SNPRINTF(bf + ret, size - ret, "%s%s",
 				counter->idx ? "/" : "", event_name(counter));
 	}
 
-	ret += snprintf(bf + ret, size - ret, "], ");
+	ret += SNPRINTF(bf + ret, size - ret, "], ");
 
 	if (top->target_pid != -1)
-		ret += snprintf(bf + ret, size - ret, " (target_pid: %d",
+		ret += SNPRINTF(bf + ret, size - ret, " (target_pid: %d",
 				top->target_pid);
 	else if (top->target_tid != -1)
-		ret += snprintf(bf + ret, size - ret, " (target_tid: %d",
+		ret += SNPRINTF(bf + ret, size - ret, " (target_tid: %d",
 				top->target_tid);
 	else
-		ret += snprintf(bf + ret, size - ret, " (all");
+		ret += SNPRINTF(bf + ret, size - ret, " (all");
 
 	if (top->cpu_list)
-		ret += snprintf(bf + ret, size - ret, ", CPU%s: %s)",
+		ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)",
 				top->evlist->cpus->nr > 1 ? "s" : "", top->cpu_list);
 	else {
 		if (top->target_tid != -1)
-			ret += snprintf(bf + ret, size - ret, ")");
+			ret += SNPRINTF(bf + ret, size - ret, ")");
 		else
-			ret += snprintf(bf + ret, size - ret, ", %d CPU%s)",
+			ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)",
 					top->evlist->cpus->nr,
 					top->evlist->cpus->nr > 1 ? "s" : "");
 	}
-- 
cgit v0.10.2


From 6547250381eb315acff3d52b4872ad775359407c Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 7 Mar 2011 21:13:41 +0100
Subject: perf top: Don't let events to eat up whole header line

Passing multiple events might force out information about pid/tid/cpu.
Attached patch leaves 30 characters for this info at the expense of the
events' names.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Han Pingtian <phan@redhat.com>
LKML-Reference: <1299528821-17521-3-git-send-email-jolsa@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index 4f869da..75cfe4d 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -115,9 +115,23 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
 	if (!top->display_weighted) {
 		ret += SNPRINTF(bf + ret, size - ret, "%s",
 				event_name(top->sym_evsel));
-	} else list_for_each_entry(counter, &top->evlist->entries, node) {
-		ret += SNPRINTF(bf + ret, size - ret, "%s%s",
-				counter->idx ? "/" : "", event_name(counter));
+	} else {
+		/*
+		 * Don't let events eat all the space. Leaving 30 bytes
+		 * for the rest should be enough.
+		 */
+		size_t last_pos = size - 30;
+
+		list_for_each_entry(counter, &top->evlist->entries, node) {
+			ret += SNPRINTF(bf + ret, size - ret, "%s%s",
+					counter->idx ? "/" : "",
+					event_name(counter));
+			if (ret > last_pos) {
+				sprintf(bf + last_pos - 3, "..");
+				ret = last_pos - 1;
+				break;
+			}
+		}
 	}
 
 	ret += SNPRINTF(bf + ret, size - ret, "], ");
-- 
cgit v0.10.2


From a91e5431d54f5359fccb5ec2512f252eb217707e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 10 Mar 2011 11:15:54 -0300
Subject: perf session: Use evlist/evsel for managing perf.data attributes

So that we can reuse things like the id to attr lookup routine
(perf_evlist__id2evsel) that uses a hash table instead of the linear
lookup done in the older perf_header_attr routines, etc.

Also to make evsels/evlist more pervasive an API, simplyfing using the
emerging perf lib.

cc: Arun Sharma <arun@sharma-home.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index d40a81e..6febcc1 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -31,7 +31,6 @@
 #include <sys/mman.h>
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
-#define SID(e, x, y) xyarray__entry(e->id, x, y)
 
 enum write_mode_t {
 	WRITE_FORCE,
@@ -40,7 +39,6 @@ enum write_mode_t {
 
 static u64			user_interval			= ULLONG_MAX;
 static u64			default_interval		=      0;
-static u64			sample_type;
 
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=    128;
@@ -160,54 +158,6 @@ static void sig_atexit(void)
 	kill(getpid(), signr);
 }
 
-static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
-{
-	struct perf_header_attr *h_attr;
-
-	if (nr < session->header.attrs) {
-		h_attr = session->header.attr[nr];
-	} else {
-		h_attr = perf_header_attr__new(a);
-		if (h_attr != NULL)
-			if (perf_header__add_attr(&session->header, h_attr) < 0) {
-				perf_header_attr__delete(h_attr);
-				h_attr = NULL;
-			}
-	}
-
-	return h_attr;
-}
-
-static void create_counter(struct perf_evsel *evsel, int cpu)
-{
-	struct perf_event_attr *attr = &evsel->attr;
-	struct perf_header_attr *h_attr;
-	struct perf_sample_id *sid;
-	int thread_index;
-
-	for (thread_index = 0; thread_index < evsel_list->threads->nr; thread_index++) {
-		h_attr = get_header_attr(attr, evsel->idx);
-		if (h_attr == NULL)
-			die("nomem\n");
-
-		if (!file_new) {
-			if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
-				fprintf(stderr, "incompatible append\n");
-				exit(-1);
-			}
-		}
-
-		sid = SID(evsel, cpu, thread_index);
-		if (perf_header_attr__add_id(h_attr, sid->id) < 0) {
-			pr_warning("Not enough memory to add id\n");
-			exit(-1);
-		}
-	}
-
-	if (!sample_type)
-		sample_type = attr->sample_type;
-}
-
 static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
 {
 	struct perf_event_attr *attr = &evsel->attr;
@@ -278,10 +228,28 @@ static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
 	}
 }
 
+static bool perf_evlist__equal(struct perf_evlist *evlist,
+			       struct perf_evlist *other)
+{
+	struct perf_evsel *pos, *pair;
+
+	if (evlist->nr_entries != other->nr_entries)
+		return false;
+
+	pair = list_entry(other->entries.next, struct perf_evsel, node);
+
+	list_for_each_entry(pos, &evlist->entries, node) {
+		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
+			return false;
+		pair = list_entry(pair->node.next, struct perf_evsel, node);
+	}
+
+	return true;
+}
+
 static void open_counters(struct perf_evlist *evlist)
 {
 	struct perf_evsel *pos;
-	int cpu;
 
 	list_for_each_entry(pos, &evlist->entries, node) {
 		struct perf_event_attr *attr = &pos->attr;
@@ -364,10 +332,16 @@ try_again:
 	if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
 		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 
-	for (cpu = 0; cpu < evsel_list->cpus->nr; ++cpu) {
-		list_for_each_entry(pos, &evlist->entries, node)
-			create_counter(pos, cpu);
-	}
+	if (file_new)
+		session->evlist = evlist;
+	else {
+		if (!perf_evlist__equal(session->evlist, evlist)) {
+			fprintf(stderr, "incompatible append\n");
+			exit(-1);
+		}
+ 	}
+
+	perf_session__update_sample_type(session);
 }
 
 static int process_buildids(void)
@@ -390,7 +364,7 @@ static void atexit_header(void)
 
 		if (!no_buildid)
 			process_buildids();
-		perf_header__write(&session->header, evsel_list, output, true);
+		perf_session__write_header(session, evsel_list, output, true);
 		perf_session__delete(session);
 		perf_evlist__delete(evsel_list);
 		symbol__exit();
@@ -524,7 +498,7 @@ static int __cmd_record(int argc, const char **argv)
 		perf_header__set_feat(&session->header, HEADER_BUILD_ID);
 
 	if (!file_new) {
-		err = perf_header__read(session, output);
+		err = perf_session__read_header(session, output);
 		if (err < 0)
 			goto out_delete_session;
 	}
@@ -588,8 +562,6 @@ static int __cmd_record(int argc, const char **argv)
 
 	open_counters(evsel_list);
 
-	perf_session__set_sample_type(session, sample_type);
-
 	/*
 	 * perf_session__delete(session) will be called at atexit_header()
 	 */
@@ -600,20 +572,17 @@ static int __cmd_record(int argc, const char **argv)
 		if (err < 0)
 			return err;
 	} else if (file_new) {
-		err = perf_header__write(&session->header, evsel_list,
-					 output, false);
+		err = perf_session__write_header(session, evsel_list,
+						 output, false);
 		if (err < 0)
 			return err;
 	}
 
 	post_processing_offset = lseek(output, 0, SEEK_CUR);
 
-	perf_session__set_sample_id_all(session, sample_id_all_avail);
-
 	if (pipe_output) {
-		err = perf_event__synthesize_attrs(&session->header,
-						   process_synthesized_event,
-						   session);
+		err = perf_session__synthesize_attrs(session,
+						     process_synthesized_event);
 		if (err < 0) {
 			pr_err("Couldn't synthesize attrs.\n");
 			return err;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index e9b5d51..b1b8200 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -70,8 +70,8 @@ static int perf_session__add_hist_entry(struct perf_session *session,
 		 * FIXME: Propagate this back, but at least we're in a builtin,
 		 * where exit() is allowed. ;-)
 		 */
-		ui__warning("Invalid %s file, contains samples with id not in "
-			    "its header!\n", input_name);
+		ui__warning("Invalid %s file, contains samples with id %" PRIu64 " not in "
+			    "its header!\n", input_name, sample->id);
 		exit_browser(0);
 		exit(1);
 	}
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 417f757..80c9e06 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -883,7 +883,6 @@ try_again:
 static int __cmd_top(void)
 {
 	pthread_t thread;
-	struct perf_evsel *first;
 	int ret __used;
 	/*
 	 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
@@ -900,8 +899,8 @@ static int __cmd_top(void)
 		perf_event__synthesize_threads(perf_event__process, session);
 
 	start_counters(top.evlist);
-	first = list_entry(top.evlist->entries.next, struct perf_evsel, node);
-	perf_session__set_sample_type(session, first->attr.sample_type);
+	session->evlist = top.evlist;
+	perf_session__update_sample_type(session);
 
 	/* Wait for a minimal set of events before starting the snapshot */
 	poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 190c64c..d852cef 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -19,7 +19,7 @@
 #include <linux/hash.h>
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
-#define SID(e, x, y) xyarray__entry(e->id, x, y)
+#define SID(e, x, y) xyarray__entry(e->sample_id, x, y)
 
 void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
 		       struct thread_map *threads)
@@ -106,8 +106,9 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd)
 	evlist->nr_fds++;
 }
 
-void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
-			  int cpu, int thread, u64 id)
+static void perf_evlist__id_hash(struct perf_evlist *evlist,
+				 struct perf_evsel *evsel,
+				 int cpu, int thread, u64 id)
 {
 	int hash;
 	struct perf_sample_id *sid = SID(evsel, cpu, thread);
@@ -118,9 +119,16 @@ void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
 	hlist_add_head(&sid->node, &evlist->heads[hash]);
 }
 
-static int perf_evlist__id_hash_fd(struct perf_evlist *evlist,
-				   struct perf_evsel *evsel,
-				   int cpu, int thread, int fd)
+void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel,
+			 int cpu, int thread, u64 id)
+{
+	perf_evlist__id_hash(evlist, evsel, cpu, thread, id);
+	evsel->id[evsel->ids++] = id;
+}
+
+static int perf_evlist__id_add_fd(struct perf_evlist *evlist,
+				  struct perf_evsel *evsel,
+				  int cpu, int thread, int fd)
 {
 	u64 read_data[4] = { 0, };
 	int id_idx = 1; /* The first entry is the counter value */
@@ -134,7 +142,7 @@ static int perf_evlist__id_hash_fd(struct perf_evlist *evlist,
 	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		++id_idx;
 
-	perf_evlist__id_hash(evlist, evsel, cpu, thread, read_data[id_idx]);
+	perf_evlist__id_add(evlist, evsel, cpu, thread, read_data[id_idx]);
 	return 0;
 }
 
@@ -292,7 +300,7 @@ int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite)
 
 	list_for_each_entry(evsel, &evlist->entries, node) {
 		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
-		    evsel->id == NULL &&
+		    evsel->sample_id == NULL &&
 		    perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0)
 			return -ENOMEM;
 
@@ -308,7 +316,7 @@ int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite)
 					goto out_unmap;
 
 				if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
-				    perf_evlist__id_hash_fd(evlist, evsel, cpu, thread, fd) < 0)
+				    perf_evlist__id_add_fd(evlist, evsel, cpu, thread, fd) < 0)
 					goto out_unmap;
 			}
 		}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 078d512..8b1cb7a 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -38,8 +38,8 @@ void perf_evlist__delete(struct perf_evlist *evlist);
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
 int perf_evlist__add_default(struct perf_evlist *evlist);
 
-void perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
-			  int cpu, int thread, u64 id);
+void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel,
+			 int cpu, int thread, u64 id);
 
 int perf_evlist__alloc_pollfd(struct perf_evlist *evlist);
 void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 8083d51..662596a 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -41,8 +41,18 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 
 int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
-	evsel->id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id));
-	return evsel->id != NULL ? 0 : -ENOMEM;
+	evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id));
+	if (evsel->sample_id == NULL)
+		return -ENOMEM;
+
+	evsel->id = zalloc(ncpus * nthreads * sizeof(u64));
+	if (evsel->id == NULL) {
+		xyarray__delete(evsel->sample_id);
+		evsel->sample_id = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
 }
 
 int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus)
@@ -60,7 +70,9 @@ void perf_evsel__free_fd(struct perf_evsel *evsel)
 
 void perf_evsel__free_id(struct perf_evsel *evsel)
 {
-	xyarray__delete(evsel->id);
+	xyarray__delete(evsel->sample_id);
+	evsel->sample_id = NULL;
+	free(evsel->id);
 	evsel->id = NULL;
 }
 
@@ -79,7 +91,8 @@ void perf_evsel__exit(struct perf_evsel *evsel)
 {
 	assert(list_empty(&evsel->node));
 	xyarray__delete(evsel->fd);
-	xyarray__delete(evsel->id);
+	xyarray__delete(evsel->sample_id);
+	free(evsel->id);
 }
 
 void perf_evsel__delete(struct perf_evsel *evsel)
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 281b60e..6710ab5 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -49,12 +49,17 @@ struct perf_evsel {
 	struct perf_event_attr	attr;
 	char			*filter;
 	struct xyarray		*fd;
-	struct xyarray		*id;
+	struct xyarray		*sample_id;
+	u64			*id;
 	struct perf_counts	*counts;
 	int			idx;
+	int			ids;
 	struct hists		hists;
 	char			*name;
-	void			*priv;
+	union {
+		void		*priv;
+		off_t		id_offset;
+	};
 	struct cgroup_sel	*cgrp;
 };
 
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 108b0db..40b10e4 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 
 #include "evlist.h"
+#include "evsel.h"
 #include "util.h"
 #include "header.h"
 #include "../perf.h"
@@ -19,89 +20,6 @@
 
 static bool no_buildid_cache = false;
 
-/*
- * Create new perf.data header attribute:
- */
-struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr)
-{
-	struct perf_header_attr *self = malloc(sizeof(*self));
-
-	if (self != NULL) {
-		self->attr = *attr;
-		self->ids  = 0;
-		self->size = 1;
-		self->id   = malloc(sizeof(u64));
-		if (self->id == NULL) {
-			free(self);
-			self = NULL;
-		}
-	}
-
-	return self;
-}
-
-void perf_header_attr__delete(struct perf_header_attr *self)
-{
-	free(self->id);
-	free(self);
-}
-
-int perf_header_attr__add_id(struct perf_header_attr *self, u64 id)
-{
-	int pos = self->ids;
-
-	self->ids++;
-	if (self->ids > self->size) {
-		int nsize = self->size * 2;
-		u64 *nid = realloc(self->id, nsize * sizeof(u64));
-
-		if (nid == NULL)
-			return -1;
-
-		self->size = nsize;
-		self->id = nid;
-	}
-	self->id[pos] = id;
-	return 0;
-}
-
-int perf_header__init(struct perf_header *self)
-{
-	self->size = 1;
-	self->attr = malloc(sizeof(void *));
-	return self->attr == NULL ? -ENOMEM : 0;
-}
-
-void perf_header__exit(struct perf_header *self)
-{
-	int i;
-	for (i = 0; i < self->attrs; ++i)
-                perf_header_attr__delete(self->attr[i]);
-	free(self->attr);
-}
-
-int perf_header__add_attr(struct perf_header *self,
-			  struct perf_header_attr *attr)
-{
-	if (self->frozen)
-		return -1;
-
-	if (self->attrs == self->size) {
-		int nsize = self->size * 2;
-		struct perf_header_attr **nattr;
-
-		nattr = realloc(self->attr, nsize * sizeof(void *));
-		if (nattr == NULL)
-			return -1;
-
-		self->size = nsize;
-		self->attr = nattr;
-	}
-
-	self->attr[self->attrs++] = attr;
-	return 0;
-}
-
 static int event_count;
 static struct perf_trace_event_type *events;
 
@@ -515,33 +433,41 @@ int perf_header__write_pipe(int fd)
 	return 0;
 }
 
-int perf_header__write(struct perf_header *self, struct perf_evlist *evlist,
-		       int fd, bool at_exit)
+int perf_session__write_header(struct perf_session *session,
+			       struct perf_evlist *evlist,
+			       int fd, bool at_exit)
 {
 	struct perf_file_header f_header;
 	struct perf_file_attr   f_attr;
-	struct perf_header_attr	*attr;
-	int i, err;
+	struct perf_header *self = &session->header;
+	struct perf_evsel *attr, *pair = NULL;
+	int err;
 
 	lseek(fd, sizeof(f_header), SEEK_SET);
 
-	for (i = 0; i < self->attrs; i++) {
-		attr = self->attr[i];
+	if (session->evlist != evlist)
+		pair = list_entry(session->evlist->entries.next, struct perf_evsel, node);
 
+	list_for_each_entry(attr, &evlist->entries, node) {
 		attr->id_offset = lseek(fd, 0, SEEK_CUR);
 		err = do_write(fd, attr->id, attr->ids * sizeof(u64));
 		if (err < 0) {
+out_err_write:
 			pr_debug("failed to write perf header\n");
 			return err;
 		}
+		if (session->evlist != evlist) {
+			err = do_write(fd, pair->id, pair->ids * sizeof(u64));
+			if (err < 0)
+				goto out_err_write;
+			attr->ids += pair->ids;
+			pair = list_entry(pair->node.next, struct perf_evsel, node);
+		}
 	}
 
-
 	self->attr_offset = lseek(fd, 0, SEEK_CUR);
 
-	for (i = 0; i < self->attrs; i++) {
-		attr = self->attr[i];
-
+	list_for_each_entry(attr, &evlist->entries, node) {
 		f_attr = (struct perf_file_attr){
 			.attr = attr->attr,
 			.ids  = {
@@ -580,7 +506,7 @@ int perf_header__write(struct perf_header *self, struct perf_evlist *evlist,
 		.attr_size = sizeof(f_attr),
 		.attrs = {
 			.offset = self->attr_offset,
-			.size   = self->attrs * sizeof(f_attr),
+			.size   = evlist->nr_entries * sizeof(f_attr),
 		},
 		.data = {
 			.offset = self->data_offset,
@@ -861,7 +787,7 @@ static int perf_header__read_pipe(struct perf_session *session, int fd)
 	return 0;
 }
 
-int perf_header__read(struct perf_session *session, int fd)
+int perf_session__read_header(struct perf_session *session, int fd)
 {
 	struct perf_header *self = &session->header;
 	struct perf_file_header	f_header;
@@ -869,6 +795,10 @@ int perf_header__read(struct perf_session *session, int fd)
 	u64			f_id;
 	int nr_attrs, nr_ids, i, j;
 
+	session->evlist = perf_evlist__new(NULL, NULL);
+	if (session->evlist == NULL)
+		return -ENOMEM;
+
 	if (session->fd_pipe)
 		return perf_header__read_pipe(session, fd);
 
@@ -881,33 +811,39 @@ int perf_header__read(struct perf_session *session, int fd)
 	lseek(fd, f_header.attrs.offset, SEEK_SET);
 
 	for (i = 0; i < nr_attrs; i++) {
-		struct perf_header_attr *attr;
+		struct perf_evsel *evsel;
 		off_t tmp;
 
 		if (perf_header__getbuffer64(self, fd, &f_attr, sizeof(f_attr)))
 			goto out_errno;
 
 		tmp = lseek(fd, 0, SEEK_CUR);
+		evsel = perf_evsel__new(&f_attr.attr, i);
 
-		attr = perf_header_attr__new(&f_attr.attr);
-		if (attr == NULL)
-			 return -ENOMEM;
+		if (evsel == NULL)
+			goto out_delete_evlist;
+		/*
+		 * Do it before so that if perf_evsel__alloc_id fails, this
+		 * entry gets purged too at perf_evlist__delete().
+		 */
+		perf_evlist__add(session->evlist, evsel);
 
 		nr_ids = f_attr.ids.size / sizeof(u64);
+		/*
+		 * We don't have the cpu and thread maps on the header, so
+		 * for allocating the perf_sample_id table we fake 1 cpu and
+		 * hattr->ids threads.
+		 */
+		if (perf_evsel__alloc_id(evsel, 1, nr_ids))
+			goto out_delete_evlist;
+
 		lseek(fd, f_attr.ids.offset, SEEK_SET);
 
 		for (j = 0; j < nr_ids; j++) {
 			if (perf_header__getbuffer64(self, fd, &f_id, sizeof(f_id)))
 				goto out_errno;
 
-			if (perf_header_attr__add_id(attr, f_id) < 0) {
-				perf_header_attr__delete(attr);
-				return -ENOMEM;
-			}
-		}
-		if (perf_header__add_attr(self, attr) < 0) {
-			perf_header_attr__delete(attr);
-			return -ENOMEM;
+			perf_evlist__id_add(session->evlist, evsel, 0, j, f_id);
 		}
 
 		lseek(fd, tmp, SEEK_SET);
@@ -932,37 +868,38 @@ int perf_header__read(struct perf_session *session, int fd)
 	return 0;
 out_errno:
 	return -errno;
+
+out_delete_evlist:
+	perf_evlist__delete(session->evlist);
+	session->evlist = NULL;
+	return -ENOMEM;
 }
 
-u64 perf_header__sample_type(struct perf_header *header)
+u64 perf_evlist__sample_type(struct perf_evlist *evlist)
 {
+	struct perf_evsel *pos;
 	u64 type = 0;
-	int i;
-
-	for (i = 0; i < header->attrs; i++) {
-		struct perf_header_attr *attr = header->attr[i];
 
+	list_for_each_entry(pos, &evlist->entries, node) {
 		if (!type)
-			type = attr->attr.sample_type;
-		else if (type != attr->attr.sample_type)
+			type = pos->attr.sample_type;
+		else if (type != pos->attr.sample_type)
 			die("non matching sample_type");
 	}
 
 	return type;
 }
 
-bool perf_header__sample_id_all(const struct perf_header *header)
+bool perf_evlist__sample_id_all(const struct perf_evlist *evlist)
 {
 	bool value = false, first = true;
-	int i;
-
-	for (i = 0; i < header->attrs; i++) {
-		struct perf_header_attr *attr = header->attr[i];
+	struct perf_evsel *pos;
 
+	list_for_each_entry(pos, &evlist->entries, node) {
 		if (first) {
-			value = attr->attr.sample_id_all;
+			value = pos->attr.sample_id_all;
 			first = false;
-		} else if (value != attr->attr.sample_id_all)
+		} else if (value != pos->attr.sample_id_all)
 			die("non matching sample_id_all");
 	}
 
@@ -1000,16 +937,13 @@ int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
 	return err;
 }
 
-int perf_event__synthesize_attrs(struct perf_header *self,
-				 perf_event__handler_t process,
-				 struct perf_session *session)
+int perf_session__synthesize_attrs(struct perf_session *session,
+				   perf_event__handler_t process)
 {
-	struct perf_header_attr	*attr;
-	int i, err = 0;
-
-	for (i = 0; i < self->attrs; i++) {
-		attr = self->attr[i];
+	struct perf_evsel *attr;
+	int err = 0;
 
+	list_for_each_entry(attr, &session->evlist->entries, node) {
 		err = perf_event__synthesize_attr(&attr->attr, attr->ids,
 						  attr->id, process, session);
 		if (err) {
@@ -1024,27 +958,36 @@ int perf_event__synthesize_attrs(struct perf_header *self,
 int perf_event__process_attr(union perf_event *event,
 			     struct perf_session *session)
 {
-	struct perf_header_attr *attr;
 	unsigned int i, ids, n_ids;
+	struct perf_evsel *evsel;
 
-	attr = perf_header_attr__new(&event->attr.attr);
-	if (attr == NULL)
+	if (session->evlist == NULL) {
+		session->evlist = perf_evlist__new(NULL, NULL);
+		if (session->evlist == NULL)
+			return -ENOMEM;
+	}
+
+	evsel = perf_evsel__new(&event->attr.attr,
+				session->evlist->nr_entries);
+	if (evsel == NULL)
 		return -ENOMEM;
 
+	perf_evlist__add(session->evlist, evsel);
+
 	ids = event->header.size;
 	ids -= (void *)&event->attr.id - (void *)event;
 	n_ids = ids / sizeof(u64);
+	/*
+	 * We don't have the cpu and thread maps on the header, so
+	 * for allocating the perf_sample_id table we fake 1 cpu and
+	 * hattr->ids threads.
+	 */
+	if (perf_evsel__alloc_id(evsel, 1, n_ids))
+		return -ENOMEM;
 
 	for (i = 0; i < n_ids; i++) {
-		if (perf_header_attr__add_id(attr, event->attr.id[i]) < 0) {
-			perf_header_attr__delete(attr);
-			return -ENOMEM;
-		}
-	}
-
-	if (perf_header__add_attr(&session->header, attr) < 0) {
-		perf_header_attr__delete(attr);
-		return -ENOMEM;
+		perf_evlist__id_add(session->evlist, evsel, 0, i,
+				    event->attr.id[i]);
 	}
 
 	perf_session__update_sample_type(session);
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 2fab133..4cc2675 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -9,13 +9,6 @@
 
 #include <linux/bitmap.h>
 
-struct perf_header_attr {
-	struct perf_event_attr attr;
-	int ids, size;
-	u64 *id;
-	off_t id_offset;
-};
-
 enum {
 	HEADER_TRACE_INFO = 1,
 	HEADER_BUILD_ID,
@@ -51,9 +44,7 @@ int perf_file_header__read(struct perf_file_header *self,
 
 struct perf_header {
 	int			frozen;
-	int			attrs, size;
 	bool			needs_swap;
-	struct perf_header_attr **attr;
 	s64			attr_offset;
 	u64			data_offset;
 	u64			data_size;
@@ -62,29 +53,19 @@ struct perf_header {
 	DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
 };
 
-int perf_header__init(struct perf_header *self);
-void perf_header__exit(struct perf_header *self);
-
 struct perf_evlist;
 
-int perf_header__read(struct perf_session *session, int fd);
-int perf_header__write(struct perf_header *self, struct perf_evlist *evlist,
-		       int fd, bool at_exit);
+int perf_session__read_header(struct perf_session *session, int fd);
+int perf_session__write_header(struct perf_session *session,
+			       struct perf_evlist *evlist,
+			       int fd, bool at_exit);
 int perf_header__write_pipe(int fd);
 
-int perf_header__add_attr(struct perf_header *self,
-			  struct perf_header_attr *attr);
-
 int perf_header__push_event(u64 id, const char *name);
 char *perf_header__find_event(u64 id);
 
-struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr);
-void perf_header_attr__delete(struct perf_header_attr *self);
-
-int perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
-
-u64 perf_header__sample_type(struct perf_header *header);
-bool perf_header__sample_id_all(const struct perf_header *header);
+u64 perf_evlist__sample_type(struct perf_evlist *evlist);
+bool perf_evlist__sample_id_all(const struct perf_evlist *evlist);
 void perf_header__set_feat(struct perf_header *self, int feat);
 void perf_header__clear_feat(struct perf_header *self, int feat);
 bool perf_header__has_feat(const struct perf_header *self, int feat);
@@ -101,9 +82,8 @@ int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir);
 int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
 				perf_event__handler_t process,
 				struct perf_session *session);
-int perf_event__synthesize_attrs(struct perf_header *self,
-				 perf_event__handler_t process,
-				 struct perf_session *session);
+int perf_session__synthesize_attrs(struct perf_session *session,
+				   perf_event__handler_t process);
 int perf_event__process_attr(union perf_event *event, struct perf_session *session);
 
 int perf_event__synthesize_event_type(u64 event_id, char *name,
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0d41419..f26639f 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -13,46 +13,6 @@
 #include "sort.h"
 #include "util.h"
 
-static int perf_session__read_evlist(struct perf_session *session)
-{
-	int i, j;
-
-	session->evlist = perf_evlist__new(NULL, NULL);
-	if (session->evlist == NULL)
-		return -ENOMEM;
-
-	for (i = 0; i < session->header.attrs; ++i) {
-		struct perf_header_attr *hattr = session->header.attr[i];
-		struct perf_evsel *evsel = perf_evsel__new(&hattr->attr, i);
-
-		if (evsel == NULL)
-			goto out_delete_evlist;
-		/*
-		 * Do it before so that if perf_evsel__alloc_id fails, this
-		 * entry gets purged too at perf_evlist__delete().
-		 */
-		perf_evlist__add(session->evlist, evsel);
-		/*
-		 * We don't have the cpu and thread maps on the header, so
-		 * for allocating the perf_sample_id table we fake 1 cpu and
-		 * hattr->ids threads.
-		 */
-		if (perf_evsel__alloc_id(evsel, 1, hattr->ids))
-			goto out_delete_evlist;
-
-		for (j = 0; j < hattr->ids; ++j)
-			perf_evlist__id_hash(session->evlist, evsel, 0, j,
-					     hattr->id[j]);
-	}
-
-	return 0;
-
-out_delete_evlist:
-	perf_evlist__delete(session->evlist);
-	session->evlist = NULL;
-	return -ENOMEM;
-}
-
 static int perf_session__open(struct perf_session *self, bool force)
 {
 	struct stat input_stat;
@@ -61,7 +21,7 @@ static int perf_session__open(struct perf_session *self, bool force)
 		self->fd_pipe = true;
 		self->fd = STDIN_FILENO;
 
-		if (perf_header__read(self, self->fd) < 0)
+		if (perf_session__read_header(self, self->fd) < 0)
 			pr_err("incompatible file format");
 
 		return 0;
@@ -93,16 +53,11 @@ static int perf_session__open(struct perf_session *self, bool force)
 		goto out_close;
 	}
 
-	if (perf_header__read(self, self->fd) < 0) {
+	if (perf_session__read_header(self, self->fd) < 0) {
 		pr_err("incompatible file format");
 		goto out_close;
 	}
 
-	if (perf_session__read_evlist(self) < 0) {
-		pr_err("Not enough memory to read the event selector list\n");
-		goto out_close;
-	}
-
 	self->size = input_stat.st_size;
 	return 0;
 
@@ -139,21 +94,10 @@ out:
        session->id_hdr_size = size;
 }
 
-void perf_session__set_sample_id_all(struct perf_session *session, bool value)
-{
-	session->sample_id_all = value;
-	perf_session__id_header_size(session);
-}
-
-void perf_session__set_sample_type(struct perf_session *session, u64 type)
-{
-	session->sample_type = type;
-}
-
 void perf_session__update_sample_type(struct perf_session *self)
 {
-	self->sample_type = perf_header__sample_type(&self->header);
-	self->sample_id_all = perf_header__sample_id_all(&self->header);
+	self->sample_type = perf_evlist__sample_type(self->evlist);
+	self->sample_id_all = perf_evlist__sample_id_all(self->evlist);
 	perf_session__id_header_size(self);
 }
 
@@ -182,9 +126,6 @@ struct perf_session *perf_session__new(const char *filename, int mode,
 	if (self == NULL)
 		goto out;
 
-	if (perf_header__init(&self->header) < 0)
-		goto out_free;
-
 	memcpy(self->filename, filename, len);
 	self->threads = RB_ROOT;
 	INIT_LIST_HEAD(&self->dead_threads);
@@ -208,6 +149,7 @@ struct perf_session *perf_session__new(const char *filename, int mode,
 	if (mode == O_RDONLY) {
 		if (perf_session__open(self, force) < 0)
 			goto out_delete;
+		perf_session__update_sample_type(self);
 	} else if (mode == O_WRONLY) {
 		/*
 		 * In O_RDONLY mode this will be performed when reading the
@@ -217,8 +159,6 @@ struct perf_session *perf_session__new(const char *filename, int mode,
 			goto out_delete;
 	}
 
-	perf_session__update_sample_type(self);
-
 	if (ops && ops->ordering_requires_timestamps &&
 	    ops->ordered_samples && !self->sample_id_all) {
 		dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n");
@@ -227,9 +167,6 @@ struct perf_session *perf_session__new(const char *filename, int mode,
 
 out:
 	return self;
-out_free:
-	free(self);
-	return NULL;
 out_delete:
 	perf_session__delete(self);
 	return NULL;
@@ -260,7 +197,6 @@ static void perf_session__delete_threads(struct perf_session *self)
 
 void perf_session__delete(struct perf_session *self)
 {
-	perf_header__exit(&self->header);
 	perf_session__destroy_kernel_maps(self);
 	perf_session__delete_dead_threads(self);
 	perf_session__delete_threads(self);
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 05dd7bc..b5b148b 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -112,8 +112,6 @@ void mem_bswap_64(void *src, int byte_size);
 int perf_session__create_kernel_maps(struct perf_session *self);
 
 void perf_session__update_sample_type(struct perf_session *self);
-void perf_session__set_sample_id_all(struct perf_session *session, bool value);
-void perf_session__set_sample_type(struct perf_session *session, u64 type);
 void perf_session__remove_thread(struct perf_session *self, struct thread *th);
 
 static inline
-- 
cgit v0.10.2


From 1c0b04d10bbe35279c50e3b36cf5b8ec2a0050d8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 9 Mar 2011 08:13:19 -0300
Subject: perf header: Stop using 'self'

Stop using this python/OOP convention, doesn't really helps. Will do
more from time to time till we get it cleaned up in all of tools/perf.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 40b10e4..5a72d42 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -66,19 +66,19 @@ struct perf_file_attr {
 	struct perf_file_section	ids;
 };
 
-void perf_header__set_feat(struct perf_header *self, int feat)
+void perf_header__set_feat(struct perf_header *header, int feat)
 {
-	set_bit(feat, self->adds_features);
+	set_bit(feat, header->adds_features);
 }
 
-void perf_header__clear_feat(struct perf_header *self, int feat)
+void perf_header__clear_feat(struct perf_header *header, int feat)
 {
-	clear_bit(feat, self->adds_features);
+	clear_bit(feat, header->adds_features);
 }
 
-bool perf_header__has_feat(const struct perf_header *self, int feat)
+bool perf_header__has_feat(const struct perf_header *header, int feat)
 {
-	return test_bit(feat, self->adds_features);
+	return test_bit(feat, header->adds_features);
 }
 
 static int do_write(int fd, const void *buf, size_t size)
@@ -147,22 +147,22 @@ static int __dsos__write_buildid_table(struct list_head *head, pid_t pid,
 	return 0;
 }
 
-static int machine__write_buildid_table(struct machine *self, int fd)
+static int machine__write_buildid_table(struct machine *machine, int fd)
 {
 	int err;
 	u16 kmisc = PERF_RECORD_MISC_KERNEL,
 	    umisc = PERF_RECORD_MISC_USER;
 
-	if (!machine__is_host(self)) {
+	if (!machine__is_host(machine)) {
 		kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
 		umisc = PERF_RECORD_MISC_GUEST_USER;
 	}
 
-	err = __dsos__write_buildid_table(&self->kernel_dsos, self->pid,
+	err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid,
 					  kmisc, fd);
 	if (err == 0)
-		err = __dsos__write_buildid_table(&self->user_dsos,
-						  self->pid, umisc, fd);
+		err = __dsos__write_buildid_table(&machine->user_dsos,
+						  machine->pid, umisc, fd);
 	return err;
 }
 
@@ -280,12 +280,12 @@ out_free:
 	return err;
 }
 
-static int dso__cache_build_id(struct dso *self, const char *debugdir)
+static int dso__cache_build_id(struct dso *dso, const char *debugdir)
 {
-	bool is_kallsyms = self->kernel && self->long_name[0] != '/';
+	bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
 
-	return build_id_cache__add_b(self->build_id, sizeof(self->build_id),
-				     self->long_name, debugdir, is_kallsyms);
+	return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id),
+				     dso->long_name, debugdir, is_kallsyms);
 }
 
 static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
@@ -300,14 +300,14 @@ static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
 	return err;
 }
 
-static int machine__cache_build_ids(struct machine *self, const char *debugdir)
+static int machine__cache_build_ids(struct machine *machine, const char *debugdir)
 {
-	int ret = __dsos__cache_build_ids(&self->kernel_dsos, debugdir);
-	ret |= __dsos__cache_build_ids(&self->user_dsos, debugdir);
+	int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir);
+	ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir);
 	return ret;
 }
 
-static int perf_session__cache_build_ids(struct perf_session *self)
+static int perf_session__cache_build_ids(struct perf_session *session)
 {
 	struct rb_node *nd;
 	int ret;
@@ -318,28 +318,28 @@ static int perf_session__cache_build_ids(struct perf_session *self)
 	if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
 		return -1;
 
-	ret = machine__cache_build_ids(&self->host_machine, debugdir);
+	ret = machine__cache_build_ids(&session->host_machine, debugdir);
 
-	for (nd = rb_first(&self->machines); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
 		struct machine *pos = rb_entry(nd, struct machine, rb_node);
 		ret |= machine__cache_build_ids(pos, debugdir);
 	}
 	return ret ? -1 : 0;
 }
 
-static bool machine__read_build_ids(struct machine *self, bool with_hits)
+static bool machine__read_build_ids(struct machine *machine, bool with_hits)
 {
-	bool ret = __dsos__read_build_ids(&self->kernel_dsos, with_hits);
-	ret |= __dsos__read_build_ids(&self->user_dsos, with_hits);
+	bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits);
+	ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits);
 	return ret;
 }
 
-static bool perf_session__read_build_ids(struct perf_session *self, bool with_hits)
+static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
 {
 	struct rb_node *nd;
-	bool ret = machine__read_build_ids(&self->host_machine, with_hits);
+	bool ret = machine__read_build_ids(&session->host_machine, with_hits);
 
-	for (nd = rb_first(&self->machines); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
 		struct machine *pos = rb_entry(nd, struct machine, rb_node);
 		ret |= machine__read_build_ids(pos, with_hits);
 	}
@@ -347,7 +347,7 @@ static bool perf_session__read_build_ids(struct perf_session *self, bool with_hi
 	return ret;
 }
 
-static int perf_header__adds_write(struct perf_header *self,
+static int perf_header__adds_write(struct perf_header *header,
 				   struct perf_evlist *evlist, int fd)
 {
 	int nr_sections;
@@ -357,13 +357,13 @@ static int perf_header__adds_write(struct perf_header *self,
 	u64 sec_start;
 	int idx = 0, err;
 
-	session = container_of(self, struct perf_session, header);
+	session = container_of(header, struct perf_session, header);
 
-	if (perf_header__has_feat(self, HEADER_BUILD_ID &&
+	if (perf_header__has_feat(header, HEADER_BUILD_ID &&
 	    !perf_session__read_build_ids(session, true)))
-		perf_header__clear_feat(self, HEADER_BUILD_ID);
+		perf_header__clear_feat(header, HEADER_BUILD_ID);
 
-	nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
+	nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
 	if (!nr_sections)
 		return 0;
 
@@ -373,10 +373,10 @@ static int perf_header__adds_write(struct perf_header *self,
 
 	sec_size = sizeof(*feat_sec) * nr_sections;
 
-	sec_start = self->data_offset + self->data_size;
+	sec_start = header->data_offset + header->data_size;
 	lseek(fd, sec_start + sec_size, SEEK_SET);
 
-	if (perf_header__has_feat(self, HEADER_TRACE_INFO)) {
+	if (perf_header__has_feat(header, HEADER_TRACE_INFO)) {
 		struct perf_file_section *trace_sec;
 
 		trace_sec = &feat_sec[idx++];
@@ -387,14 +387,14 @@ static int perf_header__adds_write(struct perf_header *self,
 		trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset;
 	}
 
-	if (perf_header__has_feat(self, HEADER_BUILD_ID)) {
+	if (perf_header__has_feat(header, HEADER_BUILD_ID)) {
 		struct perf_file_section *buildid_sec;
 
 		buildid_sec = &feat_sec[idx++];
 
 		/* Write build-ids */
 		buildid_sec->offset = lseek(fd, 0, SEEK_CUR);
-		err = dsos__write_buildid_table(self, fd);
+		err = dsos__write_buildid_table(header, fd);
 		if (err < 0) {
 			pr_debug("failed to write buildid table\n");
 			goto out_free;
@@ -439,7 +439,7 @@ int perf_session__write_header(struct perf_session *session,
 {
 	struct perf_file_header f_header;
 	struct perf_file_attr   f_attr;
-	struct perf_header *self = &session->header;
+	struct perf_header *header = &session->header;
 	struct perf_evsel *attr, *pair = NULL;
 	int err;
 
@@ -465,7 +465,7 @@ out_err_write:
 		}
 	}
 
-	self->attr_offset = lseek(fd, 0, SEEK_CUR);
+	header->attr_offset = lseek(fd, 0, SEEK_CUR);
 
 	list_for_each_entry(attr, &evlist->entries, node) {
 		f_attr = (struct perf_file_attr){
@@ -482,20 +482,20 @@ out_err_write:
 		}
 	}
 
-	self->event_offset = lseek(fd, 0, SEEK_CUR);
-	self->event_size = event_count * sizeof(struct perf_trace_event_type);
+	header->event_offset = lseek(fd, 0, SEEK_CUR);
+	header->event_size = event_count * sizeof(struct perf_trace_event_type);
 	if (events) {
-		err = do_write(fd, events, self->event_size);
+		err = do_write(fd, events, header->event_size);
 		if (err < 0) {
 			pr_debug("failed to write perf header events\n");
 			return err;
 		}
 	}
 
-	self->data_offset = lseek(fd, 0, SEEK_CUR);
+	header->data_offset = lseek(fd, 0, SEEK_CUR);
 
 	if (at_exit) {
-		err = perf_header__adds_write(self, evlist, fd);
+		err = perf_header__adds_write(header, evlist, fd);
 		if (err < 0)
 			return err;
 	}
@@ -505,20 +505,20 @@ out_err_write:
 		.size	   = sizeof(f_header),
 		.attr_size = sizeof(f_attr),
 		.attrs = {
-			.offset = self->attr_offset,
+			.offset = header->attr_offset,
 			.size   = evlist->nr_entries * sizeof(f_attr),
 		},
 		.data = {
-			.offset = self->data_offset,
-			.size	= self->data_size,
+			.offset = header->data_offset,
+			.size	= header->data_size,
 		},
 		.event_types = {
-			.offset = self->event_offset,
-			.size	= self->event_size,
+			.offset = header->event_offset,
+			.size	= header->event_size,
 		},
 	};
 
-	memcpy(&f_header.adds_features, &self->adds_features, sizeof(self->adds_features));
+	memcpy(&f_header.adds_features, &header->adds_features, sizeof(header->adds_features));
 
 	lseek(fd, 0, SEEK_SET);
 	err = do_write(fd, &f_header, sizeof(f_header));
@@ -526,26 +526,26 @@ out_err_write:
 		pr_debug("failed to write perf header\n");
 		return err;
 	}
-	lseek(fd, self->data_offset + self->data_size, SEEK_SET);
+	lseek(fd, header->data_offset + header->data_size, SEEK_SET);
 
-	self->frozen = 1;
+	header->frozen = 1;
 	return 0;
 }
 
-static int perf_header__getbuffer64(struct perf_header *self,
+static int perf_header__getbuffer64(struct perf_header *header,
 				    int fd, void *buf, size_t size)
 {
 	if (readn(fd, buf, size) <= 0)
 		return -1;
 
-	if (self->needs_swap)
+	if (header->needs_swap)
 		mem_bswap_64(buf, size);
 
 	return 0;
 }
 
-int perf_header__process_sections(struct perf_header *self, int fd,
-				  int (*process)(struct perf_file_section *self,
+int perf_header__process_sections(struct perf_header *header, int fd,
+				  int (*process)(struct perf_file_section *section,
 						 struct perf_header *ph,
 						 int feat, int fd))
 {
@@ -555,7 +555,7 @@ int perf_header__process_sections(struct perf_header *self, int fd,
 	int idx = 0;
 	int err = -1, feat = 1;
 
-	nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
+	nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
 	if (!nr_sections)
 		return 0;
 
@@ -565,17 +565,17 @@ int perf_header__process_sections(struct perf_header *self, int fd,
 
 	sec_size = sizeof(*feat_sec) * nr_sections;
 
-	lseek(fd, self->data_offset + self->data_size, SEEK_SET);
+	lseek(fd, header->data_offset + header->data_size, SEEK_SET);
 
-	if (perf_header__getbuffer64(self, fd, feat_sec, sec_size))
+	if (perf_header__getbuffer64(header, fd, feat_sec, sec_size))
 		goto out_free;
 
 	err = 0;
 	while (idx < nr_sections && feat < HEADER_LAST_FEATURE) {
-		if (perf_header__has_feat(self, feat)) {
+		if (perf_header__has_feat(header, feat)) {
 			struct perf_file_section *sec = &feat_sec[idx++];
 
-			err = process(sec, self, feat, fd);
+			err = process(sec, header, feat, fd);
 			if (err < 0)
 				break;
 		}
@@ -586,35 +586,35 @@ out_free:
 	return err;
 }
 
-int perf_file_header__read(struct perf_file_header *self,
+int perf_file_header__read(struct perf_file_header *header,
 			   struct perf_header *ph, int fd)
 {
 	lseek(fd, 0, SEEK_SET);
 
-	if (readn(fd, self, sizeof(*self)) <= 0 ||
-	    memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
+	if (readn(fd, header, sizeof(*header)) <= 0 ||
+	    memcmp(&header->magic, __perf_magic, sizeof(header->magic)))
 		return -1;
 
-	if (self->attr_size != sizeof(struct perf_file_attr)) {
-		u64 attr_size = bswap_64(self->attr_size);
+	if (header->attr_size != sizeof(struct perf_file_attr)) {
+		u64 attr_size = bswap_64(header->attr_size);
 
 		if (attr_size != sizeof(struct perf_file_attr))
 			return -1;
 
-		mem_bswap_64(self, offsetof(struct perf_file_header,
+		mem_bswap_64(header, offsetof(struct perf_file_header,
 					    adds_features));
 		ph->needs_swap = true;
 	}
 
-	if (self->size != sizeof(*self)) {
+	if (header->size != sizeof(*header)) {
 		/* Support the previous format */
-		if (self->size == offsetof(typeof(*self), adds_features))
-			bitmap_zero(self->adds_features, HEADER_FEAT_BITS);
+		if (header->size == offsetof(typeof(*header), adds_features))
+			bitmap_zero(header->adds_features, HEADER_FEAT_BITS);
 		else
 			return -1;
 	}
 
-	memcpy(&ph->adds_features, &self->adds_features,
+	memcpy(&ph->adds_features, &header->adds_features,
 	       sizeof(ph->adds_features));
 	/*
 	 * FIXME: hack that assumes that if we need swap the perf.data file
@@ -628,10 +628,10 @@ int perf_file_header__read(struct perf_file_header *self,
 		perf_header__set_feat(ph, HEADER_BUILD_ID);
 	}
 
-	ph->event_offset = self->event_types.offset;
-	ph->event_size   = self->event_types.size;
-	ph->data_offset  = self->data.offset;
-	ph->data_size	 = self->data.size;
+	ph->event_offset = header->event_types.offset;
+	ph->event_size   = header->event_types.size;
+	ph->data_offset  = header->data.offset;
+	ph->data_size	 = header->data.size;
 	return 0;
 }
 
@@ -690,11 +690,10 @@ out:
 	return err;
 }
 
-static int perf_header__read_build_ids(struct perf_header *self,
-			int input, u64 offset, u64 size)
+static int perf_header__read_build_ids(struct perf_header *header,
+				       int input, u64 offset, u64 size)
 {
-	struct perf_session *session = container_of(self,
-			struct perf_session, header);
+	struct perf_session *session = container_of(header, struct perf_session, header);
 	struct build_id_event bev;
 	char filename[PATH_MAX];
 	u64 limit = offset + size;
@@ -706,7 +705,7 @@ static int perf_header__read_build_ids(struct perf_header *self,
 		if (read(input, &bev, sizeof(bev)) != sizeof(bev))
 			goto out;
 
-		if (self->needs_swap)
+		if (header->needs_swap)
 			perf_event_header__bswap(&bev.header);
 
 		len = bev.header.size - sizeof(bev);
@@ -722,13 +721,13 @@ out:
 	return err;
 }
 
-static int perf_file_section__process(struct perf_file_section *self,
+static int perf_file_section__process(struct perf_file_section *section,
 				      struct perf_header *ph,
 				      int feat, int fd)
 {
-	if (lseek(fd, self->offset, SEEK_SET) == (off_t)-1) {
+	if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) {
 		pr_debug("Failed to lseek to %" PRIu64 " offset for feature "
-			  "%d, continuing...\n", self->offset, feat);
+			  "%d, continuing...\n", section->offset, feat);
 		return 0;
 	}
 
@@ -738,7 +737,7 @@ static int perf_file_section__process(struct perf_file_section *self,
 		break;
 
 	case HEADER_BUILD_ID:
-		if (perf_header__read_build_ids(ph, fd, self->offset, self->size))
+		if (perf_header__read_build_ids(ph, fd, section->offset, section->size))
 			pr_debug("Failed to read buildids, continuing...\n");
 		break;
 	default:
@@ -748,21 +747,21 @@ static int perf_file_section__process(struct perf_file_section *self,
 	return 0;
 }
 
-static int perf_file_header__read_pipe(struct perf_pipe_file_header *self,
+static int perf_file_header__read_pipe(struct perf_pipe_file_header *header,
 				       struct perf_header *ph, int fd,
 				       bool repipe)
 {
-	if (readn(fd, self, sizeof(*self)) <= 0 ||
-	    memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
+	if (readn(fd, header, sizeof(*header)) <= 0 ||
+	    memcmp(&header->magic, __perf_magic, sizeof(header->magic)))
 		return -1;
 
-	if (repipe && do_write(STDOUT_FILENO, self, sizeof(*self)) < 0)
+	if (repipe && do_write(STDOUT_FILENO, header, sizeof(*header)) < 0)
 		return -1;
 
-	if (self->size != sizeof(*self)) {
-		u64 size = bswap_64(self->size);
+	if (header->size != sizeof(*header)) {
+		u64 size = bswap_64(header->size);
 
-		if (size != sizeof(*self))
+		if (size != sizeof(*header))
 			return -1;
 
 		ph->needs_swap = true;
@@ -773,10 +772,10 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *self,
 
 static int perf_header__read_pipe(struct perf_session *session, int fd)
 {
-	struct perf_header *self = &session->header;
+	struct perf_header *header = &session->header;
 	struct perf_pipe_file_header f_header;
 
-	if (perf_file_header__read_pipe(&f_header, self, fd,
+	if (perf_file_header__read_pipe(&f_header, header, fd,
 					session->repipe) < 0) {
 		pr_debug("incompatible file format\n");
 		return -EINVAL;
@@ -789,7 +788,7 @@ static int perf_header__read_pipe(struct perf_session *session, int fd)
 
 int perf_session__read_header(struct perf_session *session, int fd)
 {
-	struct perf_header *self = &session->header;
+	struct perf_header *header = &session->header;
 	struct perf_file_header	f_header;
 	struct perf_file_attr	f_attr;
 	u64			f_id;
@@ -802,7 +801,7 @@ int perf_session__read_header(struct perf_session *session, int fd)
 	if (session->fd_pipe)
 		return perf_header__read_pipe(session, fd);
 
-	if (perf_file_header__read(&f_header, self, fd) < 0) {
+	if (perf_file_header__read(&f_header, header, fd) < 0) {
 		pr_debug("incompatible file format\n");
 		return -EINVAL;
 	}
@@ -814,7 +813,7 @@ int perf_session__read_header(struct perf_session *session, int fd)
 		struct perf_evsel *evsel;
 		off_t tmp;
 
-		if (perf_header__getbuffer64(self, fd, &f_attr, sizeof(f_attr)))
+		if (perf_header__getbuffer64(header, fd, &f_attr, sizeof(f_attr)))
 			goto out_errno;
 
 		tmp = lseek(fd, 0, SEEK_CUR);
@@ -840,7 +839,7 @@ int perf_session__read_header(struct perf_session *session, int fd)
 		lseek(fd, f_attr.ids.offset, SEEK_SET);
 
 		for (j = 0; j < nr_ids; j++) {
-			if (perf_header__getbuffer64(self, fd, &f_id, sizeof(f_id)))
+			if (perf_header__getbuffer64(header, fd, &f_id, sizeof(f_id)))
 				goto out_errno;
 
 			perf_evlist__id_add(session->evlist, evsel, 0, j, f_id);
@@ -854,17 +853,17 @@ int perf_session__read_header(struct perf_session *session, int fd)
 		events = malloc(f_header.event_types.size);
 		if (events == NULL)
 			return -ENOMEM;
-		if (perf_header__getbuffer64(self, fd, events,
+		if (perf_header__getbuffer64(header, fd, events,
 					     f_header.event_types.size))
 			goto out_errno;
 		event_count =  f_header.event_types.size / sizeof(struct perf_trace_event_type);
 	}
 
-	perf_header__process_sections(self, fd, perf_file_section__process);
+	perf_header__process_sections(header, fd, perf_file_section__process);
 
-	lseek(fd, self->data_offset, SEEK_SET);
+	lseek(fd, header->data_offset, SEEK_SET);
 
-	self->frozen = 1;
+	header->frozen = 1;
 	return 0;
 out_errno:
 	return -errno;
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 4cc2675..456661d 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -39,7 +39,7 @@ struct perf_pipe_file_header {
 
 struct perf_header;
 
-int perf_file_header__read(struct perf_file_header *self,
+int perf_file_header__read(struct perf_file_header *header,
 			   struct perf_header *ph, int fd);
 
 struct perf_header {
@@ -66,12 +66,12 @@ char *perf_header__find_event(u64 id);
 
 u64 perf_evlist__sample_type(struct perf_evlist *evlist);
 bool perf_evlist__sample_id_all(const struct perf_evlist *evlist);
-void perf_header__set_feat(struct perf_header *self, int feat);
-void perf_header__clear_feat(struct perf_header *self, int feat);
-bool perf_header__has_feat(const struct perf_header *self, int feat);
+void perf_header__set_feat(struct perf_header *header, int feat);
+void perf_header__clear_feat(struct perf_header *header, int feat);
+bool perf_header__has_feat(const struct perf_header *header, int feat);
 
-int perf_header__process_sections(struct perf_header *self, int fd,
-				  int (*process)(struct perf_file_section *self,
+int perf_header__process_sections(struct perf_header *header, int fd,
+				  int (*process)(struct perf_file_section *section,
 						 struct perf_header *ph,
 						 int feat, int fd));
 
-- 
cgit v0.10.2


From e6e1e2593592a8f6f6380496655d8c6f67431266 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 9 Mar 2011 10:41:56 -0500
Subject: tracing: Remove lock_depth from event entry

The lock_depth field in the event headers was added as a temporary
data point for help in removing the BKL. Now that the BKL is pretty
much been removed, we can remove this field.

This in turn changes the header from 12 bytes to 8 bytes,
removing the 4 byte buffer that gcc would insert if the first field
in the data load was 8 bytes in size.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 1a99e79..22b32af 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -37,7 +37,6 @@ struct trace_entry {
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
-	int			lock_depth;
 };
 
 #define FTRACE_MAX_EVENT						\
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 85e3ee1..fd6e1b9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1101,7 +1101,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
-	entry->lock_depth		= (tsk) ? tsk->lock_depth : 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1748,10 +1747,9 @@ static void print_lat_help_header(struct seq_file *m)
 	seq_puts(m, "#                | / _----=> need-resched    \n");
 	seq_puts(m, "#                || / _---=> hardirq/softirq \n");
 	seq_puts(m, "#                ||| / _--=> preempt-depth   \n");
-	seq_puts(m, "#                |||| /_--=> lock-depth       \n");
-	seq_puts(m, "#                |||||/     delay             \n");
-	seq_puts(m, "#  cmd     pid   |||||| time  |   caller      \n");
-	seq_puts(m, "#     \\   /      ||||||   \\   |   /           \n");
+	seq_puts(m, "#                |||| /     delay             \n");
+	seq_puts(m, "#  cmd     pid   ||||| time  |   caller      \n");
+	seq_puts(m, "#     \\   /      |||||  \\    |   /           \n");
 }
 
 static void print_func_help_header(struct seq_file *m)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5f499e04..e1d579b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
 	__common_field(unsigned char, flags);
 	__common_field(unsigned char, preempt_count);
 	__common_field(int, pid);
-	__common_field(int, lock_depth);
 
 	return ret;
 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272ba..151f32e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,7 +529,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
  * @entry: The trace entry field from the ring buffer
  *
  * Prints the generic fields of irqs off, in hard or softirq, preempt
- * count and lock depth.
+ * count.
  */
 int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 {
@@ -554,13 +554,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 	else
 		ret = trace_seq_putc(s, '.');
 
-	if (!ret)
-		return 0;
-
-	if (entry->lock_depth < 0)
-		return trace_seq_putc(s, '.');
-
-	return trace_seq_printf(s, "%d", entry->lock_depth);
+	return ret;
 }
 
 static int
-- 
cgit v0.10.2


From 140e4f2d1cd816aed196705c036763313c0e4bd3 Mon Sep 17 00:00:00 2001
From: David Sharp <dhsharp@google.com>
Date: Fri, 3 Dec 2010 16:13:19 -0800
Subject: tracing: Fix event alignment: ftrace:context_switch and ftrace:wakeup

Signed-off-by: David Sharp <dhsharp@google.com>
LKML-Reference: <1291421609-14665-6-git-send-email-dhsharp@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 6cf2237..1516cb3 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
  */
 #define FTRACE_CTX_FIELDS					\
 	__field(	unsigned int,	prev_pid	)	\
+	__field(	unsigned int,	next_pid	)	\
+	__field(	unsigned int,	next_cpu	)       \
 	__field(	unsigned char,	prev_prio	)	\
 	__field(	unsigned char,	prev_state	)	\
-	__field(	unsigned int,	next_pid	)	\
 	__field(	unsigned char,	next_prio	)	\
-	__field(	unsigned char,	next_state	)	\
-	__field(	unsigned int,	next_cpu	)
+	__field(	unsigned char,	next_state	)
 
 FTRACE_ENTRY(context_switch, ctx_switch_entry,
 
-- 
cgit v0.10.2


From b5e3008e489f5a00c6d5db914a4c4338c9ef5e8b Mon Sep 17 00:00:00 2001
From: David Sharp <dhsharp@google.com>
Date: Fri, 3 Dec 2010 16:13:20 -0800
Subject: tracing: Fix event alignment: module:module_request

Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: David Sharp <dhsharp@google.com>
LKML-Reference: <1291421609-14665-7-git-send-email-dhsharp@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index c6bae36..21a546d 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -108,14 +108,14 @@ TRACE_EVENT(module_request,
 	TP_ARGS(name, wait, ip),
 
 	TP_STRUCT__entry(
-		__field(	bool,		wait		)
 		__field(	unsigned long,	ip		)
+		__field(	bool,		wait		)
 		__string(	name,		name		)
 	),
 
 	TP_fast_assign(
-		__entry->wait	= wait;
 		__entry->ip	= ip;
+		__entry->wait	= wait;
 		__assign_str(name, name);
 	),
 
@@ -129,4 +129,3 @@ TRACE_EVENT(module_request,
 
 /* This part must be outside protection */
 #include <trace/define_trace.h>
-
-- 
cgit v0.10.2


From d5bf2ff07230a4a1b73ecb22363f77c02e1d85ab Mon Sep 17 00:00:00 2001
From: David Sharp <dhsharp@google.com>
Date: Fri, 3 Dec 2010 16:13:21 -0800
Subject: tracing: Fix event alignment: kvm:kvm_hv_hypercall

Acked-by: Avi Kivity <avi@redhat.com>
Signed-off-by: David Sharp <dhsharp@google.com>
LKML-Reference: <1291421609-14665-8-git-send-email-dhsharp@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 1357d7c..db93276 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall,
 	TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
 
 	TP_STRUCT__entry(
-		__field(	__u16, 		code		)
-		__field(	bool,		fast		)
 		__field(	__u16,		rep_cnt		)
 		__field(	__u16,		rep_idx		)
 		__field(	__u64,		ingpa		)
 		__field(	__u64,		outgpa		)
+		__field(	__u16, 		code		)
+		__field(	bool,		fast		)
 	),
 
 	TP_fast_assign(
-		__entry->code		= code;
-		__entry->fast		= fast;
 		__entry->rep_cnt	= rep_cnt;
 		__entry->rep_idx	= rep_idx;
 		__entry->ingpa		= ingpa;
 		__entry->outgpa		= outgpa;
+		__entry->code		= code;
+		__entry->fast		= fast;
 	),
 
 	TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
-- 
cgit v0.10.2


From ad440ad66f1617194738bf674dfe2d38978ac54d Mon Sep 17 00:00:00 2001
From: David Sharp <dhsharp@google.com>
Date: Fri, 3 Dec 2010 16:13:22 -0800
Subject: tracing: Fix event alignment: mce:mce_record

Signed-off-by: David Sharp <dhsharp@google.com>
LKML-Reference: <1291421609-14665-9-git-send-email-dhsharp@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
index 7eee778..4cbbcef 100644
--- a/include/trace/events/mce.h
+++ b/include/trace/events/mce.h
@@ -17,36 +17,36 @@ TRACE_EVENT(mce_record,
 	TP_STRUCT__entry(
 		__field(	u64,		mcgcap		)
 		__field(	u64,		mcgstatus	)
-		__field(	u8,		bank		)
 		__field(	u64,		status		)
 		__field(	u64,		addr		)
 		__field(	u64,		misc		)
 		__field(	u64,		ip		)
-		__field(	u8,		cs		)
 		__field(	u64,		tsc		)
 		__field(	u64,		walltime	)
 		__field(	u32,		cpu		)
 		__field(	u32,		cpuid		)
 		__field(	u32,		apicid		)
 		__field(	u32,		socketid	)
+		__field(	u8,		cs		)
+		__field(	u8,		bank		)
 		__field(	u8,		cpuvendor	)
 	),
 
 	TP_fast_assign(
 		__entry->mcgcap		= m->mcgcap;
 		__entry->mcgstatus	= m->mcgstatus;
-		__entry->bank		= m->bank;
 		__entry->status		= m->status;
 		__entry->addr		= m->addr;
 		__entry->misc		= m->misc;
 		__entry->ip		= m->ip;
-		__entry->cs		= m->cs;
 		__entry->tsc		= m->tsc;
 		__entry->walltime	= m->time;
 		__entry->cpu		= m->extcpu;
 		__entry->cpuid		= m->cpuid;
 		__entry->apicid		= m->apicid;
 		__entry->socketid	= m->socketid;
+		__entry->cs		= m->cs;
+		__entry->bank		= m->bank;
 		__entry->cpuvendor	= m->cpuvendor;
 	),
 
-- 
cgit v0.10.2


From ca9da2dd63b0b32de1b693953dff66cadeb6400b Mon Sep 17 00:00:00 2001
From: David Sharp <dhsharp@google.com>
Date: Fri, 3 Dec 2010 16:13:23 -0800
Subject: tracing: Fix event alignment: skb:kfree_skb

Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David Sharp <dhsharp@google.com>
LKML-Reference: <1291421609-14665-10-git-send-email-dhsharp@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index f10293c..0c68ae22 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -19,14 +19,14 @@ TRACE_EVENT(kfree_skb,
 
 	TP_STRUCT__entry(
 		__field(	void *,		skbaddr		)
-		__field(	unsigned short,	protocol	)
 		__field(	void *,		location	)
+		__field(	unsigned short,	protocol	)
 	),
 
 	TP_fast_assign(
 		__entry->skbaddr = skb;
-		__entry->protocol = ntohs(skb->protocol);
 		__entry->location = location;
+		__entry->protocol = ntohs(skb->protocol);
 	),
 
 	TP_printk("skbaddr=%p protocol=%u location=%p",
-- 
cgit v0.10.2


From 10da37a645b5e915d8572cc2b1f5eb11ada3ea4f Mon Sep 17 00:00:00 2001
From: David Sharp <dhsharp@google.com>
Date: Fri, 3 Dec 2010 16:13:26 -0800
Subject: tracing: Adjust conditional expression latency formatting.

Formatting change only to improve code readability. No code changes except to
introduce intermediate variables.

Signed-off-by: David Sharp <dhsharp@google.com>
LKML-Reference: <1291421609-14665-13-git-send-email-dhsharp@google.com>

[ Keep variable declarations and assignment separate ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 151f32e..456be90 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -533,20 +533,30 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
  */
 int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 {
-	int hardirq, softirq;
+	char hardsoft_irq;
+	char need_resched;
+	char irqs_off;
+	int hardirq;
+	int softirq;
 	int ret;
 
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
 
+	irqs_off =
+		(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+		(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
+		'.';
+	need_resched =
+		(entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
+	hardsoft_irq =
+		(hardirq && softirq) ? 'H' :
+		hardirq ? 'h' :
+		softirq ? 's' :
+		'.';
+
 	if (!trace_seq_printf(s, "%c%c%c",
-			      (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
-				(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
-				  'X' : '.',
-			      (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
-				'N' : '.',
-			      (hardirq && softirq) ? 'H' :
-				hardirq ? 'h' : softirq ? 's' : '.'))
+			      irqs_off, need_resched, hardsoft_irq))
 		return 0;
 
 	if (entry->preempt_count)
-- 
cgit v0.10.2


From 1274a9c2e91652e28efa45c3e5886ec82f08bfbe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 11 Feb 2011 16:43:33 -0500
Subject: ftrace: Add .ref.text as one of the safe areas to trace

The section .ref.text will not go away unexpectedly and is
safe to trace. Add it to the safe list of sections to allow
tracing.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
index 038b3d1..f9f6f52 100644
--- a/scripts/recordmcount.c
+++ b/scripts/recordmcount.c
@@ -206,7 +206,8 @@ static uint32_t (*w2)(uint16_t);
 static int
 is_mcounted_section_name(char const *const txtname)
 {
-	return 0 == strcmp(".text",          txtname) ||
+	return 0 == strcmp(".text",           txtname) ||
+		0 == strcmp(".ref.text",      txtname) ||
 		0 == strcmp(".sched.text",    txtname) ||
 		0 == strcmp(".spinlock.text", txtname) ||
 		0 == strcmp(".irqentry.text", txtname) ||
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 1d7963f..4be0dee 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -130,6 +130,7 @@ if ($inputfile =~ m,kernel/trace/ftrace\.o$,) {
 # Acceptable sections to record.
 my %text_sections = (
      ".text" => 1,
+     ".ref.text" => 1,
      ".sched.text" => 1,
      ".spinlock.text" => 1,
      ".irqentry.text" => 1,
-- 
cgit v0.10.2


From 722b3c74695377d11d18a52f3da08114d37f3f37 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 11 Feb 2011 20:36:02 -0500
Subject: ftrace/graph: Trace function entry before updating index

Currently the index to the ret_stack is updated and the real return address
is saved in the ret_stack. Then we call the trace function. The trace
function could decide that it doesn't want to trace this function
(ex. set_graph_function does not match) and it will return 0 which means
not to trace this call.

The normal function graph tracer has this code:

	if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
	      ftrace_graph_ignore_irqs())
		return 0;

What this states is, if the trace depth (which is curr_ret_stack)
is zero (top of nested functions) then test if we want to trace this
function. If this function is not to be traced, then return  0 and
the rest of the function graph tracer logic will not trace this function.

The problem arises when an interrupt comes in after we updated the
curr_ret_stack. The next function that gets called will have a trace->depth
of 1. Which fools this trace code into thinking that we are in a nested
function, and that we should trace. This causes interrupts to be traced
when they should not be.

The solution is to trace the function first and then update the ret_stack.

Reported-by: zhiping zhong <xzhong86@163.com>
Reported-by: wu zhangjin <wuzhangjin@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 382eb29..a93742a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -437,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
 		return;
 	}
 
-	if (ftrace_push_return_trace(old, self_addr, &trace.depth,
-		    frame_pointer) == -EBUSY) {
-		*parent = old;
-		return;
-	}
-
 	trace.func = self_addr;
+	trace.depth = current->curr_ret_stack + 1;
 
 	/* Only trace if the calling function expects to */
 	if (!ftrace_graph_entry(&trace)) {
-		current->curr_ret_stack--;
 		*parent = old;
+		return;
+	}
+
+	if (ftrace_push_return_trace(old, self_addr, &trace.depth,
+		    frame_pointer) == -EBUSY) {
+		*parent = old;
+		return;
 	}
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-- 
cgit v0.10.2


From 31274d72f01604f4b02d933b4f3cac84d2c201fd Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 18 Feb 2011 15:52:19 +0100
Subject: tracing: Explain about unstable clock on resume with ring buffer
 warning

The "Delta way too big" warning might appear on a system with a
unstable shed clock right after the system is resumed and tracing
was enabled at time of suspend.

Since it's not realy a bug, and the unstable sched clock is working
fast and reliable otherwise, Steven suggested to keep using the
sched clock in any case and just to make note in the warning itself.

v2 changes:
- added #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <20110218145219.GD2604@jolsa.brq.redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3237d96..db7b439 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2172,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	if (likely(ts >= cpu_buffer->write_stamp)) {
 		delta = diff;
 		if (unlikely(test_time_stamp(delta))) {
+			int local_clock_stable = 1;
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+			local_clock_stable = sched_clock_stable;
+#endif
 			WARN_ONCE(delta > (1ULL << 59),
-				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
 				  (unsigned long long)delta,
 				  (unsigned long long)ts,
-				  (unsigned long long)cpu_buffer->write_stamp);
+				  (unsigned long long)cpu_buffer->write_stamp,
+				  local_clock_stable ? "" :
+				  "If you just came from a suspend/resume,\n"
+				  "please switch to the trace global clock:\n"
+				  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
 			add_timestamp = 1;
 		}
 	}
-- 
cgit v0.10.2


From 56355b83e2a24ce7e1870c8479205e2cdd332225 Mon Sep 17 00:00:00 2001
From: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Date: Mon, 8 Nov 2010 14:05:12 +0800
Subject: tracing: Export trace_set_clr_event()

Trace events belonging to a module only exists when the module is
loaded. Well, we can use trace_set_clr_event funtion to enable some
trace event at the module init routine, so that we will not miss
something while loading then module.

So, Export the trace_set_clr_event function so that module can use it.

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
LKML-Reference: <1289196312-25323-1-git-send-email-yuanhan.liu@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e1d579b..e88f74f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -325,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
 {
 	return __ftrace_set_clr_event(NULL, system, event, set);
 }
+EXPORT_SYMBOL_GPL(trace_set_clr_event);
 
 /* 128 should be much more than enough */
 #define EVENT_BUF_SIZE		127
-- 
cgit v0.10.2


From 9a24470b2826e4665b1484836c7ae6aba1ddea32 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 9 Mar 2011 14:53:38 -0500
Subject: tracing: Align 4 byte ints together in struct tracer

Move elements in struct tracer for better alignment.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 951d0b7..5e9dfc6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
 	/* If you handled the flag setting, return 0 */
 	int			(*set_flag)(u32 old_flags, u32 bit, int set);
 	struct tracer		*next;
-	int			print_max;
 	struct tracer_flags	*flags;
+	int			print_max;
 	int			use_max_tr;
 };
 
-- 
cgit v0.10.2


From 4a0b1665db09cf2da9ad7d0f12da386373c10bfa Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 9 Mar 2011 20:09:26 -0500
Subject: tracing: Fix irqoff selftest expanding max buffer

If the kernel command line declares a tracer "ftrace=sometracer" and
that tracer is either not defined or is enabled after irqsoff,
then the irqs off selftest will fail with the following error:

Testing tracer irqsoff:
------------[ cut here ]------------
WARNING: at /home/rostedt/work/autotest/nobackup/linux-test.git/kernel/trace/tra
ce.c:713 update_max_tr_single+0xfa/0x11b()
Hardware name:
Modules linked in:
Pid: 1, comm: swapper Not tainted 2.6.38-rc8-test #1
Call Trace:
 [<c0441d9d>] ? warn_slowpath_common+0x65/0x7a
 [<c049adb2>] ? update_max_tr_single+0xfa/0x11b
 [<c0441dc1>] ? warn_slowpath_null+0xf/0x13
 [<c049adb2>] ? update_max_tr_single+0xfa/0x11b
 [<c049e454>] ? stop_critical_timing+0x154/0x204
 [<c049b54b>] ? trace_selftest_startup_irqsoff+0x5b/0xc1
 [<c049b54b>] ? trace_selftest_startup_irqsoff+0x5b/0xc1
 [<c049b54b>] ? trace_selftest_startup_irqsoff+0x5b/0xc1
 [<c049e529>] ? time_hardirqs_on+0x25/0x28
 [<c0468bca>] ? trace_hardirqs_on_caller+0x18/0x12f
 [<c0468cec>] ? trace_hardirqs_on+0xb/0xd
 [<c049b54b>] ? trace_selftest_startup_irqsoff+0x5b/0xc1
 [<c049b6b8>] ? register_tracer+0xf8/0x1a3
 [<c14e93fe>] ? init_irqsoff_tracer+0xd/0x11
 [<c040115e>] ? do_one_initcall+0x71/0x121
 [<c14e93f1>] ? init_irqsoff_tracer+0x0/0x11
 [<c14ce3a9>] ? kernel_init+0x13a/0x1b6
 [<c14ce26f>] ? kernel_init+0x0/0x1b6
 [<c0403842>] ? kernel_thread_helper+0x6/0x10
---[ end trace e93713a9d40cd06c ]---
.. no entries found ..FAILED!

What happens is the "ftrace=..." will expand the ring buffer to its
default size (from its minimum size) but it will not expand the
max ring buffer (the ring buffer to store maximum latencies).
When the irqsoff test runs, it will call the ring buffer swap routine
that checks if the max ring buffer is the same size as the normal
ring buffer, and will fail if it is not. This causes the test to fail.

The solution is to expand the max ring buffer before running the self
test if the max ring buffer is used by that tracer and the normal ring
buffer is expanded. The max ring buffer should be shrunk again after
the test is done to save space.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fd6e1b9..9541c27 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -779,6 +779,11 @@ __acquires(kernel_lock)
 		tracing_reset_online_cpus(tr);
 
 		current_trace = type;
+
+		/* If we expanded the buffers, make sure the max is expanded too */
+		if (ring_buffer_expanded && type->use_max_tr)
+			ring_buffer_resize(max_tr.buffer, trace_buf_size);
+
 		/* the test is responsible for initializing and enabling */
 		pr_info("Testing tracer %s: ", type->name);
 		ret = type->selftest(type, tr);
@@ -791,6 +796,10 @@ __acquires(kernel_lock)
 		/* Only reset on passing, to avoid touching corrupted buffers */
 		tracing_reset_online_cpus(tr);
 
+		/* Shrink the max buffer again */
+		if (ring_buffer_expanded && type->use_max_tr)
+			ring_buffer_resize(max_tr.buffer, 1);
+
 		printk(KERN_CONT "PASSED\n");
 	}
 #endif
-- 
cgit v0.10.2


From 53370d2e8c0382e3e2aa76def93365ed674e7fc7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2011 18:26:33 +0100
Subject: hrtimer: Update hrtimer->state documentation

We changed some of the state bits and combinations thereof over time,
but never updated the documentation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6bc1804..62f500c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -54,11 +54,13 @@ enum hrtimer_restart {
  * 0x00		inactive
  * 0x01		enqueued into rbtree
  * 0x02		callback function running
+ * 0x04		timer is migrated to another cpu
  *
  * Special cases:
  * 0x03		callback function running and enqueued
  *		(was requeued on another CPU)
- * 0x09		timer was migrated on CPU hotunplug
+ * 0x05		timer was migrated on CPU hotunplug
+ *
  * The "callback function running and enqueued" status is only possible on
  * SMP. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
@@ -67,8 +69,11 @@ enum hrtimer_restart {
  * as otherwise the timer could be removed before the softirq code finishes the
  * the handling of the timer.
  *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state to
- * preserve the HRTIMER_STATE_CALLBACK bit in the above scenario.
+ * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
+ * also affects HRTIMER_STATE_MIGRATE where the preservation is not
+ * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
+ * enqueued on the new cpu.
  *
  * All state transitions are protected by cpu_base->lock.
  */
@@ -376,8 +381,9 @@ extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
 extern ktime_t hrtimer_get_next_event(void);
 
 /*
- * A timer is active, when it is enqueued into the rbtree or the callback
- * function is running.
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
  */
 static inline int hrtimer_active(const struct hrtimer *timer)
 {
-- 
cgit v0.10.2


From a9e7acfff0a279792918b7b0de74106e576e9988 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2011 19:12:24 +0100
Subject: hrtimer: Remove empty hrtimer_init_hres_timer()

Leftover from earlier implementation. All empty, remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f679a2d..63fb749 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -681,14 +681,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 }
 
 /*
- * Initialize the high resolution related parts of a hrtimer
- */
-static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
-{
-}
-
-
-/*
  * When High resolution timers are active, try to reprogram. Note, that in case
  * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
  * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -759,7 +751,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 	return 0;
 }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
-static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
@@ -1141,7 +1132,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 
 	base = hrtimer_clockid_to_base(clock_id);
 	timer->base = &cpu_base->clock_base[base];
-	hrtimer_init_timer_hres(timer);
 	timerqueue_init(&timer->node);
 
 #ifdef CONFIG_TIMER_STATS
-- 
cgit v0.10.2


From 8fe8f545c6d753ead15e1f4919d39e8f9bb49629 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Sun, 6 Mar 2011 18:07:50 -0800
Subject: futex: Update futex_wait_setup comments about locking

Reviving a cleanup I had done about a year ago as part of a larger
futex_set_wait proposal. Over the years, the locking of the hashed
futex queue got improved, so that some of the "rare but normal" race
conditions described in comments can't actually happen anymore.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Darren Hart <dvhltc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20110307020750.GA31188@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28..3184d3b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1781,13 +1781,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 	 *
 	 * The basic logical guarantee of a futex is that it blocks ONLY
 	 * if cond(var) is known to be true at the time of blocking, for
-	 * any cond.  If we queued after testing *uaddr, that would open
-	 * a race condition where we could block indefinitely with
+	 * any cond.  If we locked the hash-bucket after testing *uaddr, that
+	 * would open a race condition where we could block indefinitely with
 	 * cond(var) false, which would violate the guarantee.
 	 *
-	 * A consequence is that futex_wait() can return zero and absorb
-	 * a wakeup when *uaddr != val on entry to the syscall.  This is
-	 * rare, but normal.
+	 * On the other hand, we insert q and release the hash-bucket only
+	 * after testing *uaddr.  This guarantees that futex_wait() will NOT
+	 * absorb a wakeup if *uaddr does not match the desired values
+	 * while the syscall executes.
 	 */
 retry:
 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
-- 
cgit v0.10.2


From 260a7d4cfd26d8bad8ac3a7fce11de47491d7e00 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:26 +0000
Subject: xen: pci: only define xen_initdom_setup_msi_irqs if CONFIG_XEN_DOM0

Fixes:
 CC      arch/x86/pci/xen.o
arch/x86/pci/xen.c:183: warning: 'xen_initdom_setup_msi_irqs' defined but not used

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 8634e1b..47c4688 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -203,6 +203,7 @@ static void xen_teardown_msi_irq(unsigned int irq)
 	xen_destroy_irq(irq);
 }
 
+#ifdef CONFIG_XEN_DOM0
 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
 	int irq, ret;
@@ -224,6 +225,7 @@ error:
 	return ret;
 }
 #endif
+#endif
 
 static int xen_pcifront_enable_irq(struct pci_dev *dev)
 {
-- 
cgit v0.10.2


From ae1635b05fae30804061406010914d85d12431ac Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:27 +0000
Subject: xen: events: do not leak IRQ from xen_allocate_pirq_msi when no pirq
 available.

Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: xen-devel@lists.xensource.com
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 89987a7..bce3035 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -676,8 +676,11 @@ void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
 
 	if (alloc & XEN_ALLOC_PIRQ) {
 		*pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
-		if (*pirq == -1)
+		if (*pirq == -1) {
+			xen_free_irq(*irq);
+			*irq = -1;
 			goto out;
+		}
 	}
 
 	set_irq_chip_and_handler_name(*irq, &xen_pirq_chip,
-- 
cgit v0.10.2


From bb5d079aefa828c292c267ed34ed2282947fa233 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:28 +0000
Subject: xen: events: drop XEN_ALLOC_IRQ flag to xen_allocate_pirq_msi

All callers pass this flag so it is pointless.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 47c4688..ca5fa09 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -101,7 +101,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
 		if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) {
 			xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
-					"msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ);
+					"msi-x" : "msi", &irq, &pirq, 0);
 			if (irq < 0)
 				goto error;
 			ret = set_irq_msi(irq, msidesc);
@@ -112,7 +112,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			return 0;
 		}
 		xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
-				"msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ));
+				"msi-x" : "msi", &irq, &pirq, 1);
 		if (irq < 0 || pirq < 0)
 			goto error;
 		printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
@@ -160,7 +160,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		xen_allocate_pirq_msi(
 			(type == PCI_CAP_ID_MSIX) ?
 			"pcifront-msi-x" : "pcifront-msi",
-			&irq, &v[i], XEN_ALLOC_IRQ);
+			&irq, &v[i], 0);
 		if (irq < 0) {
 			ret = -1;
 			goto free;
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index bce3035..36e9adc 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -664,17 +664,15 @@ static int find_unbound_pirq(int type)
 	return -1;
 }
 
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc)
+void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_pirq)
 {
 	spin_lock(&irq_mapping_update_lock);
 
-	if (alloc & XEN_ALLOC_IRQ) {
-		*irq = xen_allocate_irq_dynamic();
-		if (*irq == -1)
-			goto out;
-	}
+	*irq = xen_allocate_irq_dynamic();
+	if (*irq == -1)
+		goto out;
 
-	if (alloc & XEN_ALLOC_PIRQ) {
+	if (alloc_pirq) {
 		*pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
 		if (*pirq == -1) {
 			xen_free_irq(*irq);
diff --git a/include/xen/events.h b/include/xen/events.h
index 00f53dd..8d98861 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -75,10 +75,7 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
 int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name);
 
 #ifdef CONFIG_PCI_MSI
-/* Allocate an irq and a pirq to be used with MSIs. */
-#define XEN_ALLOC_PIRQ (1 << 0)
-#define XEN_ALLOC_IRQ  (1 << 1)
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_mask);
+void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_pirq);
 int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type);
 #endif
 
-- 
cgit v0.10.2


From 4b41df7f6e0b5684378d9155773c42a4577e8582 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:29 +0000
Subject: xen: events: return irq from xen_allocate_pirq_msi

consistent with other similar functions.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index ca5fa09..6fd695b 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -100,8 +100,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
 			((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
 		if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) {
-			xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
-					"msi-x" : "msi", &irq, &pirq, 0);
+			irq = xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
+						    "msi-x" : "msi", &pirq, 0);
 			if (irq < 0)
 				goto error;
 			ret = set_irq_msi(irq, msidesc);
@@ -111,8 +111,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 					" pirq=%d\n", irq, pirq);
 			return 0;
 		}
-		xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
-				"msi-x" : "msi", &irq, &pirq, 1);
+		irq = xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
+					    "msi-x" : "msi", &pirq, 1);
 		if (irq < 0 || pirq < 0)
 			goto error;
 		printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
@@ -157,10 +157,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		goto error;
 	i = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		xen_allocate_pirq_msi(
+		irq = xen_allocate_pirq_msi(
 			(type == PCI_CAP_ID_MSIX) ?
 			"pcifront-msi-x" : "pcifront-msi",
-			&irq, &v[i], 0);
+			&v[i], 0);
 		if (irq < 0) {
 			ret = -1;
 			goto free;
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 36e9adc..ed3420d 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -664,31 +664,34 @@ static int find_unbound_pirq(int type)
 	return -1;
 }
 
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_pirq)
+int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq)
 {
+	int irq;
+
 	spin_lock(&irq_mapping_update_lock);
 
-	*irq = xen_allocate_irq_dynamic();
-	if (*irq == -1)
+	irq = xen_allocate_irq_dynamic();
+	if (irq == -1)
 		goto out;
 
 	if (alloc_pirq) {
 		*pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
 		if (*pirq == -1) {
-			xen_free_irq(*irq);
-			*irq = -1;
+			xen_free_irq(irq);
+			irq = -1;
 			goto out;
 		}
 	}
 
-	set_irq_chip_and_handler_name(*irq, &xen_pirq_chip,
+	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
 				      handle_level_irq, name);
 
-	irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0);
-	pirq_to_irq[*pirq] = *irq;
+	irq_info[irq] = mk_pirq_info(0, *pirq, 0, 0);
+	pirq_to_irq[*pirq] = irq;
 
 out:
 	spin_unlock(&irq_mapping_update_lock);
+	return irq;
 }
 
 int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
diff --git a/include/xen/events.h b/include/xen/events.h
index 8d98861..f70536a 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -75,7 +75,7 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
 int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name);
 
 #ifdef CONFIG_PCI_MSI
-void xen_allocate_pirq_msi(char *name, int *irq, int *pirq, int alloc_pirq);
+int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq);
 int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type);
 #endif
 
-- 
cgit v0.10.2


From 9a626612c2010699d9909a4c3141d3a38660f3b3 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:30 +0000
Subject: xen: pci: collapse apic_register_gsi_xen_hvm and
 xen_hvm_register_pirq

apic_register_gsi_xen_hvm is a tiny wrapper around
xen_hvm_register_pirq.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 6fd695b..0d5087ee 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -20,7 +20,8 @@
 #include <asm/xen/pci.h>
 
 #ifdef CONFIG_ACPI
-static int xen_hvm_register_pirq(u32 gsi, int triggering)
+static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
+				 int trigger, int polarity)
 {
 	int rc, irq;
 	struct physdev_map_pirq map_irq;
@@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
 		return -1;
 	}
 
-	if (triggering == ACPI_EDGE_SENSITIVE) {
+	if (trigger == ACPI_EDGE_SENSITIVE) {
 		shareable = 0;
 		name = "ioapic-edge";
 	} else {
@@ -55,12 +56,6 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering)
 
 	return irq;
 }
-
-static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
-				 int trigger, int polarity)
-{
-	return xen_hvm_register_pirq(gsi, trigger);
-}
 #endif
 
 #if defined(CONFIG_PCI_MSI)
-- 
cgit v0.10.2


From 5cad61a6ba6f4956a218ffbb64cafcc1daefaca0 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:31 +0000
Subject: xen: events: assume PHYSDEVOP_get_free_pirq exists

The find_unbound_pirq is called only from xen_allocate_pirq_msi and
only if alloc_pirq is true. The only caller which does this is
xen_hvm_setup_msi_irqs. The use of this function is gated, in
pci_xen_hvm_init, on XENFEAT_hvm_pirqs.

The PHYSDEVOP_get_free_pirq interfaces was added to the hypervisor in
22410:be96f6058c05 while XENFEAT_hvm_pirqs was added a couple of
minutes prior in 22409:6663214f06ac. Therefore we do not need to
concern ourselves with hypervisors which support XENFEAT_hvm_pirqs but
not PHYSDEVOP_get_free_pirq.

This eliminates the fallback path in find_unbound_pirq which walks to
pirq_to_irq array looking for a free pirq. Unlike the
PHYSDEVOP_get_free_pirq interface this fallback only looks up a free
pirq but does not reserve it. Removing this fallback will simplify
locking in the future.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index ed3420d..c21066f 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -649,19 +649,16 @@ out:
 
 static int find_unbound_pirq(int type)
 {
-	int rc, i;
+	int rc;
 	struct physdev_get_free_pirq op_get_free_pirq;
-	op_get_free_pirq.type = type;
 
+	op_get_free_pirq.type = type;
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
-	if (!rc)
-		return op_get_free_pirq.pirq;
 
-	for (i = 0; i < nr_irqs; i++) {
-		if (pirq_to_irq[i] < 0)
-			return i;
-	}
-	return -1;
+	WARN_ONCE(rc == -ENOSYS,
+		  "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n");
+
+	return rc ? -1 : op_get_free_pirq.pirq;
 }
 
 int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq)
-- 
cgit v0.10.2


From bf480d952bcf25e8ff7e95d2a23964107513ac51 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:32 +0000
Subject: xen: events: separate MSI PIRQ allocation from PIRQ binding to IRQ

Split the binding aspect of xen_allocate_pirq_msi out into a new
xen_bind_pirq_to_irq function.

In xen_hvm_setup_msi_irq when allocating a pirq write the MSI message
to signal the PIRQ as soon as the pirq is obtained. There is no way to
free the pirq back so if the subsequent binding to an IRQ fails we
want to ensure that we will reuse the PIRQ next time rather than leak
it.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 0d5087ee..93e4215 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -86,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
 
 static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	int irq, pirq, ret = 0;
+	int irq, pirq;
 	struct msi_desc *msidesc;
 	struct msi_msg msg;
 
@@ -94,39 +94,32 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		__read_msi_msg(msidesc, &msg);
 		pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
 			((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
-		if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) {
-			irq = xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
-						    "msi-x" : "msi", &pirq, 0);
-			if (irq < 0)
+		if (msg.data != XEN_PIRQ_MSI_DATA ||
+		    xen_irq_from_pirq(pirq) < 0) {
+			pirq = xen_allocate_pirq_msi(dev, msidesc);
+			if (pirq < 0)
 				goto error;
-			ret = set_irq_msi(irq, msidesc);
-			if (ret < 0)
-				goto error_while;
-			printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d"
-					" pirq=%d\n", irq, pirq);
-			return 0;
+			xen_msi_compose_msg(dev, pirq, &msg);
+			__write_msi_msg(msidesc, &msg);
+			dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
+		} else {
+			dev_dbg(&dev->dev,
+				"xen: msi already bound to pirq=%d\n", pirq);
 		}
-		irq = xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
-					    "msi-x" : "msi", &pirq, 1);
-		if (irq < 0 || pirq < 0)
+		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq,
+					       (type == PCI_CAP_ID_MSIX) ?
+					       "msi-x" : "msi");
+		if (irq < 0)
 			goto error;
-		printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
-		xen_msi_compose_msg(dev, pirq, &msg);
-		ret = set_irq_msi(irq, msidesc);
-		if (ret < 0)
-			goto error_while;
-		write_msi_msg(irq, &msg);
+		dev_dbg(&dev->dev,
+			"xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
 	}
 	return 0;
 
-error_while:
-	unbind_from_irqhandler(irq, NULL);
 error:
-	if (ret == -ENODEV)
-		dev_err(&dev->dev, "Xen PCI frontend has not registered" \
-				" MSI/MSI-X support!\n");
-
-	return ret;
+	dev_err(&dev->dev,
+		"Xen PCI frontend has not registered MSI/MSI-X support!\n");
+	return -ENODEV;
 }
 
 /*
@@ -152,28 +145,19 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		goto error;
 	i = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = xen_allocate_pirq_msi(
-			(type == PCI_CAP_ID_MSIX) ?
-			"pcifront-msi-x" : "pcifront-msi",
-			&v[i], 0);
-		if (irq < 0) {
-			ret = -1;
+		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
+					       (type == PCI_CAP_ID_MSIX) ?
+					       "pcifront-msi-x" :
+					       "pcifront-msi");
+		if (irq < 0)
 			goto free;
-		}
-		ret = set_irq_msi(irq, msidesc);
-		if (ret)
-			goto error_while;
 		i++;
 	}
 	kfree(v);
 	return 0;
 
-error_while:
-	unbind_from_irqhandler(irq, NULL);
 error:
-	if (ret == -ENODEV)
-		dev_err(&dev->dev, "Xen PCI frontend has not registered" \
-			" MSI/MSI-X support!\n");
+	dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
 free:
 	kfree(v);
 	return ret;
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index c21066f..1033f62 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -647,12 +647,12 @@ out:
 #include <linux/msi.h>
 #include "../pci/msi.h"
 
-static int find_unbound_pirq(int type)
+int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
 {
 	int rc;
 	struct physdev_get_free_pirq op_get_free_pirq;
 
-	op_get_free_pirq.type = type;
+	op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI;
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq);
 
 	WARN_ONCE(rc == -ENOSYS,
@@ -661,9 +661,10 @@ static int find_unbound_pirq(int type)
 	return rc ? -1 : op_get_free_pirq.pirq;
 }
 
-int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq)
+int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+			     int pirq, const char *name)
 {
-	int irq;
+	int irq, ret;
 
 	spin_lock(&irq_mapping_update_lock);
 
@@ -671,24 +672,21 @@ int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq)
 	if (irq == -1)
 		goto out;
 
-	if (alloc_pirq) {
-		*pirq = find_unbound_pirq(MAP_PIRQ_TYPE_MSI);
-		if (*pirq == -1) {
-			xen_free_irq(irq);
-			irq = -1;
-			goto out;
-		}
-	}
-
 	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
 				      handle_level_irq, name);
 
-	irq_info[irq] = mk_pirq_info(0, *pirq, 0, 0);
-	pirq_to_irq[*pirq] = irq;
-
+	irq_info[irq] = mk_pirq_info(0, pirq, 0, 0);
+	pirq_to_irq[pirq] = irq;
+	ret = set_irq_msi(irq, msidesc);
+	if (ret < 0)
+		goto error_irq;
 out:
 	spin_unlock(&irq_mapping_update_lock);
 	return irq;
+error_irq:
+	spin_unlock(&irq_mapping_update_lock);
+	xen_free_irq(irq);
+	return -1;
 }
 
 int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
diff --git a/include/xen/events.h b/include/xen/events.h
index f70536a..18bf825 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -75,7 +75,9 @@ int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
 int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name);
 
 #ifdef CONFIG_PCI_MSI
-int xen_allocate_pirq_msi(char *name, int *pirq, int alloc_pirq);
+int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc);
+int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+			     int pirq, const char *name);
 int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type);
 #endif
 
-- 
cgit v0.10.2


From 8135591e90c81462a6902f6ffa1f1ca021db077a Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:33 +0000
Subject: xen: events: refactor xen_create_msi_irq slightly

Calling PHYSDEVOP_map_pirq earlier simplifies error handling and
starts to make the tail end of this function look like
xen_bind_pirq_msi_to_irq.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 1033f62..b54285e 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -716,6 +716,12 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
 		map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
 	}
 
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+	if (rc) {
+		dev_warn(&dev->dev, "xen map irq failed %d\n", rc);
+		goto out;
+	}
+
 	spin_lock(&irq_mapping_update_lock);
 
 	irq = xen_allocate_irq_dynamic();
@@ -723,15 +729,6 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
 	if (irq == -1)
 		goto out;
 
-	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
-	if (rc) {
-		printk(KERN_WARNING "xen map irq failed %d\n", rc);
-
-		xen_free_irq(irq);
-
-		irq = -1;
-		goto out;
-	}
 	irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
 
 	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
-- 
cgit v0.10.2


From 2e55288f63343f0810f4f0a3004f78037cfb93d3 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:34 +0000
Subject: xen: events: update pirq_to_irq in xen_create_msi_irq

I don't think this was a deliberate ommision.

Makes the tail end of this function look even more like
xen_bind_pirq_msi_to_irq.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index b54285e..721b393 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -730,6 +730,7 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
 		goto out;
 
 	irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
+	pirq_to_irq[map_irq.pirq] = irq;
 
 	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
 			handle_level_irq,
-- 
cgit v0.10.2


From f420e010edd84eb2c237fc87b7451e69740fed46 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:35 +0000
Subject: xen: events: push set_irq_msi down into xen_create_msi_irq

Makes the tail end of this function look even more like
xen_bind_pirq_msi_to_irq.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 93e4215..15fd981 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -185,23 +185,15 @@ static void xen_teardown_msi_irq(unsigned int irq)
 #ifdef CONFIG_XEN_DOM0
 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	int irq, ret;
+	int irq;
 	struct msi_desc *msidesc;
 
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
 		irq = xen_create_msi_irq(dev, msidesc, type);
 		if (irq < 0)
 			return -1;
-
-		ret = set_irq_msi(irq, msidesc);
-		if (ret)
-			goto error;
 	}
 	return 0;
-
-error:
-	xen_destroy_irq(irq);
-	return ret;
 }
 #endif
 #endif
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 721b393..77ede77 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -691,7 +691,7 @@ error_irq:
 
 int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
 {
-	int irq = -1;
+	int ret, irq = -1;
 	struct physdev_map_pirq map_irq;
 	int rc;
 	int pos;
@@ -736,9 +736,17 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
 			handle_level_irq,
 			(type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
 
+	ret = set_irq_msi(irq, msidesc);
+	if (ret)
+		goto out_irq;
+
 out:
 	spin_unlock(&irq_mapping_update_lock);
 	return irq;
+out_irq:
+	spin_unlock(&irq_mapping_update_lock);
+	xen_free_irq(irq);
+	return -1;
 }
 #endif
 
-- 
cgit v0.10.2


From ca1d8fe9521fb67c95cfa736c08f4bbbc282b5bd Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 18 Feb 2011 16:43:36 +0000
Subject: xen: events: use xen_bind_pirq_msi_to_irq from xen_create_msi_irq

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 15fd981..ffd8c7a 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -106,7 +106,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			dev_dbg(&dev->dev,
 				"xen: msi already bound to pirq=%d\n", pirq);
 		}
-		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq,
+		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
 					       (type == PCI_CAP_ID_MSIX) ?
 					       "msi-x" : "msi");
 		if (irq < 0)
@@ -145,7 +145,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		goto error;
 	i = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
+		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
 					       (type == PCI_CAP_ID_MSIX) ?
 					       "pcifront-msi-x" :
 					       "pcifront-msi");
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 77ede77..3446948 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -662,7 +662,7 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
 }
 
 int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
-			     int pirq, const char *name)
+			     int pirq, int vector, const char *name)
 {
 	int irq, ret;
 
@@ -675,7 +675,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
 	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
 				      handle_level_irq, name);
 
-	irq_info[irq] = mk_pirq_info(0, pirq, 0, 0);
+	irq_info[irq] = mk_pirq_info(0, pirq, 0, vector);
 	pirq_to_irq[pirq] = irq;
 	ret = set_irq_msi(irq, msidesc);
 	if (ret < 0)
@@ -691,7 +691,6 @@ error_irq:
 
 int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
 {
-	int ret, irq = -1;
 	struct physdev_map_pirq map_irq;
 	int rc;
 	int pos;
@@ -719,34 +718,13 @@ int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
 	if (rc) {
 		dev_warn(&dev->dev, "xen map irq failed %d\n", rc);
-		goto out;
+		return -1;
 	}
 
-	spin_lock(&irq_mapping_update_lock);
-
-	irq = xen_allocate_irq_dynamic();
-
-	if (irq == -1)
-		goto out;
-
-	irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
-	pirq_to_irq[map_irq.pirq] = irq;
-
-	set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
-			handle_level_irq,
-			(type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
-
-	ret = set_irq_msi(irq, msidesc);
-	if (ret)
-		goto out_irq;
-
-out:
-	spin_unlock(&irq_mapping_update_lock);
-	return irq;
-out_irq:
-	spin_unlock(&irq_mapping_update_lock);
-	xen_free_irq(irq);
-	return -1;
+	return xen_bind_pirq_msi_to_irq(dev, msidesc,
+					map_irq.pirq, map_irq.index,
+					(type == PCI_CAP_ID_MSIX) ?
+					"msi-x" : "msi");
 }
 #endif
 
diff --git a/include/xen/events.h b/include/xen/events.h
index 18bf825..45c08a0 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -77,7 +77,7 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name);
 #ifdef CONFIG_PCI_MSI
 int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc);
 int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
-			     int pirq, const char *name);
+			     int pirq, int vector, const char *name);
 int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type);
 #endif
 
-- 
cgit v0.10.2


From 71eef7d1e3d9df760897fdd2cad6949a8bcf1620 Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Fri, 18 Feb 2011 17:06:55 +0000
Subject: xen: events: remove dom0 specific xen_create_msi_irq

The function name does not distinguish it from xen_allocate_pirq_msi
(which operates on domU and pvhvm domains rather than dom0).

Hoist domain 0 specific functionality up into the only caller leaving
functionality common to all guest types in xen_bind_pirq_msi_to_irq.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index ffd8c7a..8c4085a 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -185,15 +185,50 @@ static void xen_teardown_msi_irq(unsigned int irq)
 #ifdef CONFIG_XEN_DOM0
 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	int irq;
+	int ret = 0;
 	struct msi_desc *msidesc;
 
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = xen_create_msi_irq(dev, msidesc, type);
-		if (irq < 0)
-			return -1;
+		struct physdev_map_pirq map_irq;
+
+		memset(&map_irq, 0, sizeof(map_irq));
+		map_irq.domid = DOMID_SELF;
+		map_irq.type = MAP_PIRQ_TYPE_MSI;
+		map_irq.index = -1;
+		map_irq.pirq = -1;
+		map_irq.bus = dev->bus->number;
+		map_irq.devfn = dev->devfn;
+
+		if (type == PCI_CAP_ID_MSIX) {
+			int pos;
+			u32 table_offset, bir;
+
+			pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+
+			pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
+					      &table_offset);
+			bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+
+			map_irq.table_base = pci_resource_start(dev, bir);
+			map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+		}
+
+		ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+		if (ret) {
+			dev_warn(&dev->dev, "xen map irq failed %d\n", ret);
+			goto out;
+		}
+
+		ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
+					       map_irq.pirq, map_irq.index,
+					       (type == PCI_CAP_ID_MSIX) ?
+					       "msi-x" : "msi");
+		if (ret < 0)
+			goto out;
 	}
-	return 0;
+	ret = 0;
+out:
+	return ret;
 }
 #endif
 #endif
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 3446948..6befe62 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -644,9 +644,6 @@ out:
 }
 
 #ifdef CONFIG_PCI_MSI
-#include <linux/msi.h>
-#include "../pci/msi.h"
-
 int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc)
 {
 	int rc;
@@ -688,44 +685,6 @@ error_irq:
 	xen_free_irq(irq);
 	return -1;
 }
-
-int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
-{
-	struct physdev_map_pirq map_irq;
-	int rc;
-	int pos;
-	u32 table_offset, bir;
-
-	memset(&map_irq, 0, sizeof(map_irq));
-	map_irq.domid = DOMID_SELF;
-	map_irq.type = MAP_PIRQ_TYPE_MSI;
-	map_irq.index = -1;
-	map_irq.pirq = -1;
-	map_irq.bus = dev->bus->number;
-	map_irq.devfn = dev->devfn;
-
-	if (type == PCI_CAP_ID_MSIX) {
-		pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-
-		pci_read_config_dword(dev, msix_table_offset_reg(pos),
-					&table_offset);
-		bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
-
-		map_irq.table_base = pci_resource_start(dev, bir);
-		map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
-	}
-
-	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
-	if (rc) {
-		dev_warn(&dev->dev, "xen map irq failed %d\n", rc);
-		return -1;
-	}
-
-	return xen_bind_pirq_msi_to_irq(dev, msidesc,
-					map_irq.pirq, map_irq.index,
-					(type == PCI_CAP_ID_MSIX) ?
-					"msi-x" : "msi");
-}
 #endif
 
 int xen_destroy_irq(int irq)
diff --git a/include/xen/events.h b/include/xen/events.h
index 45c08a0..962da2c 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -78,7 +78,6 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name);
 int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc);
 int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
 			     int pirq, int vector, const char *name);
-int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type);
 #endif
 
 /* De-allocates the above mentioned physical interrupt. */
-- 
cgit v0.10.2


From ec8df88f6bd808db47ac7a06c96dcc90d7ed6ecc Mon Sep 17 00:00:00 2001
From: Henrik Kretzschmar <henne@nachtwindheim.de>
Date: Fri, 11 Mar 2011 08:02:35 +0100
Subject: x86: Remove superflous goal definition of tsc_sync

The extra tsc_sync.o goal definition is superflous.
CONFIG_X86_64_SMP depends on CONFIG_SMP
and tsc_sync.o is already in the definition of CONFIG_SMP.

Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <1299826956-8607-1-git-send-email-henne@nachtwindheim.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 6ac5036..62445ba 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -66,9 +66,9 @@ obj-$(CONFIG_PCI)		+= early-quirks.o
 apm-y				:= apm_32.o
 obj-$(CONFIG_APM)		+= apm.o
 obj-$(CONFIG_SMP)		+= smp.o
-obj-$(CONFIG_SMP)		+= smpboot.o tsc_sync.o
+obj-$(CONFIG_SMP)		+= smpboot.o
+obj-$(CONFIG_SMP)		+= tsc_sync.o
 obj-$(CONFIG_SMP)		+= setup_percpu.o
-obj-$(CONFIG_X86_64_SMP)	+= tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-y				+= apic/
-- 
cgit v0.10.2


From 25874a299ef8037df03ce4ada570bc4e42f9748f Mon Sep 17 00:00:00 2001
From: Henrik Kretzschmar <henne@nachtwindheim.de>
Date: Fri, 11 Mar 2011 08:02:36 +0100
Subject: x86: Clean up apic.c and apic.h

This patch moves some functions and variables into init
sections, makes a function static and removes some lines of
cruft.

Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <1299826956-8607-2-git-send-email-henne@nachtwindheim.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4afe512..5b7d513 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -220,7 +220,6 @@ extern void enable_IR_x2apic(void);
 
 extern int get_physical_broadcast(void);
 
-extern void apic_disable(void);
 extern int lapic_get_maxlvt(void);
 extern void clear_local_APIC(void);
 extern void connect_bsp_APIC(void);
@@ -228,7 +227,6 @@ extern void disconnect_bsp_APIC(int virt_wire_setup);
 extern void disable_local_APIC(void);
 extern void lapic_shutdown(void);
 extern int verify_local_APIC(void);
-extern void cache_APIC_registers(void);
 extern void sync_Arb_IDs(void);
 extern void init_bsp_APIC(void);
 extern void setup_local_APIC(void);
@@ -239,7 +237,6 @@ void register_lapic_address(unsigned long address);
 extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
-extern void enable_NMI_through_LVT0(void);
 extern int apic_force_enable(unsigned long addr);
 
 /*
@@ -261,7 +258,6 @@ static inline void lapic_shutdown(void) { }
 #define local_apic_timer_c2_ok		1
 static inline void init_apic_mappings(void) { }
 static inline void disable_local_APIC(void) { }
-static inline void apic_disable(void) { }
 # define setup_boot_APIC_clock x86_init_noop
 # define setup_secondary_APIC_clock x86_init_noop
 #endif /* !CONFIG_X86_LOCAL_APIC */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4f43312..ffbf7c2 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -84,7 +84,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
  *
  * +1=force-enable
  */
-static int force_enable_local_apic;
+static int force_enable_local_apic __initdata;
 /*
  * APIC command line parameters
  */
@@ -154,7 +154,7 @@ early_param("nox2apic", setup_nox2apic);
 unsigned long mp_lapic_addr;
 int disable_apic;
 /* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int disable_apic_timer __cpuinitdata;
+static int disable_apic_timer __initdata;
 /* Local APIC timer works in C2 */
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -178,29 +178,8 @@ static struct resource lapic_resource = {
 
 static unsigned int calibration_result;
 
-static int lapic_next_event(unsigned long delta,
-			    struct clock_event_device *evt);
-static void lapic_timer_setup(enum clock_event_mode mode,
-			      struct clock_event_device *evt);
-static void lapic_timer_broadcast(const struct cpumask *mask);
 static void apic_pm_activate(void);
 
-/*
- * The local apic timer can be used for any function which is CPU local.
- */
-static struct clock_event_device lapic_clockevent = {
-	.name		= "lapic",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
-			| CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
-	.shift		= 32,
-	.set_mode	= lapic_timer_setup,
-	.set_next_event	= lapic_next_event,
-	.broadcast	= lapic_timer_broadcast,
-	.rating		= 100,
-	.irq		= -1,
-};
-static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-
 static unsigned long apic_phys;
 
 /*
@@ -239,7 +218,7 @@ static int modern_apic(void)
  * right after this call apic become NOOP driven
  * so apic->write/read doesn't do anything
  */
-void apic_disable(void)
+static void __init apic_disable(void)
 {
 	pr_info("APIC: switched to apic NOOP\n");
 	apic = &apic_noop;
@@ -283,23 +262,6 @@ u64 native_apic_icr_read(void)
 	return icr1 | ((u64)icr2 << 32);
 }
 
-/**
- * enable_NMI_through_LVT0 - enable NMI through local vector table 0
- */
-void __cpuinit enable_NMI_through_LVT0(void)
-{
-	unsigned int v;
-
-	/* unmask and set to NMI */
-	v = APIC_DM_NMI;
-
-	/* Level triggered for 82489DX (32bit mode) */
-	if (!lapic_is_integrated())
-		v |= APIC_LVT_LEVEL_TRIGGER;
-
-	apic_write(APIC_LVT0, v);
-}
-
 #ifdef CONFIG_X86_32
 /**
  * get_physical_broadcast - Get number of physical broadcast IDs
@@ -509,6 +471,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
 #endif
 }
 
+
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+	.name		= "lapic",
+	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
+			| CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
+	.shift		= 32,
+	.set_mode	= lapic_timer_setup,
+	.set_next_event	= lapic_next_event,
+	.broadcast	= lapic_timer_broadcast,
+	.rating		= 100,
+	.irq		= -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
 /*
  * Setup the local APIC timer for this CPU. Copy the initialized values
  * of the boot CPU and register the clock event in the framework.
@@ -1538,7 +1517,7 @@ static int __init detect_init_APIC(void)
 }
 #else
 
-static int apic_verify(void)
+static int __init apic_verify(void)
 {
 	u32 features, h, l;
 
@@ -1563,7 +1542,7 @@ static int apic_verify(void)
 	return 0;
 }
 
-int apic_force_enable(unsigned long addr)
+int __init apic_force_enable(unsigned long addr)
 {
 	u32 h, l;
 
-- 
cgit v0.10.2


From c0c9ed15042ceac7c485813012a0a97316101b57 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 11 Mar 2011 11:51:22 +0100
Subject: futex: Avoid redudant evaluation of task_pid_vnr()

The result is not going to change under us, so no need to reevaluate
this over and over. Seems to be a leftover from the mechanical mass
conversion of task->pid to task_pid_vnr(tsk).

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index 3184d3b..7738154 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -674,7 +674,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 				struct task_struct *task, int set_waiters)
 {
 	int lock_taken, ret, ownerdied = 0;
-	u32 uval, newval, curval;
+	u32 uval, newval, curval, vpid = task_pid_vnr(task);
 
 retry:
 	ret = lock_taken = 0;
@@ -684,7 +684,7 @@ retry:
 	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
 	 * the locks. It will most likely not succeed.
 	 */
-	newval = task_pid_vnr(task);
+	newval = vpid;
 	if (set_waiters)
 		newval |= FUTEX_WAITERS;
 
@@ -696,7 +696,7 @@ retry:
 	/*
 	 * Detect deadlocks.
 	 */
-	if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+	if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
 		return -EDEADLK;
 
 	/*
@@ -723,7 +723,7 @@ retry:
 	 */
 	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
 		/* Keep the OWNER_DIED bit */
-		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+		newval = (curval & ~FUTEX_TID_MASK) | vpid;
 		ownerdied = 0;
 		lock_taken = 1;
 	}
@@ -2047,9 +2047,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	u32 uval;
 	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
+	u32 uval, vpid = task_pid_vnr(current);
 	int ret;
 
 retry:
@@ -2058,7 +2058,7 @@ retry:
 	/*
 	 * We release only a lock we actually own:
 	 */
-	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+	if ((uval & FUTEX_TID_MASK) != vpid)
 		return -EPERM;
 
 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2074,7 +2074,7 @@ retry:
 	 * anyone else up:
 	 */
 	if (!(uval & FUTEX_OWNER_DIED))
-		uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
+		uval = cmpxchg_futex_value_locked(uaddr, vpid, 0);
 
 
 	if (unlikely(uval == -EFAULT))
@@ -2083,7 +2083,7 @@ retry:
 	 * Rare case: we managed to release the lock atomically,
 	 * no need to wake anyone else up:
 	 */
-	if (unlikely(uval == task_pid_vnr(current)))
+	if (unlikely(uval == vpid))
 		goto out_unlock;
 
 	/*
-- 
cgit v0.10.2


From 522d7decc0370070448a8c28982c8dfd8970489e Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Thu, 10 Mar 2011 18:47:31 -0800
Subject: futex: Remove redundant pagefault_disable in
 futex_atomic_cmpxchg_inatomic()

kernel/futex.c disables page faults before calling
futex_atomic_cmpxchg_inatomic(), so there is no need to do it again
within that function.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Darren Hart <darren@dvhart.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110311024731.GB26122@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index b33fe70..7133a86 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -95,7 +95,8 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	pagefault_disable();	/* implies preempt_disable() */
+	/* Note that preemption is disabled by futex_atomic_cmpxchg_inatomic
+	 * call sites. */
 
 	__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
 	"1:	" T(ldr) "	%0, [%3]\n"
@@ -115,8 +116,6 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 	: "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT)
 	: "cc", "memory");
 
-	pagefault_enable();	/* subsumes preempt_enable() */
-
 	return val;
 }
 
-- 
cgit v0.10.2


From 37a9d912b24f96a0591773e6e6c3642991ae5a70 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Thu, 10 Mar 2011 18:48:51 -0800
Subject: futex: Sanitize cmpxchg_futex_value_locked API

The cmpxchg_futex_value_locked API was funny in that it returned either
the original, user-exposed futex value OR an error code such as -EFAULT.
This was confusing at best, and could be a source of livelocks in places
that retry the cmpxchg_futex_value_locked after trying to fix the issue
by running fault_in_user_writeable().

This change makes the cmpxchg_futex_value_locked API more similar to the
get_futex_value_locked one, returning an error code and updating the
original value through a reference argument.

Signed-off-by: Michel Lespinasse <walken@google.com>
Acked-by: Chris Metcalf <cmetcalf@tilera.com>  [tile]
Acked-by: Tony Luck <tony.luck@intel.com>  [ia64]
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michal Simek <monstr@monstr.eu>  [microblaze]
Acked-by: David Howells <dhowells@redhat.com> [frv]
Cc: Darren Hart <darren@dvhart.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110311024851.GC26122@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h
index 945de22..c4e5c28 100644
--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -81,21 +81,22 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
-	int prev, cmp;
+	int ret = 0, prev, cmp;
 
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
 	__asm__ __volatile__ (
 		__ASM_SMP_MB
-	"1:	ldl_l	%0,0(%2)\n"
-	"	cmpeq	%0,%3,%1\n"
-	"	beq	%1,3f\n"
-	"	mov	%4,%1\n"
-	"2:	stl_c	%1,0(%2)\n"
-	"	beq	%1,4f\n"
+	"1:	ldl_l	%1,0(%3)\n"
+	"	cmpeq	%1,%4,%2\n"
+	"	beq	%2,3f\n"
+	"	mov	%5,%2\n"
+	"2:	stl_c	%2,0(%3)\n"
+	"	beq	%2,4f\n"
 	"3:	.subsection 2\n"
 	"4:	br	1b\n"
 	"	.previous\n"
@@ -105,11 +106,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 	"	.long	2b-.\n"
 	"	lda	$31,3b-2b(%0)\n"
 	"	.previous\n"
-	:	"=&r"(prev), "=&r"(cmp)
+	:	"+r"(ret), "=&r"(prev), "=&r"(cmp)
 	:	"r"(uaddr), "r"((long)oldval), "r"(newval)
 	:	"memory");
 
-	return prev;
+	*uval = prev;
+	return ret;
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 7133a86..d20b78f 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -88,9 +88,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
-	int val;
+	int ret = 0, val;
 
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
@@ -99,24 +100,25 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 	 * call sites. */
 
 	__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
-	"1:	" T(ldr) "	%0, [%3]\n"
-	"	teq	%0, %1\n"
+	"1:	" T(ldr) "	%1, [%4]\n"
+	"	teq	%1, %2\n"
 	"	it	eq	@ explicit IT needed for the 2b label\n"
-	"2:	" T(streq) "	%2, [%3]\n"
+	"2:	" T(streq) "	%3, [%4]\n"
 	"3:\n"
 	"	.pushsection __ex_table,\"a\"\n"
 	"	.align	3\n"
 	"	.long	1b, 4f, 2b, 4f\n"
 	"	.popsection\n"
 	"	.pushsection .fixup,\"ax\"\n"
-	"4:	mov	%0, %4\n"
+	"4:	mov	%0, %5\n"
 	"	b	3b\n"
 	"	.popsection"
-	: "=&r" (val)
+	: "+r" (ret), "=&r" (val)
 	: "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT)
 	: "cc", "memory");
 
-	return val;
+	*uval = val;
+	return ret;
 }
 
 #endif /* !SMP */
diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h
index 08b3d1d..0548f8e 100644
--- a/arch/frv/include/asm/futex.h
+++ b/arch/frv/include/asm/futex.h
@@ -10,7 +10,8 @@
 extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr);
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
index c7f0f06..b072840 100644
--- a/arch/ia64/include/asm/futex.h
+++ b/arch/ia64/include/asm/futex.h
@@ -100,23 +100,26 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
 	{
-		register unsigned long r8 __asm ("r8");
+		register unsigned long r8 __asm ("r8") = 0;
+		unsigned long prev;
 		__asm__ __volatile__(
 			"	mf;;					\n"
 			"	mov ar.ccv=%3;;				\n"
 			"[1:]	cmpxchg4.acq %0=[%1],%2,ar.ccv		\n"
 			"	.xdata4 \"__ex_table\", 1b-., 2f-.	\n"
 			"[2:]"
-			: "=r" (r8)
+			: "=r" (prev)
 			: "r" (uaddr), "r" (newval),
 			  "rO" ((long) (unsigned) oldval)
 			: "memory");
+		*uval = prev;
 		return r8;
 	}
 }
diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h
index ad3fd61..fa019ed 100644
--- a/arch/microblaze/include/asm/futex.h
+++ b/arch/microblaze/include/asm/futex.h
@@ -94,31 +94,33 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
-	int prev, cmp;
+	int ret = 0, prev, cmp;
 
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	__asm__ __volatile__ ("1:	lwx	%0, %2, r0;		\
-					cmp	%1, %0, %3;		\
-					beqi	%1, 3f;			\
-				2:	swx	%4, %2, r0;		\
-					addic	%1, r0, 0;		\
-					bnei	%1, 1b;			\
+	__asm__ __volatile__ ("1:	lwx	%1, %3, r0;		\
+					cmp	%2, %1, %4;		\
+					beqi	%2, 3f;			\
+				2:	swx	%5, %3, r0;		\
+					addic	%2, r0, 0;		\
+					bnei	%2, 1b;			\
 				3:					\
 				.section .fixup,\"ax\";			\
 				4:	brid	3b;			\
-					addik	%0, r0, %5;		\
+					addik	%0, r0, %6;		\
 				.previous;				\
 				.section __ex_table,\"a\";		\
 				.word	1b,4b,2b,4b;			\
 				.previous;"				\
-		: "=&r" (prev), "=&r"(cmp)				\
+		: "+r" (ret), "=&r" (prev), "=&r"(cmp)	\
 		: "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT));
 
-	return prev;
+	*uval = prev;
+	return ret;
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
index b9cce90..692a24b 100644
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -132,9 +132,10 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
-	int retval;
+	int ret = 0, val;
 
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
@@ -145,25 +146,25 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 		"	.set	push					\n"
 		"	.set	noat					\n"
 		"	.set	mips3					\n"
-		"1:	ll	%0, %2					\n"
-		"	bne	%0, %z3, 3f				\n"
+		"1:	ll	%1, %3					\n"
+		"	bne	%1, %z4, 3f				\n"
 		"	.set	mips0					\n"
-		"	move	$1, %z4					\n"
+		"	move	$1, %z5					\n"
 		"	.set	mips3					\n"
-		"2:	sc	$1, %1					\n"
+		"2:	sc	$1, %2					\n"
 		"	beqzl	$1, 1b					\n"
 		__WEAK_LLSC_MB
 		"3:							\n"
 		"	.set	pop					\n"
 		"	.section .fixup,\"ax\"				\n"
-		"4:	li	%0, %5					\n"
+		"4:	li	%0, %6					\n"
 		"	j	3b					\n"
 		"	.previous					\n"
 		"	.section __ex_table,\"a\"			\n"
 		"	"__UA_ADDR "\t1b, 4b				\n"
 		"	"__UA_ADDR "\t2b, 4b				\n"
 		"	.previous					\n"
-		: "=&r" (retval), "=R" (*uaddr)
+		: "+r" (ret), "=&r" (val), "=R" (*uaddr)
 		: "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT)
 		: "memory");
 	} else if (cpu_has_llsc) {
@@ -172,31 +173,32 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 		"	.set	push					\n"
 		"	.set	noat					\n"
 		"	.set	mips3					\n"
-		"1:	ll	%0, %2					\n"
-		"	bne	%0, %z3, 3f				\n"
+		"1:	ll	%1, %3					\n"
+		"	bne	%1, %z4, 3f				\n"
 		"	.set	mips0					\n"
-		"	move	$1, %z4					\n"
+		"	move	$1, %z5					\n"
 		"	.set	mips3					\n"
-		"2:	sc	$1, %1					\n"
+		"2:	sc	$1, %2					\n"
 		"	beqz	$1, 1b					\n"
 		__WEAK_LLSC_MB
 		"3:							\n"
 		"	.set	pop					\n"
 		"	.section .fixup,\"ax\"				\n"
-		"4:	li	%0, %5					\n"
+		"4:	li	%0, %6					\n"
 		"	j	3b					\n"
 		"	.previous					\n"
 		"	.section __ex_table,\"a\"			\n"
 		"	"__UA_ADDR "\t1b, 4b				\n"
 		"	"__UA_ADDR "\t2b, 4b				\n"
 		"	.previous					\n"
-		: "=&r" (retval), "=R" (*uaddr)
+		: "+r" (ret), "=&r" (val), "=R" (*uaddr)
 		: "R" (*uaddr), "Jr" (oldval), "Jr" (newval), "i" (-EFAULT)
 		: "memory");
 	} else
 		return -ENOSYS;
 
-	return retval;
+	*uval = val;
+	return ret;
 }
 
 #endif
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 0c705c3..4c6d867 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -51,10 +51,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 
 /* Non-atomic version */
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
-	int err = 0;
-	int uval;
+	int val;
 
 	/* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is
 	 * our gateway page, and causes no end of trouble...
@@ -65,12 +65,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	err = get_user(uval, uaddr);
-	if (err) return -EFAULT;
-	if (uval == oldval)
-		err = put_user(newval, uaddr);
-	if (err) return -EFAULT;
-	return uval;
+	if (get_user(val, uaddr))
+		return -EFAULT;
+	if (val == oldval && put_user(newval, uaddr))
+		return -EFAULT;
+	*uval = val;
+	return 0;
 }
 
 #endif /*__KERNEL__*/
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index 7c589ef..631e8da 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -82,35 +82,37 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
-	int prev;
+	int ret = 0, prev;
 
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
         __asm__ __volatile__ (
         PPC_RELEASE_BARRIER
-"1:     lwarx   %0,0,%2         # futex_atomic_cmpxchg_inatomic\n\
-        cmpw    0,%0,%3\n\
+"1:     lwarx   %1,0,%3         # futex_atomic_cmpxchg_inatomic\n\
+        cmpw    0,%1,%4\n\
         bne-    3f\n"
-        PPC405_ERR77(0,%2)
-"2:     stwcx.  %4,0,%2\n\
+        PPC405_ERR77(0,%3)
+"2:     stwcx.  %5,0,%3\n\
         bne-    1b\n"
         PPC_ACQUIRE_BARRIER
 "3:	.section .fixup,\"ax\"\n\
-4:	li	%0,%5\n\
+4:	li	%0,%6\n\
 	b	3b\n\
 	.previous\n\
 	.section __ex_table,\"a\"\n\
 	.align 3\n\
 	" PPC_LONG "1b,4b,2b,4b\n\
 	.previous" \
-        : "=&r" (prev), "+m" (*uaddr)
+        : "+r" (ret), "=&r" (prev), "+m" (*uaddr)
         : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT)
         : "cc", "memory");
 
-        return prev;
+	*uval = prev;
+        return ret;
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index 5c5d02d..27ac515 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -39,13 +39,13 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	return ret;
 }
 
-static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr,
+static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
 						int oldval, int newval)
 {
 	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	return uaccess.futex_atomic_cmpxchg(uaddr, oldval, newval);
+	return uaccess.futex_atomic_cmpxchg(uval, uaddr, oldval, newval);
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index d6b1ed0..549adf6 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -84,7 +84,7 @@ struct uaccess_ops {
 	size_t (*strnlen_user)(size_t, const char __user *);
 	size_t (*strncpy_from_user)(size_t, const char __user *, char *);
 	int (*futex_atomic_op)(int op, int __user *, int oparg, int *old);
-	int (*futex_atomic_cmpxchg)(int __user *, int old, int new);
+	int (*futex_atomic_cmpxchg)(int *, int __user *, int old, int new);
 };
 
 extern struct uaccess_ops uaccess;
diff --git a/arch/s390/lib/uaccess.h b/arch/s390/lib/uaccess.h
index 126011d..89a8067 100644
--- a/arch/s390/lib/uaccess.h
+++ b/arch/s390/lib/uaccess.h
@@ -12,12 +12,12 @@ extern size_t copy_from_user_std(size_t, const void __user *, void *);
 extern size_t copy_to_user_std(size_t, void __user *, const void *);
 extern size_t strnlen_user_std(size_t, const char __user *);
 extern size_t strncpy_from_user_std(size_t, const char __user *, char *);
-extern int futex_atomic_cmpxchg_std(int __user *, int, int);
+extern int futex_atomic_cmpxchg_std(int *, int __user *, int, int);
 extern int futex_atomic_op_std(int, int __user *, int, int *);
 
 extern size_t copy_from_user_pt(size_t, const void __user *, void *);
 extern size_t copy_to_user_pt(size_t, void __user *, const void *);
 extern int futex_atomic_op_pt(int, int __user *, int, int *);
-extern int futex_atomic_cmpxchg_pt(int __user *, int, int);
+extern int futex_atomic_cmpxchg_pt(int *, int __user *, int, int);
 
 #endif /* __ARCH_S390_LIB_UACCESS_H */
diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c
index 404f2de..b3cebcd 100644
--- a/arch/s390/lib/uaccess_pt.c
+++ b/arch/s390/lib/uaccess_pt.c
@@ -354,26 +354,29 @@ int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old)
 	return ret;
 }
 
-static int __futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval)
+static int __futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr,
+				     int oldval, int newval)
 {
 	int ret;
 
 	asm volatile("0: cs   %1,%4,0(%5)\n"
-		     "1: lr   %0,%1\n"
+		     "1: la   %0,0\n"
 		     "2:\n"
 		     EX_TABLE(0b,2b) EX_TABLE(1b,2b)
 		     : "=d" (ret), "+d" (oldval), "=m" (*uaddr)
 		     : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr)
 		     : "cc", "memory" );
+	*uval = oldval;
 	return ret;
 }
 
-int futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval)
+int futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr,
+			    int oldval, int newval)
 {
 	int ret;
 
 	if (segment_eq(get_fs(), KERNEL_DS))
-		return __futex_atomic_cmpxchg_pt(uaddr, oldval, newval);
+		return __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval);
 	spin_lock(&current->mm->page_table_lock);
 	uaddr = (int __user *) __dat_user_addr((unsigned long) uaddr);
 	if (!uaddr) {
@@ -382,7 +385,7 @@ int futex_atomic_cmpxchg_pt(int __user *uaddr, int oldval, int newval)
 	}
 	get_page(virt_to_page(uaddr));
 	spin_unlock(&current->mm->page_table_lock);
-	ret = __futex_atomic_cmpxchg_pt(uaddr, oldval, newval);
+	ret = __futex_atomic_cmpxchg_pt(uval, uaddr, oldval, newval);
 	put_page(virt_to_page(uaddr));
 	return ret;
 }
diff --git a/arch/s390/lib/uaccess_std.c b/arch/s390/lib/uaccess_std.c
index a6c4f7e..1d6643c 100644
--- a/arch/s390/lib/uaccess_std.c
+++ b/arch/s390/lib/uaccess_std.c
@@ -287,19 +287,21 @@ int futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old)
 	return ret;
 }
 
-int futex_atomic_cmpxchg_std(int __user *uaddr, int oldval, int newval)
+int futex_atomic_cmpxchg_std(int *uval, int __user *uaddr,
+			     int oldval, int newval)
 {
 	int ret;
 
 	asm volatile(
 		"   sacf 256\n"
 		"0: cs   %1,%4,0(%5)\n"
-		"1: lr   %0,%1\n"
+		"1: la   %0,0\n"
 		"2: sacf 0\n"
 		EX_TABLE(0b,2b) EX_TABLE(1b,2b)
 		: "=d" (ret), "+d" (oldval), "=m" (*uaddr)
 		: "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr)
 		: "cc", "memory" );
+	*uval = oldval;
 	return ret;
 }
 
diff --git a/arch/sh/include/asm/futex-irq.h b/arch/sh/include/asm/futex-irq.h
index a9f16a7..7b701cb 100644
--- a/arch/sh/include/asm/futex-irq.h
+++ b/arch/sh/include/asm/futex-irq.h
@@ -88,7 +88,8 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr,
 	return ret;
 }
 
-static inline int atomic_futex_op_cmpxchg_inatomic(int __user *uaddr,
+static inline int atomic_futex_op_cmpxchg_inatomic(int *uval,
+						   int __user *uaddr,
 						   int oldval, int newval)
 {
 	unsigned long flags;
@@ -102,10 +103,8 @@ static inline int atomic_futex_op_cmpxchg_inatomic(int __user *uaddr,
 
 	local_irq_restore(flags);
 
-	if (ret)
-		return ret;
-
-	return prev;
+	*uval = prev;
+	return ret;
 }
 
 #endif /* __ASM_SH_FUTEX_IRQ_H */
diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h
index 68256ec..a8a5125 100644
--- a/arch/sh/include/asm/futex.h
+++ b/arch/sh/include/asm/futex.h
@@ -65,12 +65,13 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	return atomic_futex_op_cmpxchg_inatomic(uaddr, oldval, newval);
+	return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval);
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h
index 47f9583..e086220 100644
--- a/arch/sparc/include/asm/futex_64.h
+++ b/arch/sparc/include/asm/futex_64.h
@@ -85,26 +85,30 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
+	int ret = 0;
+
 	__asm__ __volatile__(
-	"\n1:	casa	[%3] %%asi, %2, %0\n"
+	"\n1:	casa	[%4] %%asi, %3, %1\n"
 	"2:\n"
 	"	.section .fixup,#alloc,#execinstr\n"
 	"	.align	4\n"
 	"3:	sethi	%%hi(2b), %0\n"
 	"	jmpl	%0 + %%lo(2b), %%g0\n"
-	"	 mov	%4, %0\n"
+	"	mov	%5, %0\n"
 	"	.previous\n"
 	"	.section __ex_table,\"a\"\n"
 	"	.align	4\n"
 	"	.word	1b, 3b\n"
 	"	.previous\n"
-	: "=r" (newval)
-	: "0" (newval), "r" (oldval), "r" (uaddr), "i" (-EFAULT)
+	: "+r" (ret), "=r" (newval)
+	: "1" (newval), "r" (oldval), "r" (uaddr), "i" (-EFAULT)
 	: "memory");
 
-	return newval;
+	*uval = newval;
+	return ret;
 }
 
 #endif /* !(_SPARC64_FUTEX_H) */
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index fe0d10d..664b20a 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -119,8 +119,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	return ret;
 }
 
-static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
-						int newval)
+static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+						int oldval, int newval)
 {
 	struct __get_user asm_ret;
 
@@ -128,7 +128,8 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
 		return -EFAULT;
 
 	asm_ret = futex_cmpxchg(uaddr, oldval, newval);
-	return asm_ret.err ? asm_ret.err : asm_ret.val;
+	*uval = asm_ret.val;
+	return asm_ret.err;
 }
 
 #ifndef __tilegx__
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 1f11ce4..884c0b5 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	return ret;
 }
 
-static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
-						int newval)
+static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+						int oldval, int newval)
 {
+	int ret = 0;
 
 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
 	/* Real i386 machines have no cmpxchg instruction */
@@ -122,18 +123,19 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n"
+	asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
 		     "2:\t.section .fixup, \"ax\"\n"
-		     "3:\tmov     %2, %0\n"
+		     "3:\tmov     %3, %0\n"
 		     "\tjmp     2b\n"
 		     "\t.previous\n"
 		     _ASM_EXTABLE(1b, 3b)
-		     : "=a" (oldval), "+m" (*uaddr)
-		     : "i" (-EFAULT), "r" (newval), "0" (oldval)
+		     : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
+		     : "i" (-EFAULT), "r" (newval), "1" (oldval)
 		     : "memory"
 	);
 
-	return oldval;
+	*uval = oldval;
+	return ret;
 }
 
 #endif
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index 3c2344f..132bf52 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -48,7 +48,8 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
+			      int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/kernel/futex.c b/kernel/futex.c
index 7738154..237f14b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
 	return NULL;
 }
 
-static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+				      u32 uval, u32 newval)
 {
-	u32 curval;
+	int ret;
 
 	pagefault_disable();
-	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
 	pagefault_enable();
 
-	return curval;
+	return ret;
 }
 
 static int get_futex_value_locked(u32 *dest, u32 __user *from)
@@ -688,9 +689,7 @@ retry:
 	if (set_waiters)
 		newval |= FUTEX_WAITERS;
 
-	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
-	if (unlikely(curval == -EFAULT))
+	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
 		return -EFAULT;
 
 	/*
@@ -728,9 +727,7 @@ retry:
 		lock_taken = 1;
 	}
 
-	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-	if (unlikely(curval == -EFAULT))
+	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
 		return -EFAULT;
 	if (unlikely(curval != uval))
 		goto retry;
@@ -843,9 +840,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 
 		newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
-		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-		if (curval == -EFAULT)
+		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
 			ret = -EFAULT;
 		else if (curval != uval)
 			ret = -EINVAL;
@@ -880,10 +875,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 	 * There is no waiter, so we unlock the futex. The owner died
 	 * bit has not to be preserved here. We are the owner:
 	 */
-	oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
-
-	if (oldval == -EFAULT)
-		return oldval;
+	if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+		return -EFAULT;
 	if (oldval != uval)
 		return -EAGAIN;
 
@@ -1578,9 +1571,7 @@ retry:
 	while (1) {
 		newval = (uval & FUTEX_OWNER_DIED) | newtid;
 
-		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-		if (curval == -EFAULT)
+		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
 			goto handle_fault;
 		if (curval == uval)
 			break;
@@ -2073,11 +2064,8 @@ retry:
 	 * again. If it succeeds then we can return without waking
 	 * anyone else up:
 	 */
-	if (!(uval & FUTEX_OWNER_DIED))
-		uval = cmpxchg_futex_value_locked(uaddr, vpid, 0);
-
-
-	if (unlikely(uval == -EFAULT))
+	if (!(uval & FUTEX_OWNER_DIED) &&
+	    cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
 		goto pi_faulted;
 	/*
 	 * Rare case: we managed to release the lock atomically,
@@ -2464,9 +2452,7 @@ retry:
 		 * userspace.
 		 */
 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
-		if (nval == -EFAULT)
+		if (futex_atomic_cmpxchg_inatomic(&nval, uaddr, uval, mval))
 			return -1;
 
 		if (nval != uval)
@@ -2679,8 +2665,7 @@ static int __init futex_init(void)
 	 * implementation, the non-functional ones will return
 	 * -ENOSYS.
 	 */
-	curval = cmpxchg_futex_value_locked(NULL, 0, 0);
-	if (curval == -EFAULT)
+	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
 		futex_cmpxchg_enabled = 1;
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
-- 
cgit v0.10.2


From 8d7718aa082aaf30a0b4989e1f04858952f941bc Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Thu, 10 Mar 2011 18:50:58 -0800
Subject: futex: Sanitize futex ops argument types

Change futex_atomic_op_inuser and futex_atomic_cmpxchg_inatomic
prototypes to use u32 types for the futex as this is the data type the
futex core code uses all over the place.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Darren Hart <darren@dvhart.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110311025058.GD26122@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h
index c4e5c28..e8a761a 100644
--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -29,7 +29,7 @@
 	:	"r" (uaddr), "r"(oparg)				\
 	:	"memory")
 
-static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -39,7 +39,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -81,12 +81,13 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
-	int ret = 0, prev, cmp;
+	int ret = 0, cmp;
+	u32 prev;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	__asm__ __volatile__ (
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index d20b78f..0e29d8e 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -35,7 +35,7 @@
 	: "cc", "memory")
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -46,7 +46,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();	/* implies preempt_disable() */
@@ -88,12 +88,13 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
-	int ret = 0, val;
+	int ret = 0;
+	u32 val;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	/* Note that preemption is disabled by futex_atomic_cmpxchg_inatomic
diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h
index 0548f8e..4bea27f 100644
--- a/arch/frv/include/asm/futex.h
+++ b/arch/frv/include/asm/futex.h
@@ -7,11 +7,11 @@
 #include <asm/errno.h>
 #include <asm/uaccess.h>
 
-extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr);
+extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr);
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
 	return -ENOSYS;
 }
diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
index 14f64b0..d155ca9 100644
--- a/arch/frv/kernel/futex.c
+++ b/arch/frv/kernel/futex.c
@@ -18,7 +18,7 @@
  * the various futex operations; MMU fault checking is ignored under no-MMU
  * conditions
  */
-static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_oldval)
+static inline int atomic_futex_op_xchg_set(int oparg, u32 __user *uaddr, int *_oldval)
 {
 	int oldval, ret;
 
@@ -50,7 +50,7 @@ static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_o
 	return ret;
 }
 
-static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_oldval)
+static inline int atomic_futex_op_xchg_add(int oparg, u32 __user *uaddr, int *_oldval)
 {
 	int oldval, ret;
 
@@ -83,7 +83,7 @@ static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_o
 	return ret;
 }
 
-static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_oldval)
+static inline int atomic_futex_op_xchg_or(int oparg, u32 __user *uaddr, int *_oldval)
 {
 	int oldval, ret;
 
@@ -116,7 +116,7 @@ static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_ol
 	return ret;
 }
 
-static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_oldval)
+static inline int atomic_futex_op_xchg_and(int oparg, u32 __user *uaddr, int *_oldval)
 {
 	int oldval, ret;
 
@@ -149,7 +149,7 @@ static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_o
 	return ret;
 }
 
-static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_oldval)
+static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_oldval)
 {
 	int oldval, ret;
 
@@ -186,7 +186,7 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_o
 /*
  * do the futex operations
  */
-int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -197,7 +197,7 @@ int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
index b072840..8428525 100644
--- a/arch/ia64/include/asm/futex.h
+++ b/arch/ia64/include/asm/futex.h
@@ -46,7 +46,7 @@ do {									\
 } while (0)
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -56,7 +56,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -100,10 +100,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	{
diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h
index fa019ed..b0526d2 100644
--- a/arch/microblaze/include/asm/futex.h
+++ b/arch/microblaze/include/asm/futex.h
@@ -29,7 +29,7 @@
 })
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -39,7 +39,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -94,12 +94,13 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
-	int ret = 0, prev, cmp;
+	int ret = 0, cmp;
+	u32 prev;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	__asm__ __volatile__ ("1:	lwx	%1, %3, r0;		\
diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
index 692a24b..6ebf173 100644
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -75,7 +75,7 @@
 }
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -85,7 +85,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -132,12 +132,13 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
-	int ret = 0, val;
+	int ret = 0;
+	u32 val;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	if (cpu_has_llsc && R10000_LLSC_WAR) {
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 4c6d867..67a33cc 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -8,7 +8,7 @@
 #include <asm/errno.h>
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -18,7 +18,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -51,10 +51,10 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 
 /* Non-atomic version */
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
-	int val;
+	u32 val;
 
 	/* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is
 	 * our gateway page, and causes no end of trouble...
@@ -62,7 +62,7 @@ futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
 	if (segment_eq(KERNEL_DS, get_fs()) && !uaddr)
 		return -EFAULT;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	if (get_user(val, uaddr))
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index 631e8da..c94e4a3 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -30,7 +30,7 @@
 	: "b" (uaddr), "i" (-EFAULT), "r" (oparg) \
 	: "cr0", "memory")
 
-static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -40,7 +40,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -82,12 +82,13 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
-	int ret = 0, prev;
+	int ret = 0;
+	u32 prev;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
         __asm__ __volatile__ (
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index 27ac515..81cf36b 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -7,7 +7,7 @@
 #include <linux/uaccess.h>
 #include <asm/errno.h>
 
-static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -18,7 +18,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -39,10 +39,10 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	return ret;
 }
 
-static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-						int oldval, int newval)
+static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+						u32 oldval, u32 newval)
 {
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	return uaccess.futex_atomic_cmpxchg(uval, uaddr, oldval, newval);
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 549adf6..2d9ea11f 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -83,8 +83,8 @@ struct uaccess_ops {
 	size_t (*clear_user)(size_t, void __user *);
 	size_t (*strnlen_user)(size_t, const char __user *);
 	size_t (*strncpy_from_user)(size_t, const char __user *, char *);
-	int (*futex_atomic_op)(int op, int __user *, int oparg, int *old);
-	int (*futex_atomic_cmpxchg)(int *, int __user *, int old, int new);
+	int (*futex_atomic_op)(int op, u32 __user *, int oparg, int *old);
+	int (*futex_atomic_cmpxchg)(u32 *, u32 __user *, u32 old, u32 new);
 };
 
 extern struct uaccess_ops uaccess;
diff --git a/arch/s390/lib/uaccess.h b/arch/s390/lib/uaccess.h
index 89a8067..1d2536c 100644
--- a/arch/s390/lib/uaccess.h
+++ b/arch/s390/lib/uaccess.h
@@ -12,12 +12,12 @@ extern size_t copy_from_user_std(size_t, const void __user *, void *);
 extern size_t copy_to_user_std(size_t, void __user *, const void *);
 extern size_t strnlen_user_std(size_t, const char __user *);
 extern size_t strncpy_from_user_std(size_t, const char __user *, char *);
-extern int futex_atomic_cmpxchg_std(int *, int __user *, int, int);
-extern int futex_atomic_op_std(int, int __user *, int, int *);
+extern int futex_atomic_cmpxchg_std(u32 *, u32 __user *, u32, u32);
+extern int futex_atomic_op_std(int, u32 __user *, int, int *);
 
 extern size_t copy_from_user_pt(size_t, const void __user *, void *);
 extern size_t copy_to_user_pt(size_t, void __user *, const void *);
-extern int futex_atomic_op_pt(int, int __user *, int, int *);
-extern int futex_atomic_cmpxchg_pt(int *, int __user *, int, int);
+extern int futex_atomic_op_pt(int, u32 __user *, int, int *);
+extern int futex_atomic_cmpxchg_pt(u32 *, u32 __user *, u32, u32);
 
 #endif /* __ARCH_S390_LIB_UACCESS_H */
diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c
index b3cebcd..7483383 100644
--- a/arch/s390/lib/uaccess_pt.c
+++ b/arch/s390/lib/uaccess_pt.c
@@ -302,7 +302,7 @@ fault:
 		     : "0" (-EFAULT), "d" (oparg), "a" (uaddr),		\
 		       "m" (*uaddr) : "cc" );
 
-static int __futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old)
+static int __futex_atomic_op_pt(int op, u32 __user *uaddr, int oparg, int *old)
 {
 	int oldval = 0, newval, ret;
 
@@ -335,7 +335,7 @@ static int __futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old)
 	return ret;
 }
 
-int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old)
+int futex_atomic_op_pt(int op, u32 __user *uaddr, int oparg, int *old)
 {
 	int ret;
 
@@ -354,8 +354,8 @@ int futex_atomic_op_pt(int op, int __user *uaddr, int oparg, int *old)
 	return ret;
 }
 
-static int __futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr,
-				     int oldval, int newval)
+static int __futex_atomic_cmpxchg_pt(u32 *uval, u32 __user *uaddr,
+				     u32 oldval, u32 newval)
 {
 	int ret;
 
@@ -370,8 +370,8 @@ static int __futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr,
 	return ret;
 }
 
-int futex_atomic_cmpxchg_pt(int *uval, int __user *uaddr,
-			    int oldval, int newval)
+int futex_atomic_cmpxchg_pt(u32 *uval, u32 __user *uaddr,
+			    u32 oldval, u32 newval)
 {
 	int ret;
 
diff --git a/arch/s390/lib/uaccess_std.c b/arch/s390/lib/uaccess_std.c
index 1d6643c..bb1a7ee 100644
--- a/arch/s390/lib/uaccess_std.c
+++ b/arch/s390/lib/uaccess_std.c
@@ -255,7 +255,7 @@ size_t strncpy_from_user_std(size_t size, const char __user *src, char *dst)
 		: "0" (-EFAULT), "d" (oparg), "a" (uaddr),		\
 		  "m" (*uaddr) : "cc");
 
-int futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old)
+int futex_atomic_op_std(int op, u32 __user *uaddr, int oparg, int *old)
 {
 	int oldval = 0, newval, ret;
 
@@ -287,8 +287,8 @@ int futex_atomic_op_std(int op, int __user *uaddr, int oparg, int *old)
 	return ret;
 }
 
-int futex_atomic_cmpxchg_std(int *uval, int __user *uaddr,
-			     int oldval, int newval)
+int futex_atomic_cmpxchg_std(u32 *uval, u32 __user *uaddr,
+			     u32 oldval, u32 newval)
 {
 	int ret;
 
diff --git a/arch/sh/include/asm/futex-irq.h b/arch/sh/include/asm/futex-irq.h
index 7b701cb..6cb9f19 100644
--- a/arch/sh/include/asm/futex-irq.h
+++ b/arch/sh/include/asm/futex-irq.h
@@ -3,7 +3,7 @@
 
 #include <asm/system.h>
 
-static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr,
+static inline int atomic_futex_op_xchg_set(int oparg, u32 __user *uaddr,
 					   int *oldval)
 {
 	unsigned long flags;
@@ -20,7 +20,7 @@ static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr,
 	return ret;
 }
 
-static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr,
+static inline int atomic_futex_op_xchg_add(int oparg, u32 __user *uaddr,
 					   int *oldval)
 {
 	unsigned long flags;
@@ -37,7 +37,7 @@ static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr,
 	return ret;
 }
 
-static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr,
+static inline int atomic_futex_op_xchg_or(int oparg, u32 __user *uaddr,
 					  int *oldval)
 {
 	unsigned long flags;
@@ -54,7 +54,7 @@ static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr,
 	return ret;
 }
 
-static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr,
+static inline int atomic_futex_op_xchg_and(int oparg, u32 __user *uaddr,
 					   int *oldval)
 {
 	unsigned long flags;
@@ -71,7 +71,7 @@ static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr,
 	return ret;
 }
 
-static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr,
+static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr,
 					   int *oldval)
 {
 	unsigned long flags;
@@ -88,12 +88,13 @@ static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr,
 	return ret;
 }
 
-static inline int atomic_futex_op_cmpxchg_inatomic(int *uval,
-						   int __user *uaddr,
-						   int oldval, int newval)
+static inline int atomic_futex_op_cmpxchg_inatomic(u32 *uval,
+						   u32 __user *uaddr,
+						   u32 oldval, u32 newval)
 {
 	unsigned long flags;
-	int ret, prev = 0;
+	int ret;
+	u32 prev = 0;
 
 	local_irq_save(flags);
 
diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h
index a8a5125..7be39a6 100644
--- a/arch/sh/include/asm/futex.h
+++ b/arch/sh/include/asm/futex.h
@@ -10,7 +10,7 @@
 /* XXX: UP variants, fix for SH-4A and SMP.. */
 #include <asm/futex-irq.h>
 
-static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -21,7 +21,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -65,10 +65,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval);
diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h
index e086220..444e7be 100644
--- a/arch/sparc/include/asm/futex_64.h
+++ b/arch/sparc/include/asm/futex_64.h
@@ -30,7 +30,7 @@
 	: "r" (uaddr), "r" (oparg), "i" (-EFAULT)	\
 	: "memory")
 
-static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -38,7 +38,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tem;
 
-	if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(int))))
+	if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
 		return -EFAULT;
 	if (unlikely((((unsigned long) uaddr) & 0x3UL)))
 		return -EINVAL;
@@ -85,8 +85,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
 	int ret = 0;
 
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index 664b20a..d03ec12 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -29,16 +29,16 @@
 #include <linux/uaccess.h>
 #include <linux/errno.h>
 
-extern struct __get_user futex_set(int __user *v, int i);
-extern struct __get_user futex_add(int __user *v, int n);
-extern struct __get_user futex_or(int __user *v, int n);
-extern struct __get_user futex_andn(int __user *v, int n);
-extern struct __get_user futex_cmpxchg(int __user *v, int o, int n);
+extern struct __get_user futex_set(u32 __user *v, int i);
+extern struct __get_user futex_add(u32 __user *v, int n);
+extern struct __get_user futex_or(u32 __user *v, int n);
+extern struct __get_user futex_andn(u32 __user *v, int n);
+extern struct __get_user futex_cmpxchg(u32 __user *v, int o, int n);
 
 #ifndef __tilegx__
-extern struct __get_user futex_xor(int __user *v, int n);
+extern struct __get_user futex_xor(u32 __user *v, int n);
 #else
-static inline struct __get_user futex_xor(int __user *uaddr, int n)
+static inline struct __get_user futex_xor(u32 __user *uaddr, int n)
 {
 	struct __get_user asm_ret = __get_user_4(uaddr);
 	if (!asm_ret.err) {
@@ -53,7 +53,7 @@ static inline struct __get_user futex_xor(int __user *uaddr, int n)
 }
 #endif
 
-static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -65,7 +65,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -119,12 +119,12 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	return ret;
 }
 
-static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-						int oldval, int newval)
+static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+						u32 oldval, u32 newval)
 {
 	struct __get_user asm_ret;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	asm_ret = futex_cmpxchg(uaddr, oldval, newval);
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 884c0b5..d09bb03 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -37,7 +37,7 @@
 		       "+m" (*uaddr), "=&r" (tem)		\
 		     : "r" (oparg), "i" (-EFAULT), "1" (0))
 
-static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
@@ -109,8 +109,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 	return ret;
 }
 
-static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-						int oldval, int newval)
+static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+						u32 oldval, u32 newval)
 {
 	int ret = 0;
 
@@ -120,7 +120,7 @@ static inline int futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
 		return -ENOSYS;
 #endif
 
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index 132bf52..01f227e 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -6,7 +6,7 @@
 #include <asm/errno.h>
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 {
 	int op = (encoded_op >> 28) & 7;
 	int cmp = (encoded_op >> 24) & 15;
@@ -16,7 +16,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	pagefault_disable();
@@ -48,8 +48,8 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inatomic(int *uval, int __user *uaddr,
-			      int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
 {
 	return -ENOSYS;
 }
-- 
cgit v0.10.2


From d9936bb3952a08d701f7b03f8f62d158f94d8085 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 11 Mar 2011 14:15:35 +0100
Subject: genirq: Add desc->irq_data accessor

We have accessors for all fields in irq_data based on irq_desc, but
not for irq_data itself.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 9eb9cd3..0021837 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -109,6 +109,11 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 
+static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc)
+{
+	return &desc->irq_data;
+}
+
 static inline struct irq_chip *irq_desc_get_chip(struct irq_desc *desc)
 {
 	return desc->irq_data.chip;
-- 
cgit v0.10.2


From 86b32122fd54addc9af01f8b919c65d3f49090a3 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 9 Mar 2011 22:03:16 -0500
Subject: xen/e820: Don't mark balloon memory as E820_UNUSABLE when running as
 guest and fix overflow.

If we have a guest that asked for:

memory=1024
maxmem=2048

Which means we want 1GB now, and create pagetables so that we can expand
up to 2GB, we would have this E820 layout:

[    0.000000] BIOS-provided physical RAM map:
[    0.000000]  Xen: 0000000000000000 - 00000000000a0000 (usable)
[    0.000000]  Xen: 00000000000a0000 - 0000000000100000 (reserved)
[    0.000000]  Xen: 0000000000100000 - 0000000080800000 (usable)

Due to patch: "xen/setup: Inhibit resource API from using System RAM E820 gaps as PCI mem gaps."
we would mark the memory past the 1GB mark as unusuable resulting in:

[    0.000000] BIOS-provided physical RAM map:
[    0.000000]  Xen: 0000000000000000 - 00000000000a0000 (usable)
[    0.000000]  Xen: 00000000000a0000 - 0000000000100000 (reserved)
[    0.000000]  Xen: 0000000000100000 - 0000000040000000 (usable)
[    0.000000]  Xen: 0000000040000000 - 0000000080800000 (unusable)

which meant that we could not balloon up anymore. We could
balloon the guest down. The fix is to run the code introduced
by the above mentioned patch only for the initial domain.

We will have to revisit this once we start introducing a modified
E820 for PCI passthrough so that we can utilize the P2M identity code.

We also fix an overflow by having UL instead of ULL on 32-bit machines.

[v2: Ian pointed to the overflow issue]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 2a4add9..010ba2e 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -200,7 +200,8 @@ char * __init xen_memory_setup(void)
 			 * used as potential resource for I/O address (happens
 			 * when 'allocate_resource' is called).
 			 */
-			if (delta && end < 0x100000000UL)
+			if (delta &&
+				(xen_initial_domain() && end < 0x100000000ULL))
 				e820_add_region(end, delta, E820_UNUSABLE);
 		}
 
-- 
cgit v0.10.2


From 2e12978a9f7a7abd54e8eb9ce70a7718767b8b2c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 22 Dec 2010 14:18:50 +0800
Subject: futex,plist: Pass the real head of the priority list to plist_del()

Some plist_del()s in kernel/futex.c are passed a faked head of the
priority list.

It does not fail because the current code does not require the real head
in plist_del(). The current code of plist_del() just uses the head for checking,
so it will not cause a bad result even when we use a faked head.

But it is undocumented usage:

/**
 * plist_del - Remove a @node from plist.
 *
 * @node:	&struct plist_node pointer - entry to be removed
 * @head:	&struct plist_head pointer - list head
 */

The document says that the @head is the "list head" head of the priority list.

In futex code, several places use "plist_del(&q->list, &q->list.plist);",
they pass a fake head. We need to fix them all.

Thanks to Darren Hart for many suggestions.

Acked-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by:  Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4D11984A.5030203@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28..6feeea4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -775,6 +775,24 @@ retry:
 	return ret;
 }
 
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q:	The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+	struct futex_hash_bucket *hb;
+
+	if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
+			|| plist_node_empty(&q->list)))
+		return;
+
+	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+	plist_del(&q->list, &hb->chain);
+}
+
 /*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
@@ -792,7 +810,7 @@ static void wake_futex(struct futex_q *q)
 	 */
 	get_task_struct(p);
 
-	plist_del(&q->list, &q->list.plist);
+	__unqueue_futex(q);
 	/*
 	 * The waiting task can free the futex_q as soon as
 	 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -1100,8 +1118,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 	get_futex_key_refs(key);
 	q->key = *key;
 
-	WARN_ON(plist_node_empty(&q->list));
-	plist_del(&q->list, &q->list.plist);
+	__unqueue_futex(q);
 
 	WARN_ON(!q->rt_waiter);
 	q->rt_waiter = NULL;
@@ -1504,8 +1521,7 @@ retry:
 			spin_unlock(lock_ptr);
 			goto retry;
 		}
-		WARN_ON(plist_node_empty(&q->list));
-		plist_del(&q->list, &q->list.plist);
+		__unqueue_futex(q);
 
 		BUG_ON(q->pi_state);
 
@@ -1525,8 +1541,7 @@ retry:
 static void unqueue_me_pi(struct futex_q *q)
 	__releases(q->lock_ptr)
 {
-	WARN_ON(plist_node_empty(&q->list));
-	plist_del(&q->list, &q->list.plist);
+	__unqueue_futex(q);
 
 	BUG_ON(!q->pi_state);
 	free_pi_state(q->pi_state);
@@ -2167,7 +2182,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 		 * We were woken prior to requeue by a timeout or a signal.
 		 * Unqueue the futex_q and determine which it was.
 		 */
-		plist_del(&q->list, &q->list.plist);
+		plist_del(&q->list, &hb->chain);
 
 		/* Handle spurious wakeups gracefully */
 		ret = -EWOULDBLOCK;
-- 
cgit v0.10.2


From 017f2b239dabb2740b91df162e004371b861f371 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 21 Dec 2010 17:55:10 +0800
Subject: futex,plist: Remove debug lock assignment from plist_node

The original code uses &plist_node->plist as the fake head of
the priority list for plist_del(), these debug locks in
the fake head are needed for CONFIG_DEBUG_PI_LIST.

But now we always pass the real head to plist_del(), the debug locks
in plist_node will not be used, so we remove these assignments.

Acked-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by:  Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4D10797E.7040803@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/kernel/futex.c b/kernel/futex.c
index 6feeea4..9fe9131 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1089,9 +1089,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 		plist_del(&q->list, &hb1->chain);
 		plist_add(&q->list, &hb2->chain);
 		q->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-		q->list.plist.spinlock = &hb2->lock;
-#endif
 	}
 	get_futex_key_refs(key2);
 	q->key = *key2;
@@ -1124,9 +1121,6 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 	q->rt_waiter = NULL;
 
 	q->lock_ptr = &hb->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-	q->list.plist.spinlock = &hb->lock;
-#endif
 
 	wake_up_state(q->task, TASK_NORMAL);
 }
@@ -1474,9 +1468,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 	prio = min(current->normal_prio, MAX_RT_PRIO);
 
 	plist_node_init(&q->list, prio);
-#ifdef CONFIG_DEBUG_PI_LIST
-	q->list.plist.spinlock = &hb->lock;
-#endif
 	plist_add(&q->list, &hb->chain);
 	q->task = current;
 	spin_unlock(&hb->lock);
-- 
cgit v0.10.2


From bf6a9b8336ba12672755c2ae898b0abe42c7a5ac Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 21 Dec 2010 17:55:14 +0800
Subject: plist: Shrink struct plist_head

struct plist_head is used in struct task_struct as well as struct
rtmutex. If we can make it smaller, it will also make these structures
smaller as well.

The field prio_list in struct plist_head is seldom used and we can get
its information from the plist_nodes. Removing this field will decrease
the size of plist_head by half.

Signed-off-by:  Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4D107982.9090700@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/include/linux/plist.h b/include/linux/plist.h
index 7254eda..c9b9f32 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -31,15 +31,17 @@
  *
  * Simple ASCII art explanation:
  *
- * |HEAD          |
- * |              |
- * |prio_list.prev|<------------------------------------|
- * |prio_list.next|<->|pl|<->|pl|<--------------->|pl|<-|
- * |10            |   |10|   |21|   |21|   |21|   |40|   (prio)
- * |              |   |  |   |  |   |  |   |  |   |  |
- * |              |   |  |   |  |   |  |   |  |   |  |
- * |node_list.next|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
- * |node_list.prev|<------------------------------------|
+ * pl:prio_list (only for plist_node)
+ * nl:node_list
+ *   HEAD|             NODE(S)
+ *       |
+ *       ||------------------------------------|
+ *       ||->|pl|<->|pl|<--------------->|pl|<-|
+ *       |   |10|   |21|   |21|   |21|   |40|   (prio)
+ *       |   |  |   |  |   |  |   |  |   |  |
+ *       |   |  |   |  |   |  |   |  |   |  |
+ * |->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
+ * |-------------------------------------------|
  *
  * The nodes on the prio_list list are sorted by priority to simplify
  * the insertion of new nodes. There are no nodes with duplicate
@@ -78,7 +80,6 @@
 #include <linux/spinlock_types.h>
 
 struct plist_head {
-	struct list_head prio_list;
 	struct list_head node_list;
 #ifdef CONFIG_DEBUG_PI_LIST
 	raw_spinlock_t *rawlock;
@@ -88,7 +89,8 @@ struct plist_head {
 
 struct plist_node {
 	int			prio;
-	struct plist_head	plist;
+	struct list_head	prio_list;
+	struct list_head	node_list;
 };
 
 #ifdef CONFIG_DEBUG_PI_LIST
@@ -100,7 +102,6 @@ struct plist_node {
 #endif
 
 #define _PLIST_HEAD_INIT(head)				\
-	.prio_list = LIST_HEAD_INIT((head).prio_list),	\
 	.node_list = LIST_HEAD_INIT((head).node_list)
 
 /**
@@ -133,7 +134,8 @@ struct plist_node {
 #define PLIST_NODE_INIT(node, __prio)			\
 {							\
 	.prio  = (__prio),				\
-	.plist = { _PLIST_HEAD_INIT((node).plist) },	\
+	.prio_list = LIST_HEAD_INIT((node).prio_list),	\
+	.node_list = LIST_HEAD_INIT((node).node_list),	\
 }
 
 /**
@@ -144,7 +146,6 @@ struct plist_node {
 static inline void
 plist_head_init(struct plist_head *head, spinlock_t *lock)
 {
-	INIT_LIST_HEAD(&head->prio_list);
 	INIT_LIST_HEAD(&head->node_list);
 #ifdef CONFIG_DEBUG_PI_LIST
 	head->spinlock = lock;
@@ -160,7 +161,6 @@ plist_head_init(struct plist_head *head, spinlock_t *lock)
 static inline void
 plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock)
 {
-	INIT_LIST_HEAD(&head->prio_list);
 	INIT_LIST_HEAD(&head->node_list);
 #ifdef CONFIG_DEBUG_PI_LIST
 	head->rawlock = lock;
@@ -176,7 +176,8 @@ plist_head_init_raw(struct plist_head *head, raw_spinlock_t *lock)
 static inline void plist_node_init(struct plist_node *node, int prio)
 {
 	node->prio = prio;
-	plist_head_init(&node->plist, NULL);
+	INIT_LIST_HEAD(&node->prio_list);
+	INIT_LIST_HEAD(&node->node_list);
 }
 
 extern void plist_add(struct plist_node *node, struct plist_head *head);
@@ -188,7 +189,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
  * @head:	the head for your list
  */
 #define plist_for_each(pos, head)	\
-	 list_for_each_entry(pos, &(head)->node_list, plist.node_list)
+	 list_for_each_entry(pos, &(head)->node_list, node_list)
 
 /**
  * plist_for_each_safe - iterate safely over a plist of given type
@@ -199,7 +200,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
  * Iterate over a plist of given type, safe against removal of list entry.
  */
 #define plist_for_each_safe(pos, n, head)	\
-	 list_for_each_entry_safe(pos, n, &(head)->node_list, plist.node_list)
+	 list_for_each_entry_safe(pos, n, &(head)->node_list, node_list)
 
 /**
  * plist_for_each_entry	- iterate over list of given type
@@ -208,7 +209,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
  * @mem:	the name of the list_struct within the struct
  */
 #define plist_for_each_entry(pos, head, mem)	\
-	 list_for_each_entry(pos, &(head)->node_list, mem.plist.node_list)
+	 list_for_each_entry(pos, &(head)->node_list, mem.node_list)
 
 /**
  * plist_for_each_entry_safe - iterate safely over list of given type
@@ -220,7 +221,7 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
  * Iterate over list of given type, safe against removal of list entry.
  */
 #define plist_for_each_entry_safe(pos, n, head, m)	\
-	list_for_each_entry_safe(pos, n, &(head)->node_list, m.plist.node_list)
+	list_for_each_entry_safe(pos, n, &(head)->node_list, m.node_list)
 
 /**
  * plist_head_empty - return !0 if a plist_head is empty
@@ -237,7 +238,7 @@ static inline int plist_head_empty(const struct plist_head *head)
  */
 static inline int plist_node_empty(const struct plist_node *node)
 {
-	return plist_head_empty(&node->plist);
+	return list_empty(&node->node_list);
 }
 
 /* All functions below assume the plist_head is not empty. */
@@ -285,7 +286,7 @@ static inline int plist_node_empty(const struct plist_node *node)
 static inline struct plist_node *plist_first(const struct plist_head *head)
 {
 	return list_entry(head->node_list.next,
-			  struct plist_node, plist.node_list);
+			  struct plist_node, node_list);
 }
 
 /**
@@ -297,7 +298,7 @@ static inline struct plist_node *plist_first(const struct plist_head *head)
 static inline struct plist_node *plist_last(const struct plist_head *head)
 {
 	return list_entry(head->node_list.prev,
-			  struct plist_node, plist.node_list);
+			  struct plist_node, node_list);
 }
 
 #endif
diff --git a/lib/plist.c b/lib/plist.c
index 1471988..8c614d0 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -59,7 +59,8 @@ static void plist_check_head(struct plist_head *head)
 		WARN_ON_SMP(!raw_spin_is_locked(head->rawlock));
 	if (head->spinlock)
 		WARN_ON_SMP(!spin_is_locked(head->spinlock));
-	plist_check_list(&head->prio_list);
+	if (!plist_head_empty(head))
+		plist_check_list(&plist_first(head)->prio_list);
 	plist_check_list(&head->node_list);
 }
 
@@ -75,25 +76,33 @@ static void plist_check_head(struct plist_head *head)
  */
 void plist_add(struct plist_node *node, struct plist_head *head)
 {
-	struct plist_node *iter;
+	struct plist_node *first, *iter, *prev = NULL;
+	struct list_head *node_next = &head->node_list;
 
 	plist_check_head(head);
 	WARN_ON(!plist_node_empty(node));
+	WARN_ON(!list_empty(&node->prio_list));
 
-	list_for_each_entry(iter, &head->prio_list, plist.prio_list) {
-		if (node->prio < iter->prio)
-			goto lt_prio;
-		else if (node->prio == iter->prio) {
-			iter = list_entry(iter->plist.prio_list.next,
-					struct plist_node, plist.prio_list);
-			goto eq_prio;
+	if (plist_head_empty(head))
+		goto ins_node;
+
+	first = iter = plist_first(head);
+
+	do {
+		if (node->prio < iter->prio) {
+			node_next = &iter->node_list;
+			break;
 		}
-	}
 
-lt_prio:
-	list_add_tail(&node->plist.prio_list, &iter->plist.prio_list);
-eq_prio:
-	list_add_tail(&node->plist.node_list, &iter->plist.node_list);
+		prev = iter;
+		iter = list_entry(iter->prio_list.next,
+				struct plist_node, prio_list);
+	} while (iter != first);
+
+	if (!prev || prev->prio != node->prio)
+		list_add_tail(&node->prio_list, &iter->prio_list);
+ins_node:
+	list_add_tail(&node->node_list, node_next);
 
 	plist_check_head(head);
 }
@@ -108,14 +117,21 @@ void plist_del(struct plist_node *node, struct plist_head *head)
 {
 	plist_check_head(head);
 
-	if (!list_empty(&node->plist.prio_list)) {
-		struct plist_node *next = plist_first(&node->plist);
+	if (!list_empty(&node->prio_list)) {
+		if (node->node_list.next != &head->node_list) {
+			struct plist_node *next;
+
+			next = list_entry(node->node_list.next,
+					struct plist_node, node_list);
 
-		list_move_tail(&next->plist.prio_list, &node->plist.prio_list);
-		list_del_init(&node->plist.prio_list);
+			/* add the next plist_node into prio_list */
+			if (list_empty(&next->prio_list))
+				list_add(&next->prio_list, &node->prio_list);
+		}
+		list_del_init(&node->prio_list);
 	}
 
-	list_del_init(&node->plist.node_list);
+	list_del_init(&node->node_list);
 
 	plist_check_head(head);
 }
-- 
cgit v0.10.2


From 6d55da53db3d9b911f69f2ce1e5fb8943eafe057 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 21 Dec 2010 17:55:18 +0800
Subject: plist: Add priority list test

Add test code for checking plist when the kernel is booting.

Signed-off-by:  Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4D107986.1010302@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>

diff --git a/lib/plist.c b/lib/plist.c
index 8c614d0..0ae7e64 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -28,6 +28,8 @@
 
 #ifdef CONFIG_DEBUG_PI_LIST
 
+static struct plist_head test_head;
+
 static void plist_check_prev_next(struct list_head *t, struct list_head *p,
 				  struct list_head *n)
 {
@@ -54,7 +56,7 @@ static void plist_check_list(struct list_head *top)
 
 static void plist_check_head(struct plist_head *head)
 {
-	WARN_ON(!head->rawlock && !head->spinlock);
+	WARN_ON(head != &test_head && !head->rawlock && !head->spinlock);
 	if (head->rawlock)
 		WARN_ON_SMP(!raw_spin_is_locked(head->rawlock));
 	if (head->spinlock)
@@ -135,3 +137,80 @@ void plist_del(struct plist_node *node, struct plist_head *head)
 
 	plist_check_head(head);
 }
+
+#ifdef CONFIG_DEBUG_PI_LIST
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+static struct plist_node __initdata test_node[241];
+
+static void __init plist_test_check(int nr_expect)
+{
+	struct plist_node *first, *prio_pos, *node_pos;
+
+	if (plist_head_empty(&test_head)) {
+		BUG_ON(nr_expect != 0);
+		return;
+	}
+
+	prio_pos = first = plist_first(&test_head);
+	plist_for_each(node_pos, &test_head) {
+		if (nr_expect-- < 0)
+			break;
+		if (node_pos == first)
+			continue;
+		if (node_pos->prio == prio_pos->prio) {
+			BUG_ON(!list_empty(&node_pos->prio_list));
+			continue;
+		}
+
+		BUG_ON(prio_pos->prio > node_pos->prio);
+		BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list);
+		prio_pos = node_pos;
+	}
+
+	BUG_ON(nr_expect != 0);
+	BUG_ON(prio_pos->prio_list.next != &first->prio_list);
+}
+
+static int  __init plist_test(void)
+{
+	int nr_expect = 0, i, loop;
+	unsigned int r = local_clock();
+
+	printk(KERN_INFO "start plist test\n");
+	plist_head_init(&test_head, NULL);
+	for (i = 0; i < ARRAY_SIZE(test_node); i++)
+		plist_node_init(test_node + i, 0);
+
+	for (loop = 0; loop < 1000; loop++) {
+		r = r * 193939 % 47629;
+		i = r % ARRAY_SIZE(test_node);
+		if (plist_node_empty(test_node + i)) {
+			r = r * 193939 % 47629;
+			test_node[i].prio = r % 99;
+			plist_add(test_node + i, &test_head);
+			nr_expect++;
+		} else {
+			plist_del(test_node + i, &test_head);
+			nr_expect--;
+		}
+		plist_test_check(nr_expect);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(test_node); i++) {
+		if (plist_node_empty(test_node + i))
+			continue;
+		plist_del(test_node + i, &test_head);
+		nr_expect--;
+		plist_test_check(nr_expect);
+	}
+
+	printk(KERN_INFO "end plist test\n");
+	return 0;
+}
+
+module_init(plist_test);
+
+#endif
-- 
cgit v0.10.2


From 371c394af27ab7d1e58a66bc19d9f1f3ac1f67b4 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@fastmail.fm>
Date: Fri, 11 Mar 2011 21:59:38 +0100
Subject: x86, binutils, xen: Fix another wrong size directive

The latest binutils (2.21.0.20110302/Ubuntu) breaks the build
yet another time, under CONFIG_XEN=y due to a .size directive that
refers to a slightly differently named (hence, to the now very
strict and unforgiving assembler, non-existent) symbol.

[ mingo:

   This unnecessary build breakage caused by new binutils
   version 2.21 gets escallated back several kernel releases spanning
   several years of Linux history, affecting over 130,000 upstream
   kernel commits (!), on CONFIG_XEN=y 64-bit kernels (i.e. essentially
   affecting all major Linux distro kernel configs).

   Git annotate tells us that this slight debug symbol code mismatch
   bug has been introduced in 2008 in commit 3d75e1b8:

     3d75e1b8        (Jeremy Fitzhardinge    2008-07-08 15:06:49 -0700 1231) ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)

   The 'bug' is just a slight assymetry in ENTRY()/END()
   debug-symbols sequences, with lots of assembly code between the
   ENTRY() and the END():

     ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
       ...
     END(do_hypervisor_callback)

   Human reviewers almost never catch such small mismatches, and binutils
   never even warned about it either.

   This new binutils version thus breaks the Xen build on all upstream kernels
   since v2.6.27, out of the blue.

   This makes a straightforward Git bisection of all 64-bit Xen-enabled kernels
   impossible on such binutils, for a bisection window of over hundred
   thousand historic commits. (!)

   This is a major fail on the side of binutils and binutils needs to turn
   this show-stopper build failure into a warning ASAP. ]

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: H.J. Lu <hjl.tools@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kees Cook <kees.cook@canonical.com>
LKML-Reference: <1299877178-26063-1-git-send-email-heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffb..bbd5c80 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1248,7 +1248,7 @@ ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
 	decl PER_CPU_VAR(irq_count)
 	jmp  error_exit
 	CFI_ENDPROC
-END(do_hypervisor_callback)
+END(xen_do_hypervisor_callback)
 
 /*
  * Hypervisor uses this for application faults while it executes.
-- 
cgit v0.10.2


From d209a699a0b975ad47f399d70ddc3791f1b84496 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 11 Mar 2011 21:22:14 +0100
Subject: genirq: Add chip flag to force mask on suspend

On suspend we disable all interrupts in the core code, but this does
not mask the interrupt line in the default implementation as we use a
lazy disable approach. That means we mark the interrupt disabled, but
leave the hardware unmasked. That's an optimization because we avoid
the hardware access for the common case where no interrupt happens
after we marked it disabled. If an interrupt happens, then the
interrupt flow handler masks the line at the hardware level and marks
it pending.

Suspend makes use of this delayed disable as it "disables" all
interrupts when preparing the suspend transition. Right before the
system goes into hardware suspend state it checks whether one of the
interrupts which is marked as a wakeup interrupt came in after
disabling it.

Most interrupt chips have a separate register which selects the
interrupts which can wake up the system from suspend, so we don't have
to mask any on the non wakeup interrupts.

But now we have to deal with brilliant designed hardware which lacks
such a wakeup configuration facility. For such hardware it's necessary
to mask all non wakeup interrupts before going into suspend in order
to avoid the wakeup from random interrupts.

Rather than working around this in the affected interrupt chip
implementations we can solve this elegant in the core code itself.

Add a flag IRQCHIP_MASK_ON_SUSPEND which can be set by the irq chip
implementation to indicate, that the interrupts which are not selected
as wakeup sources must be masked in the suspend path. Mask them in the
loop which checks the wakeup interrupts pending flag.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Abhijeet Dharmapurikar <adharmap@codeaurora.org>
LKML-Reference: <alpine.LFD.2.00.1103112112310.2787@localhost6.localdomain6>

diff --git a/include/linux/irq.h b/include/linux/irq.h
index ff62d01..1d3577f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -330,10 +330,12 @@ struct irq_chip {
  *
  * IRQCHIP_SET_TYPE_MASKED:	Mask before calling chip.irq_set_type()
  * IRQCHIP_EOI_IF_HANDLED:	Only issue irq_eoi() when irq was handled
+ * IRQCHIP_MASK_ON_SUSPEND:	Mask non wake irqs in the suspend path
  */
 enum {
 	IRQCHIP_SET_TYPE_MASKED		= (1 <<  0),
 	IRQCHIP_EOI_IF_HANDLED		= (1 <<  1),
+	IRQCHIP_MASK_ON_SUSPEND		= (1 <<  2),
 };
 
 /* This include will go away once we isolated irq_desc usage to core code */
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 1329f0e..f76fc00 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -68,10 +68,24 @@ int check_wakeup_irqs(void)
 	struct irq_desc *desc;
 	int irq;
 
-	for_each_irq_desc(irq, desc)
-		if (irqd_is_wakeup_set(&desc->irq_data) &&
-		    (desc->istate & IRQS_PENDING))
-			return -EBUSY;
+	for_each_irq_desc(irq, desc) {
+		if (irqd_is_wakeup_set(&desc->irq_data)) {
+			if (desc->istate & IRQS_PENDING)
+				return -EBUSY;
+			continue;
+		}
+		/*
+		 * Check the non wakeup interrupts whether they need
+		 * to be masked before finally going into suspend
+		 * state. That's for hardware which has no wakeup
+		 * source configuration facility. The chip
+		 * implementation indicates that with
+		 * IRQCHIP_MASK_ON_SUSPEND.
+		 */
+		if (desc->istate & IRQS_SUSPENDED &&
+		    irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+			mask_irq(desc);
+	}
 
 	return 0;
 }
-- 
cgit v0.10.2


From 56396e6823fe9b42fe9cf9403d6ed67756255f70 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 11 Mar 2011 10:33:31 +0100
Subject: x86-64, NUMA: Don't call numa_set_distanc() for all possible node
 combinations during emulation

The distance transforming in numa_emulation() used to call
numa_set_distance() for all MAX_NUMNODES * MAX_NUMNODES node
combinations regardless of which are enabled.  As numa_set_distance()
ignores all out-of-bound distance settings, this doesn't cause any
problem other than looping unnecessarily many times during boot.

However, as MAX_NUMNODES * MAX_NUMNODES can be pretty high, update the
code such that it iterates through only the enabled combinations.

Yinghai Lu identified the issue and provided an initial patch to
address the issue; however, the patch was incorrect in that it didn't
build emulated distance table when there's no physical distance table
and unnecessarily complex.

  http://thread.gmane.org/gmane.linux.kernel/1107986/focus=1107988

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>

diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 3696be0..ad091e4 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -301,7 +301,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	const u64 max_addr = max_pfn << PAGE_SHIFT;
 	u8 *phys_dist = NULL;
 	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
-	int dfl_phys_nid;
+	int max_emu_nid, dfl_phys_nid;
 	int i, j, ret;
 
 	if (!emu_cmdline)
@@ -358,12 +358,17 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 					node_distance(i, j);
 	}
 
-	/* determine the default phys nid to use for unmapped nodes */
+	/*
+	 * Determine the max emulated nid and the default phys nid to use
+	 * for unmapped nodes.
+	 */
+	max_emu_nid = 0;
 	dfl_phys_nid = NUMA_NO_NODE;
 	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
 		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
-			dfl_phys_nid = emu_nid_to_phys[i];
-			break;
+			max_emu_nid = i;
+			if (dfl_phys_nid == NUMA_NO_NODE)
+				dfl_phys_nid = emu_nid_to_phys[i];
 		}
 	}
 	if (dfl_phys_nid == NUMA_NO_NODE) {
@@ -393,14 +398,10 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
 			emu_nid_to_phys[i] = dfl_phys_nid;
 
-	/*
-	 * Transform distance table.  numa_set_distance() ignores all
-	 * out-of-bound distances.  Just call it for every possible node
-	 * combination.
-	 */
+	/* transform distance table */
 	numa_reset_distance();
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		for (j = 0; j < MAX_NUMNODES; j++) {
+	for (i = 0; i < max_emu_nid + 1; i++) {
+		for (j = 0; j < max_emu_nid + 1; j++) {
 			int physi = emu_nid_to_phys[i];
 			int physj = emu_nid_to_phys[j];
 			int dist;
-- 
cgit v0.10.2


From 2c778651f73d92edb847e65d371bb29b17c7ca60 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 12 Mar 2011 12:20:43 +0100
Subject: x86: Cleanup the genirq name space

genirq is switching to a consistent name space for the irq related
functions. Convert x86. Conversion was done with coccinelle.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 8d23e83..7a88b04 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -187,7 +187,7 @@ int __init arch_early_irq_init(void)
 	irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
 
 	for (i = 0; i < count; i++) {
-		set_irq_chip_data(i, &cfg[i]);
+		irq_set_chip_data(i, &cfg[i]);
 		zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
 		zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
 		/*
@@ -206,7 +206,7 @@ int __init arch_early_irq_init(void)
 #ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	return get_irq_chip_data(irq);
+	return irq_get_chip_data(irq);
 }
 
 static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
@@ -232,7 +232,7 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
 {
 	if (!cfg)
 		return;
-	set_irq_chip_data(at, NULL);
+	irq_set_chip_data(at, NULL);
 	free_cpumask_var(cfg->domain);
 	free_cpumask_var(cfg->old_domain);
 	kfree(cfg);
@@ -262,14 +262,14 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
 	if (res < 0) {
 		if (res != -EEXIST)
 			return NULL;
-		cfg = get_irq_chip_data(at);
+		cfg = irq_get_chip_data(at);
 		if (cfg)
 			return cfg;
 	}
 
 	cfg = alloc_irq_cfg(at, node);
 	if (cfg)
-		set_irq_chip_data(at, cfg);
+		irq_set_chip_data(at, cfg);
 	else
 		irq_free_desc(at);
 	return cfg;
@@ -1185,7 +1185,7 @@ void __setup_vector_irq(int cpu)
 	raw_spin_lock(&vector_lock);
 	/* Mark the inuse vectors */
 	for_each_active_irq(irq) {
-		cfg = get_irq_chip_data(irq);
+		cfg = irq_get_chip_data(irq);
 		if (!cfg)
 			continue;
 		/*
@@ -1249,25 +1249,24 @@ static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
 	else
 		irq_clear_status_flags(irq, IRQ_LEVEL);
 
-	if (irq_remapped(get_irq_chip_data(irq))) {
+	if (irq_remapped(irq_get_chip_data(irq))) {
 		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
 		if (trigger)
-			set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
+			irq_set_chip_and_handler_name(irq, &ir_ioapic_chip,
 						      handle_fasteoi_irq,
-						     "fasteoi");
+						      "fasteoi");
 		else
-			set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
+			irq_set_chip_and_handler_name(irq, &ir_ioapic_chip,
 						      handle_edge_irq, "edge");
 		return;
 	}
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
-		set_irq_chip_and_handler_name(irq, &ioapic_chip,
-					      handle_fasteoi_irq,
-					      "fasteoi");
+		irq_set_chip_and_handler_name(irq, &ioapic_chip,
+					      handle_fasteoi_irq, "fasteoi");
 	else
-		set_irq_chip_and_handler_name(irq, &ioapic_chip,
+		irq_set_chip_and_handler_name(irq, &ioapic_chip,
 					      handle_edge_irq, "edge");
 }
 
@@ -1491,7 +1490,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
 	 * The timer IRQ doesn't have to know that behind the
 	 * scene we may have a 8259A-master in AEOI mode ...
 	 */
-	set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
+	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
+				      "edge");
 
 	/*
 	 * Add it to the IO-APIC irq-routing table:
@@ -1598,7 +1598,7 @@ __apicdebuginit(void) print_IO_APIC(void)
 	for_each_active_irq(irq) {
 		struct irq_pin_list *entry;
 
-		cfg = get_irq_chip_data(irq);
+		cfg = irq_get_chip_data(irq);
 		if (!cfg)
 			continue;
 		entry = cfg->irq_2_pin;
@@ -2364,7 +2364,7 @@ static void irq_complete_move(struct irq_cfg *cfg)
 
 void irq_force_complete_move(int irq)
 {
-	struct irq_cfg *cfg = get_irq_chip_data(irq);
+	struct irq_cfg *cfg = irq_get_chip_data(irq);
 
 	if (!cfg)
 		return;
@@ -2587,7 +2587,7 @@ static inline void init_IO_APIC_traps(void)
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
 	for_each_active_irq(irq) {
-		cfg = get_irq_chip_data(irq);
+		cfg = irq_get_chip_data(irq);
 		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
@@ -2598,7 +2598,7 @@ static inline void init_IO_APIC_traps(void)
 				legacy_pic->make_irq(irq);
 			else
 				/* Strange. Oh, well.. */
-				set_irq_chip(irq, &no_irq_chip);
+				irq_set_chip(irq, &no_irq_chip);
 		}
 	}
 }
@@ -2638,7 +2638,7 @@ static struct irq_chip lapic_chip __read_mostly = {
 static void lapic_register_intr(int irq)
 {
 	irq_clear_status_flags(irq, IRQ_LEVEL);
-	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+	irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
 				      "edge");
 }
 
@@ -2722,7 +2722,7 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = get_irq_chip_data(0);
+	struct irq_cfg *cfg = irq_get_chip_data(0);
 	int node = cpu_to_node(0);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
@@ -3033,7 +3033,7 @@ unsigned int create_irq_nr(unsigned int from, int node)
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 
 	if (ret) {
-		set_irq_chip_data(irq, cfg);
+		irq_set_chip_data(irq, cfg);
 		irq_clear_status_flags(irq, IRQ_NOREQUEST);
 	} else {
 		free_irq_at(irq, cfg);
@@ -3058,7 +3058,7 @@ int create_irq(void)
 
 void destroy_irq(unsigned int irq)
 {
-	struct irq_cfg *cfg = get_irq_chip_data(irq);
+	struct irq_cfg *cfg = irq_get_chip_data(irq);
 	unsigned long flags;
 
 	irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
@@ -3092,7 +3092,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
 
-	if (irq_remapped(get_irq_chip_data(irq))) {
+	if (irq_remapped(irq_get_chip_data(irq))) {
 		struct irte irte;
 		int ir_index;
 		u16 sub_handle;
@@ -3271,14 +3271,16 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 	if (ret < 0)
 		return ret;
 
-	set_irq_msi(irq, msidesc);
+	irq_set_msi_desc(irq, msidesc);
 	write_msi_msg(irq, &msg);
 
-	if (irq_remapped(get_irq_chip_data(irq))) {
+	if (irq_remapped(irq_get_chip_data(irq))) {
 		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-		set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
+		irq_set_chip_and_handler_name(irq, &msi_ir_chip,
+					      handle_edge_irq, "edge");
 	} else
-		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+		irq_set_chip_and_handler_name(irq, &msi_chip,
+					      handle_edge_irq, "edge");
 
 	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
 
@@ -3396,8 +3398,8 @@ int arch_setup_dmar_msi(unsigned int irq)
 	if (ret < 0)
 		return ret;
 	dmar_msi_write(irq, &msg);
-	set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-		"edge");
+	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+				      "edge");
 	return 0;
 }
 #endif
@@ -3474,13 +3476,13 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 	if (ret < 0)
 		return ret;
 
-	hpet_msi_write(get_irq_data(irq), &msg);
+	hpet_msi_write(irq_get_handler_data(irq), &msg);
 	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-	if (irq_remapped(get_irq_chip_data(irq)))
-		set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
+	if (irq_remapped(irq_get_chip_data(irq)))
+		irq_set_chip_and_handler_name(irq, &ir_hpet_msi_type,
 					      handle_edge_irq, "edge");
 	else
-		set_irq_chip_and_handler_name(irq, &hpet_msi_type,
+		irq_set_chip_and_handler_name(irq, &hpet_msi_type,
 					      handle_edge_irq, "edge");
 
 	return 0;
@@ -3569,7 +3571,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 
 		write_ht_irq_msg(irq, &msg);
 
-		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+		irq_set_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
 
 		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
@@ -4054,5 +4056,6 @@ void __init pre_init_apic_IRQ0(void)
 	setup_local_APIC();
 
 	io_apic_setup_irq_pin(0, 0, &attr);
-	set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
+	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
+				      "edge");
 }
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4ff5968..bfe8f72 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -503,7 +503,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
 	if (!irq)
 		return -EINVAL;
 
-	set_irq_data(irq, dev);
+	irq_set_handler_data(irq, dev);
 
 	if (hpet_setup_msi_irq(irq))
 		return -EINVAL;
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 20757cb..d9ca749 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -112,7 +112,7 @@ static void make_8259A_irq(unsigned int irq)
 {
 	disable_irq_nosync(irq);
 	io_apic_irqs &= ~(1<<irq);
-	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
+	irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
 				      i8259A_chip.name);
 	enable_irq(irq);
 }
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c752e97..3dd751c 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -110,7 +110,7 @@ void __init init_ISA_irqs(void)
 	legacy_pic->init(0);
 
 	for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
-		set_irq_chip_and_handler_name(i, chip, handle_level_irq, name);
+		irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
 }
 
 void __init init_IRQ(void)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index eba687f0..b9ec1c7 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -847,7 +847,7 @@ static void __init lguest_init_IRQ(void)
 void lguest_setup_irq(unsigned int irq)
 {
 	irq_alloc_desc_at(irq, 0);
-	set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
+	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
 				      handle_level_irq, "level");
 }
 
@@ -995,7 +995,7 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
 static void lguest_time_init(void)
 {
 	/* Set up the timer interrupt (0) to go to our simple timer routine */
-	set_irq_handler(0, lguest_time_irq);
+	irq_set_handler(0, lguest_time_irq);
 
 	clocksource_register(&lguest_clock);
 
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 25cd4a0..2bdcc36 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -104,7 +104,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 					"msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ);
 			if (irq < 0)
 				goto error;
-			ret = set_irq_msi(irq, msidesc);
+			ret = irq_set_msi_desc(irq, msidesc);
 			if (ret < 0)
 				goto error_while;
 			printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d"
@@ -117,7 +117,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			goto error;
 		printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
 		xen_msi_compose_msg(dev, pirq, &msg);
-		ret = set_irq_msi(irq, msidesc);
+		ret = irq_set_msi_desc(irq, msidesc);
 		if (ret < 0)
 			goto error_while;
 		write_msi_msg(irq, &msg);
@@ -165,7 +165,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			goto free;
 		}
 
-		ret = set_irq_msi(irq, msidesc);
+		ret = irq_set_msi_desc(irq, msidesc);
 		if (ret)
 			goto error_while;
 		i++;
@@ -210,7 +210,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		if (irq < 0)
 			return -1;
 
-		ret = set_irq_msi(irq, msidesc);
+		ret = irq_set_msi_desc(irq, msidesc);
 		if (ret)
 			goto error;
 	}
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 7b24460..374a05d 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -131,7 +131,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 		       unsigned long mmr_offset, int limit)
 {
 	const struct cpumask *eligible_cpu = cpumask_of(cpu);
-	struct irq_cfg *cfg = get_irq_chip_data(irq);
+	struct irq_cfg *cfg = irq_get_chip_data(irq);
 	unsigned long mmr_value;
 	struct uv_IO_APIC_route_entry *entry;
 	int mmr_pnode, err;
@@ -148,7 +148,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	else
 		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
 
-	set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
+	irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
 				      irq_name);
 
 	mmr_value = 0;
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 6320376..dcc7aea 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -606,7 +606,7 @@ static void __init visws_pre_intr_init(void)
 			chip = &cobalt_irq_type;
 
 		if (chip)
-			set_irq_chip(i, chip);
+			irq_set_chip(i, chip);
 	}
 
 	setup_irq(CO_IRQ_8259, &master_action);
-- 
cgit v0.10.2


From c60eaf25cd211d2282a6edddb3ce26b1e5795097 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 11 Mar 2011 13:17:16 +0100
Subject: x86: ioapic: Simplify irq chip and handler setup

Use pointers instead of ugly multiline if/else constructs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7a88b04..224edce 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1242,32 +1242,28 @@ static inline int IO_APIC_irq_trigger(int irq)
 
 static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
 {
+	struct irq_chip *chip = &ioapic_chip;
+	irq_flow_handler_t hdl;
+	bool fasteoi;
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-	    trigger == IOAPIC_LEVEL)
+	    trigger == IOAPIC_LEVEL) {
 		irq_set_status_flags(irq, IRQ_LEVEL);
-	else
+		fasteoi = true;
+	} else {
 		irq_clear_status_flags(irq, IRQ_LEVEL);
+		fasteoi = false;
+	}
 
 	if (irq_remapped(irq_get_chip_data(irq))) {
 		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-		if (trigger)
-			irq_set_chip_and_handler_name(irq, &ir_ioapic_chip,
-						      handle_fasteoi_irq,
-						      "fasteoi");
-		else
-			irq_set_chip_and_handler_name(irq, &ir_ioapic_chip,
-						      handle_edge_irq, "edge");
-		return;
+		chip = &ir_ioapic_chip;
+		fasteoi = trigger != 0;
 	}
 
-	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-	    trigger == IOAPIC_LEVEL)
-		irq_set_chip_and_handler_name(irq, &ioapic_chip,
-					      handle_fasteoi_irq, "fasteoi");
-	else
-		irq_set_chip_and_handler_name(irq, &ioapic_chip,
-					      handle_edge_irq, "edge");
+	hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+	irq_set_chip_and_handler_name(irq, chip, hdl,
+				      fasteoi ? "fasteoi" : "edge");
 }
 
 static int setup_ioapic_entry(int apic_id, int irq,
@@ -3264,6 +3260,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 
 static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
+	struct irq_chip *chip = &msi_chip;
 	struct msi_msg msg;
 	int ret;
 
@@ -3276,11 +3273,10 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 
 	if (irq_remapped(irq_get_chip_data(irq))) {
 		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-		irq_set_chip_and_handler_name(irq, &msi_ir_chip,
-					      handle_edge_irq, "edge");
-	} else
-		irq_set_chip_and_handler_name(irq, &msi_chip,
-					      handle_edge_irq, "edge");
+		chip = &msi_ir_chip;
+	}
+
+	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
 
 	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
 
@@ -3457,6 +3453,7 @@ static struct irq_chip hpet_msi_type = {
 
 int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 {
+	struct irq_chip *chip = &hpet_msi_type;
 	struct msi_msg msg;
 	int ret;
 
@@ -3479,12 +3476,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 	hpet_msi_write(irq_get_handler_data(irq), &msg);
 	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
 	if (irq_remapped(irq_get_chip_data(irq)))
-		irq_set_chip_and_handler_name(irq, &ir_hpet_msi_type,
-					      handle_edge_irq, "edge");
-	else
-		irq_set_chip_and_handler_name(irq, &hpet_msi_type,
-					      handle_edge_irq, "edge");
+		chip = &ir_hpet_msi_type;
 
+	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
 	return 0;
 }
 #endif
-- 
cgit v0.10.2


From 5451ddc5621550a2f4f82ddeac938b3ca392525f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 5 Feb 2011 15:35:51 +0100
Subject: x86: ioapic: Use irq_data->state

Use the state information in irq_data. That avoids a radix-tree lookup
from apic_ack_level() and simplifies setup_ioapic_dest().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 224edce..e481e00 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2431,7 +2431,7 @@ static void ack_apic_level(struct irq_data *data)
 	irq_complete_move(cfg);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+	if (unlikely(irqd_is_setaffinity_pending(data))) {
 		do_unmask_irq = 1;
 		mask_ioapic(cfg);
 	}
@@ -3822,8 +3822,8 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
-	struct irq_desc *desc;
 	const struct cpumask *mask;
+	struct irq_data *idata;
 
 	if (skip_ioapic_setup == 1)
 		return;
@@ -3838,21 +3838,20 @@ void __init setup_ioapic_dest(void)
 		if ((ioapic > 0) && (irq > 16))
 			continue;
 
-		desc = irq_to_desc(irq);
+		idata = irq_get_irq_data(irq);
 
 		/*
 		 * Honour affinities which have been set in early boot
 		 */
-		if (desc->status &
-		    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-			mask = desc->irq_data.affinity;
+		if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
+			mask = idata->affinity;
 		else
 			mask = apic->target_cpus();
 
 		if (intr_remapping_enabled)
-			ir_ioapic_set_affinity(&desc->irq_data, mask, false);
+			ir_ioapic_set_affinity(idata, mask, false);
 		else
-			ioapic_set_affinity(&desc->irq_data, mask, false);
+			ioapic_set_affinity(idata, mask, false);
 	}
 
 }
-- 
cgit v0.10.2


From 51c43ac6e4540786a6d79ea318b30f7bfa615ec7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Feb 2011 21:40:36 +0100
Subject: x86: Use the proper accessors in fixup_irqs()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 78793ef..00bf99d 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -293,6 +293,7 @@ void fixup_irqs(void)
 	static int warned;
 	struct irq_desc *desc;
 	struct irq_data *data;
+	struct irq_chip *chip;
 
 	for_each_irq_desc(irq, desc) {
 		int break_affinity = 0;
@@ -307,7 +308,7 @@ void fixup_irqs(void)
 		/* interrupt's are disabled at this point */
 		raw_spin_lock(&desc->lock);
 
-		data = &desc->irq_data;
+		data = irq_desc_get_irq_data(desc);
 		affinity = data->affinity;
 		if (!irq_has_action(irq) ||
 		    cpumask_subset(affinity, cpu_online_mask)) {
@@ -327,16 +328,17 @@ void fixup_irqs(void)
 			affinity = cpu_all_mask;
 		}
 
-		if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask)
-			data->chip->irq_mask(data);
+		chip = irq_data_get_irq_chip(data);
+		if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
+			chip->irq_mask(data);
 
-		if (data->chip->irq_set_affinity)
-			data->chip->irq_set_affinity(data, affinity, true);
+		if (chip->irq_set_affinity)
+			chip->irq_set_affinity(data, affinity, true);
 		else if (!(warned++))
 			set_affinity = 0;
 
-		if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask)
-			data->chip->irq_unmask(data);
+		if (!irqd_can_move_in_process_context(data) && chip->irq_unmask)
+			chip->irq_unmask(data);
 
 		raw_spin_unlock(&desc->lock);
 
@@ -368,10 +370,11 @@ void fixup_irqs(void)
 			irq = __this_cpu_read(vector_irq[vector]);
 
 			desc = irq_to_desc(irq);
-			data = &desc->irq_data;
+			data = irq_desc_get_irq_data(desc);
+			chip = irq_data_get_irq_chip(data);
 			raw_spin_lock(&desc->lock);
-			if (data->chip->irq_retrigger)
-				data->chip->irq_retrigger(data);
+			if (chip->irq_retrigger)
+				chip->irq_retrigger(data);
 			raw_spin_unlock(&desc->lock);
 		}
 	}
-- 
cgit v0.10.2


From 08221110e88ae101acf2464154f98e6d1b1ab21c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 4 Feb 2011 18:56:11 +0100
Subject: x86: ioapic: Use new move_irq functions

Use the functions which take irq_data. We already have a pointer to
irq_data. That avoids a sparse irq lookup in move_*_irq.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e481e00..e9d4b96 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2374,7 +2374,7 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { }
 static void ack_apic_edge(struct irq_data *data)
 {
 	irq_complete_move(data->chip_data);
-	move_native_irq(data->irq);
+	irq_move_irq(data);
 	ack_APIC_irq();
 }
 
@@ -2520,7 +2520,7 @@ static void ack_apic_level(struct irq_data *data)
 		 * and you can go talk to the chipset vendor about it.
 		 */
 		if (!io_apic_level_ack_pending(cfg))
-			move_masked_irq(irq);
+			irq_move_masked_irq(data);
 		unmask_ioapic(cfg);
 	}
 }
-- 
cgit v0.10.2


From 1a0e62a49ad417712cfa79a395f6c39f67aadb44 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 12 Mar 2011 13:47:18 +0100
Subject: x86: ioapic: Avoid redundant lookup of irq_cfg

The caller of ioapic_register_intr() has a pointer to the irq_cfg for
the irq already. Hand it in to avoid a full lookup.

In msi_compose_msg() the pointer to irq_cfg is already available. No
need to look it up again.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e9d4b96..4b5ebd2 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1240,7 +1240,8 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 #endif
 
-static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
+static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
+				 unsigned long trigger)
 {
 	struct irq_chip *chip = &ioapic_chip;
 	irq_flow_handler_t hdl;
@@ -1255,7 +1256,7 @@ static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
 		fasteoi = false;
 	}
 
-	if (irq_remapped(irq_get_chip_data(irq))) {
+	if (irq_remapped(cfg)) {
 		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
 		chip = &ir_ioapic_chip;
 		fasteoi = trigger != 0;
@@ -1361,7 +1362,7 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
 		return;
 	}
 
-	ioapic_register_intr(irq, trigger);
+	ioapic_register_intr(irq, cfg, trigger);
 	if (irq < legacy_pic->nr_legacy_irqs)
 		legacy_pic->mask(irq);
 
@@ -3088,7 +3089,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
 
-	if (irq_remapped(irq_get_chip_data(irq))) {
+	if (irq_remapped(cfg)) {
 		struct irte irte;
 		int ir_index;
 		u16 sub_handle;
-- 
cgit v0.10.2


From 517e49815677b43b26d3167aadca83919ef36a45 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Dec 2010 17:59:57 +0100
Subject: x86: Use generic show_interrupts

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 867a8cf..ad74608 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -66,6 +66,7 @@ config X86
 	select HAVE_SPARSE_IRQ
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
+	select GENERIC_IRQ_SHOW
 	select USE_GENERIC_SMP_HELPERS if SMP
 
 config INSTRUCTION_DECODER
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 00bf99d..5ee693f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -44,9 +44,9 @@ void ack_bad_irq(unsigned int irq)
 
 #define irq_stats(x)		(&per_cpu(irq_stat, x))
 /*
- * /proc/interrupts printing:
+ * /proc/interrupts printing for arch specific interrupts
  */
-static int show_other_interrupts(struct seq_file *p, int prec)
+int arch_show_interrupts(struct seq_file *p, int prec)
 {
 	int j;
 
@@ -122,59 +122,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	return 0;
 }
 
-int show_interrupts(struct seq_file *p, void *v)
-{
-	unsigned long flags, any_count = 0;
-	int i = *(loff_t *) v, j, prec;
-	struct irqaction *action;
-	struct irq_desc *desc;
-
-	if (i > nr_irqs)
-		return 0;
-
-	for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
-		j *= 10;
-
-	if (i == nr_irqs)
-		return show_other_interrupts(p, prec);
-
-	/* print header */
-	if (i == 0) {
-		seq_printf(p, "%*s", prec + 8, "");
-		for_each_online_cpu(j)
-			seq_printf(p, "CPU%-8d", j);
-		seq_putc(p, '\n');
-	}
-
-	desc = irq_to_desc(i);
-	if (!desc)
-		return 0;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	for_each_online_cpu(j)
-		any_count |= kstat_irqs_cpu(i, j);
-	action = desc->action;
-	if (!action && !any_count)
-		goto out;
-
-	seq_printf(p, "%*d: ", prec, i);
-	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-	seq_printf(p, " %8s", desc->irq_data.chip->name);
-	seq_printf(p, "-%-8s", desc->name);
-
-	if (action) {
-		seq_printf(p, "  %s", action->name);
-		while ((action = action->next) != NULL)
-			seq_printf(p, ", %s", action->name);
-	}
-
-	seq_putc(p, '\n');
-out:
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	return 0;
-}
-
 /*
  * /proc/stat helpers
  */
-- 
cgit v0.10.2


From 9bbbff25b31bbfdb512563cc5a14bcde5bf29bdf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 27 Jan 2011 18:17:01 +0100
Subject: x86: Mark low level interrupts IRQF_NO_THREAD

These cannot be threaded.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 3dd751c..1cc302d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -71,6 +71,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
 static struct irqaction fpu_irq = {
 	.handler = math_error_irq,
 	.name = "fpu",
+	.flags = IRQF_NO_THREAD,
 };
 #endif
 
@@ -80,6 +81,7 @@ static struct irqaction fpu_irq = {
 static struct irqaction irq2 = {
 	.handler = no_action,
 	.name = "cascade",
+	.flags = IRQF_NO_THREAD,
 };
 
 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index dcc7aea..fe4cf82 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -569,11 +569,13 @@ out_unlock:
 static struct irqaction master_action = {
 	.handler =	piix4_master_intr,
 	.name =		"PIIX4-8259",
+	.flags =	IRQF_NO_THREAD,
 };
 
 static struct irqaction cascade_action = {
 	.handler = 	no_action,
 	.name =		"cascade",
+	.flags =	IRQF_NO_THREAD,
 };
 
 static inline void set_piix4_virtual_irq_type(void)
-- 
cgit v0.10.2


From c0185808eb85139f45dbfd0de66963c498d0c4db Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 7 Feb 2011 02:24:08 +0100
Subject: x86: Enable forced interrupt threading support

All non threadeable interrupts are marked. Enable forced irq threading
support.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ad74608..e7ef769 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -67,6 +67,7 @@ config X86
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
 	select GENERIC_IRQ_SHOW
+	select IRQ_FORCED_THREADING
 	select USE_GENERIC_SMP_HELPERS if SMP
 
 config INSTRUCTION_DECODER
-- 
cgit v0.10.2


From 6e6823d17b157f185be09f4c70181299f9273f0b Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Thu, 3 Mar 2011 18:26:14 +0100
Subject: posix-clocks: Check write permissions in posix syscalls

pc_clock_settime() and pc_clock_adjtime() do not check whether the fd
was opened in write mode, so a clock can be set with a read only fd.

[ tglx: We deliberately do not return -EPERM as we want this to be
  	distingushable from the capability based permission check ]

Signed-off-by: Torben Hohn <torbenh@gmx.de>
LKML-Reference: <1299173174-348-4-git-send-email-torbenh@gmx.de>
Cc: Richard Cochran <richard.cochran@omicron.at>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 04498cb..25028dd 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -287,11 +287,16 @@ static int pc_clock_adjtime(clockid_t id, struct timex *tx)
 	if (err)
 		return err;
 
+	if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+		err = -EACCES;
+		goto out;
+	}
+
 	if (cd.clk->ops.clock_adjtime)
 		err = cd.clk->ops.clock_adjtime(cd.clk, tx);
 	else
 		err = -EOPNOTSUPP;
-
+out:
 	put_clock_desc(&cd);
 
 	return err;
@@ -344,11 +349,16 @@ static int pc_clock_settime(clockid_t id, const struct timespec *ts)
 	if (err)
 		return err;
 
+	if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+		err = -EACCES;
+		goto out;
+	}
+
 	if (cd.clk->ops.clock_settime)
 		err = cd.clk->ops.clock_settime(cd.clk, ts);
 	else
 		err = -EOPNOTSUPP;
-
+out:
 	put_clock_desc(&cd);
 
 	return err;
-- 
cgit v0.10.2


From 586ce098a23b6ab7383df853a84ae3d48dc889aa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 01:50:58 -0500
Subject: compat breakage in preadv() and pwritev()

Fix for a dumb preadv()/pwritev() compat bug - unlike the native
variants, compat_... ones forget to check FMODE_P{READ,WRITE}, so e.g.
on pipe the native preadv() will fail with -ESPIPE and compat one will
act as readv() and succeed.  Not critical, but it's a clear bug with trivial
fix.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a0..691c3fd 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1228,7 +1228,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_readv(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PREAD)
+		ret = compat_readv(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
@@ -1285,7 +1287,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_writev(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PWRITE)
+		ret = compat_writev(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
-- 
cgit v0.10.2


From 15a9155fe3e8215c02b80df51ec2cac7c0d726ad Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Feb 2011 15:08:54 -0500
Subject: fix race in audit_get_nd()

don't rely on pathname resolution ending up twice at the same point...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c78..20b9fe6 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
 }
 
 /* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+static struct audit_parent *audit_init_parent(struct path *path)
 {
-	struct inode *inode = ndp->path.dentry->d_inode;
+	struct inode *inode = path->dentry->d_inode;
 	struct audit_parent *parent;
 	int ret;
 
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 }
 
 /* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
+static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 {
-	struct nameidata *ndparent, *ndwatch;
+	struct nameidata nd;
+	struct dentry *d;
 	int err;
 
-	ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
-	if (unlikely(!ndparent))
-		return -ENOMEM;
+	err = path_lookup(watch->path, LOOKUP_PARENT, &nd);
+	if (err)
+		return err;
 
-	ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
-	if (unlikely(!ndwatch)) {
-		kfree(ndparent);
-		return -ENOMEM;
+	if (nd.last_type != LAST_NORM) {
+		path_put(&nd.path);
+		return -EINVAL;
 	}
 
-	err = path_lookup(path, LOOKUP_PARENT, ndparent);
-	if (err) {
-		kfree(ndparent);
-		kfree(ndwatch);
-		return err;
+	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
+	if (IS_ERR(d)) {
+		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+		path_put(&nd.path);
+		return PTR_ERR(d);
 	}
-
-	err = path_lookup(path, 0, ndwatch);
-	if (err) {
-		kfree(ndwatch);
-		ndwatch = NULL;
+	if (d->d_inode) {
+		/* update watch filter fields */
+		watch->dev = d->d_inode->i_sb->s_dev;
+		watch->ino = d->d_inode->i_ino;
 	}
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 
-	*ndp = ndparent;
-	*ndw = ndwatch;
-
+	*parent = nd.path;
+	dput(d);
 	return 0;
 }
 
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
-	if (ndp) {
-		path_put(&ndp->path);
-		kfree(ndp);
-	}
-	if (ndw) {
-		path_put(&ndw->path);
-		kfree(ndw);
-	}
-}
-
 /* Associate the given rule with an existing parent.
  * Caller must hold audit_filter_mutex. */
 static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 {
 	struct audit_watch *watch = krule->watch;
 	struct audit_parent *parent;
-	struct nameidata *ndp = NULL, *ndw = NULL;
+	struct path parent_path;
 	int h, ret = 0;
 
 	mutex_unlock(&audit_filter_mutex);
 
 	/* Avoid calling path_lookup under audit_filter_mutex. */
-	ret = audit_get_nd(watch->path, &ndp, &ndw);
-	if (ret) {
-		/* caller expects mutex locked */
-		mutex_lock(&audit_filter_mutex);
-		goto error;
-	}
+	ret = audit_get_nd(watch, &parent_path);
 
+	/* caller expects mutex locked */
 	mutex_lock(&audit_filter_mutex);
 
-	/* update watch filter fields */
-	if (ndw) {
-		watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
-		watch->ino = ndw->path.dentry->d_inode->i_ino;
-	}
+	if (ret)
+		return ret;
 
 	/* either find an old parent or attach a new one */
-	parent = audit_find_parent(ndp->path.dentry->d_inode);
+	parent = audit_find_parent(parent_path.dentry->d_inode);
 	if (!parent) {
-		parent = audit_init_parent(ndp);
+		parent = audit_init_parent(&parent_path);
 		if (IS_ERR(parent)) {
 			ret = PTR_ERR(parent);
 			goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 	h = audit_hash_ino((u32)watch->ino);
 	*list = &audit_inode_hash[h];
 error:
-	audit_put_nd(ndp, ndw);		/* NULL args OK */
+	path_put(&parent_path);
 	return ret;
-
 }
 
 void audit_remove_watch_rule(struct audit_krule *krule)
-- 
cgit v0.10.2


From c9c6cac0c2bdbda42e7b804838648d0bc60ddb13 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Feb 2011 15:15:47 -0500
Subject: kill path_lookup()

all remaining callers pass LOOKUP_PARENT to it, so
flags argument can die; renamed to kern_path_parent()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 187a7d3..a3d2ce5 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -70,7 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags,
 	if (!IS_ERR(tmp)) {
 		struct nameidata nd;
 
-		ret = path_lookup(tmp, LOOKUP_PARENT, &nd);
+		ret = kern_path_parent(tmp, &nd);
 		if (!ret) {
 			nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE;
 			ret = spufs_create(&nd, flags, mode, neighbor);
diff --git a/fs/namei.c b/fs/namei.c
index a4689eb..1d6bc81 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1747,10 +1747,9 @@ static int do_path_lookup(int dfd, const char *name,
 	return retval;
 }
 
-int path_lookup(const char *name, unsigned int flags,
-			struct nameidata *nd)
+int kern_path_parent(const char *name, struct nameidata *nd)
 {
-	return do_path_lookup(AT_FDCWD, name, flags, nd);
+	return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
 }
 
 int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -3586,7 +3585,7 @@ EXPORT_SYMBOL(page_readlink);
 EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(path_lookup);
+EXPORT_SYMBOL(kern_path_parent);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19ebc5a..29623da 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4379,7 +4379,7 @@ static int ocfs2_user_path_parent(const char __user *path,
 	if (IS_ERR(s))
 		return PTR_ERR(s);
 
-	error = path_lookup(s, LOOKUP_PARENT, nd);
+	error = kern_path_parent(s, nd);
 	if (error)
 		putname(s);
 	else
diff --git a/include/linux/namei.h b/include/linux/namei.h
index f276d4f..58ce343 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -72,7 +72,7 @@ extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
 extern int kern_path(const char *, unsigned, struct path *);
 
-extern int path_lookup(const char *, unsigned, struct nameidata *);
+extern int kern_path_parent(const char *, struct nameidata *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct nameidata *);
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 20b9fe6..e683869 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -359,7 +359,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 	struct dentry *d;
 	int err;
 
-	err = path_lookup(watch->path, LOOKUP_PARENT, &nd);
+	err = kern_path_parent(watch->path, &nd);
 	if (err)
 		return err;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index dd419d2..d8c04a60 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -850,7 +850,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		 * Get the parent directory, calculate the hash for last
 		 * component.
 		 */
-		err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
+		err = kern_path_parent(sunaddr->sun_path, &nd);
 		if (err)
 			goto out_mknod_parent;
 
-- 
cgit v0.10.2


From 52094c8a0610cf57920ad4c6c57470ae2ccbbd25 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 21 Feb 2011 21:34:47 -0500
Subject: take RCU-dependent stuff around exec_permission() into a new helper

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 1d6bc81..8c70446 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1322,6 +1322,18 @@ fail:
 	return PTR_ERR(dentry);
 }
 
+static inline int may_lookup(struct nameidata *nd)
+{
+	if (nd->flags & LOOKUP_RCU) {
+		int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+		if (err != -ECHILD)
+			return err;
+		if (nameidata_drop_rcu(nd))
+			return -ECHILD;
+	}
+	return exec_permission(nd->inode, 0);
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1352,17 +1364,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		unsigned int c;
 
 		nd->flags |= LOOKUP_CONTINUE;
-		if (nd->flags & LOOKUP_RCU) {
-			err = exec_permission(nd->inode, IPERM_FLAG_RCU);
-			if (err == -ECHILD) {
-				if (nameidata_drop_rcu(nd))
-					return -ECHILD;
-				goto exec_again;
-			}
-		} else {
-exec_again:
-			err = exec_permission(nd->inode, 0);
-		}
+
+		err = may_lookup(nd);
  		if (err)
 			break;
 
-- 
cgit v0.10.2


From ee0827cd6b42b0385dc1a116cd853ac1b739f711 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 21 Feb 2011 23:38:09 -0500
Subject: sanitize path_walk() mess

New helper: path_lookupat().  Basically, what do_path_lookup() boils to
modulo -ECHILD/-ESTALE handler.  path_walk* family is gone; vfs_path_lookup()
is using link_path_walk() directly, do_path_lookup() and do_filp_open()
are using path_lookupat().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 8c70446..f5de5bb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1520,59 +1520,6 @@ return_err:
 	return err;
 }
 
-static inline int path_walk_rcu(const char *name, struct nameidata *nd)
-{
-	current->total_link_count = 0;
-
-	return link_path_walk(name, nd);
-}
-
-static inline int path_walk_simple(const char *name, struct nameidata *nd)
-{
-	current->total_link_count = 0;
-
-	return link_path_walk(name, nd);
-}
-
-static int path_walk(const char *name, struct nameidata *nd)
-{
-	struct path save = nd->path;
-	int result;
-
-	current->total_link_count = 0;
-
-	/* make sure the stuff we saved doesn't go away */
-	path_get(&save);
-
-	result = link_path_walk(name, nd);
-	if (result == -ESTALE) {
-		/* nd->path had been dropped */
-		current->total_link_count = 0;
-		nd->path = save;
-		nd->inode = save.dentry->d_inode;
-		path_get(&nd->path);
-		nd->flags |= LOOKUP_REVAL;
-		result = link_path_walk(name, nd);
-	}
-
-	path_put(&save);
-
-	return result;
-}
-
-static void path_finish_rcu(struct nameidata *nd)
-{
-	if (nd->flags & LOOKUP_RCU) {
-		/* RCU dangling. Cancel it. */
-		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
-	}
-	if (nd->file)
-		fput(nd->file);
-}
-
 static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
@@ -1697,7 +1644,7 @@ out_fail:
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int do_path_lookup(int dfd, const char *name,
+static int path_lookupat(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
 	int retval;
@@ -1716,29 +1663,45 @@ static int do_path_lookup(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	retval = path_init_rcu(dfd, name, flags, nd);
+	if (flags & LOOKUP_RCU)
+		retval = path_init_rcu(dfd, name, flags, nd);
+	else
+		retval = path_init(dfd, name, flags, nd);
+
 	if (unlikely(retval))
 		return retval;
-	retval = path_walk_rcu(name, nd);
-	path_finish_rcu(nd);
+
+	current->total_link_count = 0;
+	retval = link_path_walk(name, nd);
+
+	if (nd->flags & LOOKUP_RCU) {
+		/* RCU dangling. Cancel it. */
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
+
+	if (nd->file) {
+		fput(nd->file);
+		nd->file = NULL;
+	}
+
 	if (nd->root.mnt) {
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
+	return retval;
+}
 
-	if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
-		/* slower, locked walk */
-		if (retval == -ESTALE)
-			flags |= LOOKUP_REVAL;
-		retval = path_init(dfd, name, flags, nd);
-		if (unlikely(retval))
-			return retval;
-		retval = path_walk(name, nd);
-		if (nd->root.mnt) {
-			path_put(&nd->root);
-			nd->root.mnt = NULL;
-		}
-	}
+static int do_path_lookup(int dfd, const char *name,
+				unsigned int flags, struct nameidata *nd)
+{
+	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+	if (unlikely(retval == -ECHILD))
+		retval = path_lookupat(dfd, name, flags, nd);
+	if (unlikely(retval == -ESTALE))
+		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
 
 	if (likely(!retval)) {
 		if (unlikely(!audit_dummy_context())) {
@@ -1746,7 +1709,6 @@ static int do_path_lookup(int dfd, const char *name,
 				audit_inode(name, nd->path.dentry);
 		}
 	}
-
 	return retval;
 }
 
@@ -1776,7 +1738,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		    const char *name, unsigned int flags,
 		    struct nameidata *nd)
 {
-	int retval;
+	int result;
 
 	/* same as do_path_lookup */
 	nd->last_type = LAST_ROOT;
@@ -1790,15 +1752,27 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	path_get(&nd->root);
 	nd->inode = nd->path.dentry->d_inode;
 
-	retval = path_walk(name, nd);
-	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
+	current->total_link_count = 0;
+
+	result = link_path_walk(name, nd);
+	if (result == -ESTALE) {
+		/* nd->path had been dropped */
+		current->total_link_count = 0;
+		nd->path.dentry = dentry;
+		nd->path.mnt = mnt;
+		nd->inode = dentry->d_inode;
+		path_get(&nd->path);
+		nd->flags |= LOOKUP_REVAL;
+		result = link_path_walk(name, nd);
+	}
+	if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
 				nd->inode))
 		audit_inode(name, nd->path.dentry);
 
 	path_put(&nd->root);
 	nd->root.mnt = NULL;
 
-	return retval;
+	return result;
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
@@ -2483,24 +2457,14 @@ out_filp2:
 
 creat:
 	/* OK, have to create the file. Find the parent. */
-	error = path_init_rcu(dfd, pathname,
-			LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-	if (error)
-		goto out_filp;
-	error = path_walk_rcu(pathname, &nd);
-	path_finish_rcu(&nd);
-	if (unlikely(error == -ECHILD || error == -ESTALE)) {
-		/* slower, locked walk */
-		if (error == -ESTALE) {
+	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | LOOKUP_RCU, &nd);
+	if (unlikely(error == -ECHILD))
+		error = path_lookupat(dfd, pathname, LOOKUP_PARENT, &nd);
+	if (unlikely(error == -ESTALE)) {
 reval:
-			flags |= LOOKUP_REVAL;
-		}
-		error = path_init(dfd, pathname,
-				LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-		if (error)
-			goto out_filp;
-
-		error = path_walk_simple(pathname, &nd);
+		flags |= LOOKUP_REVAL;
+		error = path_lookupat(dfd, pathname,
+				LOOKUP_PARENT | LOOKUP_REVAL, &nd);
 	}
 	if (unlikely(error))
 		goto out_filp;
-- 
cgit v0.10.2


From e41f7d4ee5bdb00da7d327a00b0ab9c4a2e9eaa3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 14:02:58 -0500
Subject: merge path_init and path_init_rcu

Actual dependency on whether we want RCU or not is in 3 small areas
(as it ought to be) and everything around those is the same in both
versions.  Since each function has only one caller and those callers
are on two sides of if (flags & LOOKUP_RCU), it's easier and cleaner
to merge them and pull the checks inside.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index f5de5bb..b9e5379 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1520,45 +1520,44 @@ return_err:
 	return err;
 }
 
-static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
 	int fput_needed;
 	struct file *file;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
-	nd->flags = flags | LOOKUP_RCU;
+	nd->flags = flags;
 	nd->depth = 0;
 	nd->root.mnt = NULL;
 	nd->file = NULL;
 
 	if (*name=='/') {
-		struct fs_struct *fs = current->fs;
-		unsigned seq;
-
-		br_read_lock(vfsmount_lock);
-		rcu_read_lock();
-
-		do {
-			seq = read_seqcount_begin(&fs->seq);
-			nd->root = fs->root;
-			nd->path = nd->root;
-			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
-
+		if (flags & LOOKUP_RCU) {
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
+			set_root_rcu(nd);
+		} else {
+			set_root(nd);
+			path_get(&nd->root);
+		}
+		nd->path = nd->root;
 	} else if (dfd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		unsigned seq;
-
-		br_read_lock(vfsmount_lock);
-		rcu_read_lock();
+		if (flags & LOOKUP_RCU) {
+			struct fs_struct *fs = current->fs;
+			unsigned seq;
 
-		do {
-			seq = read_seqcount_begin(&fs->seq);
-			nd->path = fs->pwd;
-			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
 
+			do {
+				seq = read_seqcount_begin(&fs->seq);
+				nd->path = fs->pwd;
+				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			} while (read_seqcount_retry(&fs->seq, seq));
+		} else {
+			get_fs_pwd(current->fs, &nd->path);
+		}
 	} else {
 		struct dentry *dentry;
 
@@ -1578,62 +1577,18 @@ static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct n
 			goto fput_fail;
 
 		nd->path = file->f_path;
-		if (fput_needed)
-			nd->file = file;
-
-		nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-		br_read_lock(vfsmount_lock);
-		rcu_read_lock();
+		if (flags & LOOKUP_RCU) {
+			if (fput_needed)
+				nd->file = file;
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
+		} else {
+			path_get(&file->f_path);
+			fput_light(file, fput_needed);
+		}
 	}
-	nd->inode = nd->path.dentry->d_inode;
-	return 0;
 
-fput_fail:
-	fput_light(file, fput_needed);
-out_fail:
-	return retval;
-}
-
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
-{
-	int retval = 0;
-	int fput_needed;
-	struct file *file;
-
-	nd->last_type = LAST_ROOT; /* if there are only slashes... */
-	nd->flags = flags;
-	nd->depth = 0;
-	nd->root.mnt = NULL;
-
-	if (*name=='/') {
-		set_root(nd);
-		nd->path = nd->root;
-		path_get(&nd->root);
-	} else if (dfd == AT_FDCWD) {
-		get_fs_pwd(current->fs, &nd->path);
-	} else {
-		struct dentry *dentry;
-
-		file = fget_light(dfd, &fput_needed);
-		retval = -EBADF;
-		if (!file)
-			goto out_fail;
-
-		dentry = file->f_path.dentry;
-
-		retval = -ENOTDIR;
-		if (!S_ISDIR(dentry->d_inode->i_mode))
-			goto fput_fail;
-
-		retval = file_permission(file, MAY_EXEC);
-		if (retval)
-			goto fput_fail;
-
-		nd->path = file->f_path;
-		path_get(&file->f_path);
-
-		fput_light(file, fput_needed);
-	}
 	nd->inode = nd->path.dentry->d_inode;
 	return 0;
 
@@ -1663,10 +1618,7 @@ static int path_lookupat(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	if (flags & LOOKUP_RCU)
-		retval = path_init_rcu(dfd, name, flags, nd);
-	else
-		retval = path_init(dfd, name, flags, nd);
+	retval = path_init(dfd, name, flags, nd);
 
 	if (unlikely(retval))
 		return retval;
-- 
cgit v0.10.2


From fe479a580dc9c737c4eb49ff7fdb31d41d2c7003 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 15:10:03 -0500
Subject: merge component type recognition

no need to do it in three places...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index b9e5379..4521b5f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1362,6 +1362,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		unsigned long hash;
 		struct qstr this;
 		unsigned int c;
+		int type;
 
 		nd->flags |= LOOKUP_CONTINUE;
 
@@ -1381,6 +1382,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		this.len = name - (const char *) this.name;
 		this.hash = end_name_hash(hash);
 
+		type = LAST_NORM;
+		if (this.name[0] == '.') switch (this.len) {
+			case 2:
+				if (this.name[1] == '.')
+					type = LAST_DOTDOT;
+				break;
+			case 1:
+				type = LAST_DOT;
+		}
+
 		/* remove trailing slashes? */
 		if (!c)
 			goto last_component;
@@ -1393,21 +1404,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * to be able to know about the current root directory and
 		 * parent relationships.
 		 */
-		if (this.name[0] == '.') switch (this.len) {
-			default:
-				break;
-			case 2:
-				if (this.name[1] != '.')
-					break;
+		if (unlikely(type != LAST_NORM)) {
+			if (type == LAST_DOTDOT) {
 				if (nd->flags & LOOKUP_RCU) {
 					if (follow_dotdot_rcu(nd))
 						return -ECHILD;
 				} else
 					follow_dotdot(nd);
-				/* fallthrough */
-			case 1:
-				continue;
+			}
+			continue;
 		}
+
 		/* This does the actual lookups.. */
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1441,20 +1448,15 @@ last_component:
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
-		if (this.name[0] == '.') switch (this.len) {
-			default:
-				break;
-			case 2:
-				if (this.name[1] != '.')
-					break;
+		if (unlikely(type != LAST_NORM)) {
+			if (type == LAST_DOTDOT) {
 				if (nd->flags & LOOKUP_RCU) {
 					if (follow_dotdot_rcu(nd))
 						return -ECHILD;
 				} else
 					follow_dotdot(nd);
-				/* fallthrough */
-			case 1:
-				goto return_reval;
+			}
+			goto return_reval;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1480,14 +1482,8 @@ last_component:
 		goto return_base;
 lookup_parent:
 		nd->last = this;
-		nd->last_type = LAST_NORM;
-		if (this.name[0] != '.')
-			goto return_base;
-		if (this.len == 1)
-			nd->last_type = LAST_DOT;
-		else if (this.len == 2 && this.name[1] == '.')
-			nd->last_type = LAST_DOTDOT;
-		else
+		nd->last_type = type;
+		if (type == LAST_NORM)
 			goto return_base;
 return_reval:
 		/*
-- 
cgit v0.10.2


From 16c2cd7179881d5dd87779512ca5a0d657c64f62 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 15:50:10 -0500
Subject: untangle the "need_reval_dot" mess

instead of ad-hackery around need_reval_dot(), do the following:
set a flag (LOOKUP_JUMPED) in the beginning of path, on absolute
symlink traversal, on ".." and on procfs-style symlinks.  Clear on
normal components, leave unchanged on ".".  Non-nested callers of
link_path_walk() call handle_reval_path(), which checks that flag
is set and that fs does want the final revalidate thing, then does
->d_revalidate().  In link_path_walk() all the return_reval stuff
is gone.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 4521b5f..450b686 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -613,19 +613,8 @@ do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
 	return dentry;
 }
 
-static inline int need_reval_dot(struct dentry *dentry)
-{
-	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
-		return 0;
-
-	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
-		return 0;
-
-	return 1;
-}
-
 /*
- * force_reval_path - force revalidation of a dentry
+ * handle_reval_path - force revalidation of a dentry
  *
  * In some situations the path walking code will trust dentries without
  * revalidating them. This causes problems for filesystems that depend on
@@ -639,27 +628,28 @@ static inline int need_reval_dot(struct dentry *dentry)
  * invalidate the dentry. It's up to the caller to handle putting references
  * to the path if necessary.
  */
-static int
-force_reval_path(struct path *path, struct nameidata *nd)
+static inline int handle_reval_path(struct nameidata *nd)
 {
+	struct dentry *dentry = nd->path.dentry;
 	int status;
-	struct dentry *dentry = path->dentry;
 
-	/*
-	 * only check on filesystems where it's possible for the dentry to
-	 * become stale.
-	 */
-	if (!need_reval_dot(dentry))
+	if (likely(!(nd->flags & LOOKUP_JUMPED)))
+		return 0;
+
+	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
 		return 0;
 
+	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
+		return 0;
+
+	/* Note: we do not d_invalidate() */
 	status = d_revalidate(dentry, nd);
 	if (status > 0)
 		return 0;
 
-	if (!status) {
-		d_invalidate(dentry);
+	if (!status)
 		status = -ESTALE;
-	}
+
 	return status;
 }
 
@@ -728,6 +718,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		path_put(&nd->path);
 		nd->path = nd->root;
 		path_get(&nd->root);
+		nd->flags |= LOOKUP_JUMPED;
 	}
 	nd->inode = nd->path.dentry->d_inode;
 
@@ -779,11 +770,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 		error = 0;
 		if (s)
 			error = __vfs_follow_link(nd, s);
-		else if (nd->last_type == LAST_BIND) {
-			error = force_reval_path(&nd->path, nd);
-			if (error)
-				path_put(&nd->path);
-		}
+		else if (nd->last_type == LAST_BIND)
+			nd->flags |= LOOKUP_JUMPED;
 	}
 	return error;
 }
@@ -1351,7 +1339,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	while (*name=='/')
 		name++;
 	if (!*name)
-		goto return_reval;
+		goto return_base;
 
 	if (nd->depth)
 		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
@@ -1385,12 +1373,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		type = LAST_NORM;
 		if (this.name[0] == '.') switch (this.len) {
 			case 2:
-				if (this.name[1] == '.')
+				if (this.name[1] == '.') {
 					type = LAST_DOTDOT;
+					nd->flags |= LOOKUP_JUMPED;
+				}
 				break;
 			case 1:
 				type = LAST_DOT;
 		}
+		if (likely(type == LAST_NORM))
+			nd->flags &= ~LOOKUP_JUMPED;
 
 		/* remove trailing slashes? */
 		if (!c)
@@ -1456,7 +1448,7 @@ last_component:
 				} else
 					follow_dotdot(nd);
 			}
-			goto return_reval;
+			goto return_base;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1483,24 +1475,6 @@ last_component:
 lookup_parent:
 		nd->last = this;
 		nd->last_type = type;
-		if (type == LAST_NORM)
-			goto return_base;
-return_reval:
-		/*
-		 * We bypassed the ordinary revalidation routines.
-		 * We may need to check the cached dentry for staleness.
-		 */
-		if (need_reval_dot(nd->path.dentry)) {
-			if (nameidata_drop_rcu_last_maybe(nd))
-				return -ECHILD;
-			/* Note: we do not d_invalidate() */
-			err = d_revalidate(nd->path.dentry, nd);
-			if (!err)
-				err = -ESTALE;
-			if (err < 0)
-				break;
-			return 0;
-		}
 return_base:
 		if (nameidata_drop_rcu_last_maybe(nd))
 			return -ECHILD;
@@ -1523,7 +1497,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 	struct file *file;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
-	nd->flags = flags;
+	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 	nd->root.mnt = NULL;
 	nd->file = NULL;
@@ -1630,6 +1604,9 @@ static int path_lookupat(int dfd, const char *name,
 		br_read_unlock(vfsmount_lock);
 	}
 
+	if (!retval)
+		retval = handle_reval_path(nd);
+
 	if (nd->file) {
 		fput(nd->file);
 		nd->file = NULL;
@@ -1690,7 +1667,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 
 	/* same as do_path_lookup */
 	nd->last_type = LAST_ROOT;
-	nd->flags = flags;
+	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 
 	nd->path.dentry = dentry;
@@ -1703,6 +1680,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	current->total_link_count = 0;
 
 	result = link_path_walk(name, nd);
+	if (!result)
+		result = handle_reval_path(nd);
 	if (result == -ESTALE) {
 		/* nd->path had been dropped */
 		current->total_link_count = 0;
@@ -1710,8 +1689,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		nd->path.mnt = mnt;
 		nd->inode = dentry->d_inode;
 		path_get(&nd->path);
-		nd->flags |= LOOKUP_REVAL;
+		nd->flags = flags | LOOKUP_JUMPED | LOOKUP_REVAL;
+
 		result = link_path_walk(name, nd);
+		if (!result)
+			result = handle_reval_path(nd);
 	}
 	if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
 				nd->inode))
@@ -2198,30 +2180,29 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
-	int error = -EISDIR;
+	int error;
 
 	switch (nd->last_type) {
 	case LAST_DOTDOT:
 		follow_dotdot(nd);
 		dir = nd->path.dentry;
 	case LAST_DOT:
-		if (need_reval_dot(dir)) {
-			int status = d_revalidate(nd->path.dentry, nd);
-			if (!status)
-				status = -ESTALE;
-			if (status < 0) {
-				error = status;
-				goto exit;
-			}
-		}
 		/* fallthrough */
 	case LAST_ROOT:
+		error = handle_reval_path(nd);
+		if (error)
+			goto exit;
+		error = -EISDIR;
 		goto exit;
 	case LAST_BIND:
+		error = handle_reval_path(nd);
+		if (error)
+			goto exit;
 		audit_inode(pathname, dir);
 		goto ok;
 	}
 
+	error = -EISDIR;
 	/* trailing slashes? */
 	if (nd->last.name[nd->last.len])
 		goto exit;
@@ -2422,7 +2403,7 @@ reval:
 	/*
 	 * We have the parent and last component.
 	 */
-	nd.flags = flags;
+	nd.flags = (nd.flags & ~LOOKUP_PARENT) | flags;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 58ce343..265378a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -63,6 +63,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_EXCL		0x0400
 #define LOOKUP_RENAME_TARGET	0x0800
 
+#define LOOKUP_JUMPED		0x1000
+
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
 #define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path)
-- 
cgit v0.10.2


From 086e183a641109033420e0b26ddecb6f4abb4c89 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 20:56:27 -0500
Subject: pull dropping RCU on success of link_path_walk() into path_lookupat()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 450b686..8f10a9f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -539,14 +539,6 @@ err_unlock:
 	return -ECHILD;
 }
 
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
-{
-	if (likely(nd->flags & LOOKUP_RCU))
-		return nameidata_drop_rcu_last(nd);
-	return 0;
-}
-
 /**
  * release_open_intent - free up open intent resources
  * @nd: pointer to nameidata
@@ -1339,7 +1331,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	while (*name=='/')
 		name++;
 	if (!*name)
-		goto return_base;
+		return 0;
 
 	if (nd->depth)
 		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
@@ -1448,7 +1440,7 @@ last_component:
 				} else
 					follow_dotdot(nd);
 			}
-			goto return_base;
+			return 0;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1471,13 +1463,10 @@ last_component:
 			if (!nd->inode->i_op->lookup)
 				break;
 		}
-		goto return_base;
+		return 0;
 lookup_parent:
 		nd->last = this;
 		nd->last_type = type;
-return_base:
-		if (nameidata_drop_rcu_last_maybe(nd))
-			return -ECHILD;
 		return 0;
 out_dput:
 		if (!(nd->flags & LOOKUP_RCU))
@@ -1598,10 +1587,15 @@ static int path_lookupat(int dfd, const char *name,
 
 	if (nd->flags & LOOKUP_RCU) {
 		/* RCU dangling. Cancel it. */
-		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
+		if (!retval) {
+			if (nameidata_drop_rcu_last(nd))
+				retval = -ECHILD;
+		} else {
+			nd->flags &= ~LOOKUP_RCU;
+			nd->root.mnt = NULL;
+			rcu_read_unlock();
+			br_read_unlock(vfsmount_lock);
+		}
 	}
 
 	if (!retval)
-- 
cgit v0.10.2


From 36f3b4f69070fee7c647bab5dc4408990bb3606c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 21:24:38 -0500
Subject: pull security_inode_follow_link() into __do_follow_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 8f10a9f..f956567 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -754,6 +754,13 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 	if (link->mnt == nd->path.mnt)
 		mntget(link->mnt);
 
+	error = security_inode_follow_link(link->dentry, nd);
+	if (error) {
+		*p = ERR_PTR(error); /* no ->put_link(), please */
+		path_put(&nd->path);
+		return error;
+	}
+
 	nd->last_type = LAST_BIND;
 	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
 	error = PTR_ERR(*p);
@@ -791,9 +798,6 @@ static inline int do_follow_link(struct inode *inode, struct path *path, struct
 		goto loop;
 	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 	cond_resched();
-	err = security_inode_follow_link(path->dentry, nd);
-	if (err)
-		goto loop;
 	current->link_count++;
 	current->total_link_count++;
 	nd->depth++;
@@ -2420,9 +2424,6 @@ reval:
 		 * just set LAST_BIND.
 		 */
 		nd.flags |= LOOKUP_PARENT;
-		error = security_inode_follow_link(link.dentry, &nd);
-		if (error)
-			goto exit_dput;
 		error = __do_follow_link(&link, &nd, &cookie);
 		if (unlikely(error)) {
 			if (!IS_ERR(cookie) && linki->i_op->put_link)
-- 
cgit v0.10.2


From f1afe9efc84476ca42fbb7301a441021063eead7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 22:27:28 -0500
Subject: clean up the failure exits after __do_follow_link() in do_filp_open()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index f956567..e0f5903 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2426,15 +2426,12 @@ reval:
 		nd.flags |= LOOKUP_PARENT;
 		error = __do_follow_link(&link, &nd, &cookie);
 		if (unlikely(error)) {
-			if (!IS_ERR(cookie) && linki->i_op->put_link)
-				linki->i_op->put_link(link.dentry, &nd, cookie);
-			/* nd.path had been dropped */
-			nd.path = link;
-			goto out_path;
+			filp = ERR_PTR(error);
+		} else {
+			nd.flags &= ~LOOKUP_PARENT;
+			filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 		}
-		nd.flags &= ~LOOKUP_PARENT;
-		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		if (linki->i_op->put_link)
+		if (!IS_ERR(cookie) && linki->i_op->put_link)
 			linki->i_op->put_link(link.dentry, &nd, cookie);
 		path_put(&link);
 	}
-- 
cgit v0.10.2


From c3e380b0b3cfa613189fb91513efd88a65e1d9d8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 13:39:45 -0500
Subject: Collect "operation mode" arguments of do_last() into a structure

No point messing with passing shitloads of "operation mode" arguments
to do_open() one by one, especially since they are not going to change
during do_filp_open().  Collect them into a struct, fill it and pass
to do_last() by reference.

Make sure that lookup intent flags are correctly set and removed - we
want them for do_last(), but they make no sense for __do_follow_link().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index e0f5903..5e4206f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2169,17 +2169,26 @@ exit:
 	return ERR_PTR(error);
 }
 
+struct open_flags {
+	int open_flag;
+	int mode;
+	int acc_mode;
+	int intent;
+};
+
 /*
  * Handle O_CREAT case for do_filp_open
  */
 static struct file *do_last(struct nameidata *nd, struct path *path,
-			    int open_flag, int acc_mode,
-			    int mode, const char *pathname)
+			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
 	int error;
 
+	nd->flags &= ~LOOKUP_PARENT;
+	nd->flags |= op->intent;
+
 	switch (nd->last_type) {
 	case LAST_DOTDOT:
 		follow_dotdot(nd);
@@ -2233,7 +2242,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(nd, path, open_flag, mode);
+		error = __open_namei_create(nd, path, op->open_flag, op->mode);
 		if (error) {
 			mnt_drop_write(nd->path.mnt);
 			goto exit;
@@ -2242,7 +2251,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		mnt_drop_write(nd->path.mnt);
 		path_put(&nd->path);
 		if (!IS_ERR(filp)) {
-			error = ima_file_check(filp, acc_mode);
+			error = ima_file_check(filp, op->acc_mode);
 			if (error) {
 				fput(filp);
 				filp = ERR_PTR(error);
@@ -2258,7 +2267,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (open_flag & O_EXCL)
+	if (op->open_flag & O_EXCL)
 		goto exit_dput;
 
 	error = follow_managed(path, nd->flags);
@@ -2278,7 +2287,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	filp = finish_open(nd, open_flag, acc_mode);
+	filp = finish_open(nd, op->open_flag, op->acc_mode);
 	return filp;
 
 exit_mutex_unlock:
@@ -2304,7 +2313,8 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	struct path path;
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
-	int flags;
+	int flags = 0;
+	struct open_flags op;
 
 	if (!(open_flag & O_CREAT))
 		mode = 0;
@@ -2321,6 +2331,8 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (open_flag & __O_SYNC)
 		open_flag |= O_DSYNC;
 
+	op.open_flag = open_flag;
+
 	if (!acc_mode)
 		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
 
@@ -2333,12 +2345,15 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (open_flag & O_APPEND)
 		acc_mode |= MAY_APPEND;
 
-	flags = LOOKUP_OPEN;
+	op.acc_mode = acc_mode;
+
+	op.intent = LOOKUP_OPEN;
 	if (open_flag & O_CREAT) {
-		flags |= LOOKUP_CREATE;
+		op.intent |= LOOKUP_CREATE;
 		if (open_flag & O_EXCL)
-			flags |= LOOKUP_EXCL;
+			op.intent |= LOOKUP_EXCL;
 	}
+
 	if (open_flag & O_DIRECTORY)
 		flags |= LOOKUP_DIRECTORY;
 	if (!(open_flag & O_NOFOLLOW))
@@ -2357,7 +2372,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 		goto creat;
 
 	/* !O_CREAT, simple open */
-	error = do_path_lookup(dfd, pathname, flags, &nd);
+	error = do_path_lookup(dfd, pathname, flags | op.intent, &nd);
 	if (unlikely(error))
 		goto out_filp2;
 	error = -ELOOP;
@@ -2384,14 +2399,14 @@ out_filp2:
 
 creat:
 	/* OK, have to create the file. Find the parent. */
-	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | LOOKUP_RCU, &nd);
+	error = path_lookupat(dfd, pathname,
+			LOOKUP_PARENT | LOOKUP_RCU | flags, &nd);
 	if (unlikely(error == -ECHILD))
-		error = path_lookupat(dfd, pathname, LOOKUP_PARENT, &nd);
+		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
 	if (unlikely(error == -ESTALE)) {
 reval:
 		flags |= LOOKUP_REVAL;
-		error = path_lookupat(dfd, pathname,
-				LOOKUP_PARENT | LOOKUP_REVAL, &nd);
+		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
 	}
 	if (unlikely(error))
 		goto out_filp;
@@ -2401,8 +2416,7 @@ reval:
 	/*
 	 * We have the parent and last component.
 	 */
-	nd.flags = (nd.flags & ~LOOKUP_PARENT) | flags;
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+	filp = do_last(&nd, &path, &op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
@@ -2424,13 +2438,12 @@ reval:
 		 * just set LAST_BIND.
 		 */
 		nd.flags |= LOOKUP_PARENT;
+		nd.flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
 		error = __do_follow_link(&link, &nd, &cookie);
-		if (unlikely(error)) {
+		if (unlikely(error))
 			filp = ERR_PTR(error);
-		} else {
-			nd.flags &= ~LOOKUP_PARENT;
-			filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		}
+		else
+			filp = do_last(&nd, &path, &op, pathname);
 		if (!IS_ERR(cookie) && linki->i_op->put_link)
 			linki->i_op->put_link(link.dentry, &nd, cookie);
 		path_put(&link);
-- 
cgit v0.10.2


From 47c805dc2d2dff686962f5f0baa6bac2d703ba19 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 17:44:09 -0500
Subject: switch do_filp_open() to struct open_flags

take calculation of open_flags by open(2) arguments into new helper
in fs/open.c, move filp_open() over there, have it and do_sys_open()
use that helper, switch exec.c callers of do_filp_open() to explicit
(and constant) struct open_flags.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/exec.c b/fs/exec.c
index 52a447d..ba99e1a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	struct file *file;
 	char *tmp = getname(library);
 	int error = PTR_ERR(tmp);
+	static const struct open_flags uselib_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN
+	};
 
 	if (IS_ERR(tmp))
 		goto out;
 
-	file = do_filp_open(AT_FDCWD, tmp,
-				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
-				MAY_READ | MAY_EXEC | MAY_OPEN);
+	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
 	putname(tmp);
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
 {
 	struct file *file;
 	int err;
+	static const struct open_flags open_exec_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN
+	};
 
-	file = do_filp_open(AT_FDCWD, name,
-				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
-				MAY_EXEC | MAY_OPEN);
+	file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
 	if (IS_ERR(file))
 		goto out;
 
diff --git a/fs/internal.h b/fs/internal.h
index 9b976b5..6fdbdf2 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -106,6 +106,14 @@ extern void put_super(struct super_block *sb);
 struct nameidata;
 extern struct file *nameidata_to_filp(struct nameidata *);
 extern void release_open_intent(struct nameidata *);
+struct open_flags {
+	int open_flag;
+	int mode;
+	int acc_mode;
+	int intent;
+};
+extern struct file *do_filp_open(int dfd, const char *pathname,
+		const struct open_flags *op, int lookup_flags);
 
 /*
  * inode.c
diff --git a/fs/namei.c b/fs/namei.c
index 5e4206f..9c7fa94 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2169,13 +2169,6 @@ exit:
 	return ERR_PTR(error);
 }
 
-struct open_flags {
-	int open_flag;
-	int mode;
-	int acc_mode;
-	int intent;
-};
-
 /*
  * Handle O_CREAT case for do_filp_open
  */
@@ -2305,74 +2298,28 @@ exit:
  * open_to_namei_flags() for more details.
  */
 struct file *do_filp_open(int dfd, const char *pathname,
-		int open_flag, int mode, int acc_mode)
+		const struct open_flags *op, int flags)
 {
 	struct file *filp;
 	struct nameidata nd;
 	int error;
 	struct path path;
 	int count = 0;
-	int flag = open_to_namei_flags(open_flag);
-	int flags = 0;
-	struct open_flags op;
-
-	if (!(open_flag & O_CREAT))
-		mode = 0;
-
-	/* Must never be set by userspace */
-	open_flag &= ~FMODE_NONOTIFY;
-
-	/*
-	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
-	 * check for O_DSYNC if the need any syncing at all we enforce it's
-	 * always set instead of having to deal with possibly weird behaviour
-	 * for malicious applications setting only __O_SYNC.
-	 */
-	if (open_flag & __O_SYNC)
-		open_flag |= O_DSYNC;
-
-	op.open_flag = open_flag;
-
-	if (!acc_mode)
-		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
-
-	/* O_TRUNC implies we need access checks for write permissions */
-	if (open_flag & O_TRUNC)
-		acc_mode |= MAY_WRITE;
-
-	/* Allow the LSM permission hook to distinguish append 
-	   access from general write access. */
-	if (open_flag & O_APPEND)
-		acc_mode |= MAY_APPEND;
-
-	op.acc_mode = acc_mode;
-
-	op.intent = LOOKUP_OPEN;
-	if (open_flag & O_CREAT) {
-		op.intent |= LOOKUP_CREATE;
-		if (open_flag & O_EXCL)
-			op.intent |= LOOKUP_EXCL;
-	}
-
-	if (open_flag & O_DIRECTORY)
-		flags |= LOOKUP_DIRECTORY;
-	if (!(open_flag & O_NOFOLLOW))
-		flags |= LOOKUP_FOLLOW;
 
 	filp = get_empty_filp();
 	if (!filp)
 		return ERR_PTR(-ENFILE);
 
-	filp->f_flags = open_flag;
+	filp->f_flags = op->open_flag;
 	nd.intent.open.file = filp;
-	nd.intent.open.flags = flag;
-	nd.intent.open.create_mode = mode;
+	nd.intent.open.flags = open_to_namei_flags(op->open_flag);
+	nd.intent.open.create_mode = op->mode;
 
-	if (open_flag & O_CREAT)
+	if (op->open_flag & O_CREAT)
 		goto creat;
 
 	/* !O_CREAT, simple open */
-	error = do_path_lookup(dfd, pathname, flags | op.intent, &nd);
+	error = do_path_lookup(dfd, pathname, flags | op->intent, &nd);
 	if (unlikely(error))
 		goto out_filp2;
 	error = -ELOOP;
@@ -2386,7 +2333,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 			goto out_path2;
 	}
 	audit_inode(pathname, nd.path.dentry);
-	filp = finish_open(&nd, open_flag, acc_mode);
+	filp = finish_open(&nd, op->open_flag, op->acc_mode);
 out2:
 	release_open_intent(&nd);
 	return filp;
@@ -2416,7 +2363,7 @@ reval:
 	/*
 	 * We have the parent and last component.
 	 */
-	filp = do_last(&nd, &path, &op, pathname);
+	filp = do_last(&nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
@@ -2443,7 +2390,7 @@ reval:
 		if (unlikely(error))
 			filp = ERR_PTR(error);
 		else
-			filp = do_last(&nd, &path, &op, pathname);
+			filp = do_last(&nd, &path, op, pathname);
 		if (!IS_ERR(cookie) && linki->i_op->put_link)
 			linki->i_op->put_link(link.dentry, &nd, cookie);
 		path_put(&link);
@@ -2466,23 +2413,6 @@ out_filp:
 }
 
 /**
- * filp_open - open file and return file pointer
- *
- * @filename:	path to open
- * @flags:	open flags as per the open(2) second argument
- * @mode:	mode for the new file if O_CREAT is set, else ignored
- *
- * This is the helper to open a file from kernelspace if you really
- * have to.  But in generally you should not do this, so please move
- * along, nothing to see here..
- */
-struct file *filp_open(const char *filename, int flags, int mode)
-{
-	return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
-}
-EXPORT_SYMBOL(filp_open);
-
-/**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
  * @is_dir: directory flag
diff --git a/fs/open.c b/fs/open.c
index b47aab3..d05e18c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -890,15 +890,86 @@ void fd_install(unsigned int fd, struct file *file)
 
 EXPORT_SYMBOL(fd_install);
 
+static inline int build_open_flags(int flags, int mode, struct open_flags *op)
+{
+	int lookup_flags = 0;
+	int acc_mode;
+
+	if (!(flags & O_CREAT))
+		mode = 0;
+	op->mode = mode;
+
+	/* Must never be set by userspace */
+	flags &= ~FMODE_NONOTIFY;
+
+	/*
+	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
+	 * check for O_DSYNC if the need any syncing at all we enforce it's
+	 * always set instead of having to deal with possibly weird behaviour
+	 * for malicious applications setting only __O_SYNC.
+	 */
+	if (flags & __O_SYNC)
+		flags |= O_DSYNC;
+
+	op->open_flag = flags;
+
+	acc_mode = MAY_OPEN | ACC_MODE(flags);
+
+	/* O_TRUNC implies we need access checks for write permissions */
+	if (flags & O_TRUNC)
+		acc_mode |= MAY_WRITE;
+
+	/* Allow the LSM permission hook to distinguish append
+	   access from general write access. */
+	if (flags & O_APPEND)
+		acc_mode |= MAY_APPEND;
+
+	op->acc_mode = acc_mode;
+
+	op->intent = LOOKUP_OPEN;
+	if (flags & O_CREAT) {
+		op->intent |= LOOKUP_CREATE;
+		if (flags & O_EXCL)
+			op->intent |= LOOKUP_EXCL;
+	}
+
+	if (flags & O_DIRECTORY)
+		lookup_flags |= LOOKUP_DIRECTORY;
+	if (!(flags & O_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+	return lookup_flags;
+}
+
+/**
+ * filp_open - open file and return file pointer
+ *
+ * @filename:	path to open
+ * @flags:	open flags as per the open(2) second argument
+ * @mode:	mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to.  But in generally you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+	struct open_flags op;
+	int lookup = build_open_flags(flags, mode, &op);
+	return do_filp_open(AT_FDCWD, filename, &op, lookup);
+}
+EXPORT_SYMBOL(filp_open);
+
 long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 {
+	struct open_flags op;
+	int lookup = build_open_flags(flags, mode, &op);
 	char *tmp = getname(filename);
 	int fd = PTR_ERR(tmp);
 
 	if (!IS_ERR(tmp)) {
 		fd = get_unused_fd_flags(flags);
 		if (fd >= 0) {
-			struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
+			struct file *f = do_filp_open(dfd, tmp, &op, lookup);
 			if (IS_ERR(f)) {
 				put_unused_fd(fd);
 				fd = PTR_ERR(f);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e38b50a4..9c75714 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2205,8 +2205,6 @@ extern struct file *create_read_pipe(struct file *f, int flags);
 extern struct file *create_write_pipe(int flags);
 extern void free_write_pipe(struct file *);
 
-extern struct file *do_filp_open(int dfd, const char *pathname,
-		int open_flag, int mode, int acc_mode);
 extern int may_open(struct path *, int, int);
 
 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
-- 
cgit v0.10.2


From 13aab428a73d3200b9283b61b7fdf5713181ac66 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 17:54:08 -0500
Subject: separate -ESTALE/-ECHILD retries in do_filp_open() from real work

new helper: path_openat().  Does what do_filp_open() does, except
that it tries only the walk mode (RCU/normal/force revalidation)
it had been told to.

Both create and non-create branches are using path_lookupat() now.
Fixed the double audit_inode() in non-create branch.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 9c7fa94..01a17dd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2292,19 +2292,14 @@ exit:
 	return ERR_PTR(error);
 }
 
-/*
- * Note that the low bits of the passed in "open_flag"
- * are not the same as in the local variable "flag". See
- * open_to_namei_flags() for more details.
- */
-struct file *do_filp_open(int dfd, const char *pathname,
+static struct file *path_openat(int dfd, const char *pathname,
 		const struct open_flags *op, int flags)
 {
 	struct file *filp;
 	struct nameidata nd;
-	int error;
 	struct path path;
 	int count = 0;
+	int error;
 
 	filp = get_empty_filp();
 	if (!filp)
@@ -2319,42 +2314,27 @@ struct file *do_filp_open(int dfd, const char *pathname,
 		goto creat;
 
 	/* !O_CREAT, simple open */
-	error = do_path_lookup(dfd, pathname, flags | op->intent, &nd);
+	error = path_lookupat(dfd, pathname, flags | op->intent, &nd);
 	if (unlikely(error))
-		goto out_filp2;
+		goto out_filp;
 	error = -ELOOP;
 	if (!(nd.flags & LOOKUP_FOLLOW)) {
 		if (nd.inode->i_op->follow_link)
-			goto out_path2;
+			goto out_path;
 	}
 	error = -ENOTDIR;
 	if (nd.flags & LOOKUP_DIRECTORY) {
 		if (!nd.inode->i_op->lookup)
-			goto out_path2;
+			goto out_path;
 	}
 	audit_inode(pathname, nd.path.dentry);
 	filp = finish_open(&nd, op->open_flag, op->acc_mode);
-out2:
 	release_open_intent(&nd);
 	return filp;
 
-out_path2:
-	path_put(&nd.path);
-out_filp2:
-	filp = ERR_PTR(error);
-	goto out2;
-
 creat:
 	/* OK, have to create the file. Find the parent. */
-	error = path_lookupat(dfd, pathname,
-			LOOKUP_PARENT | LOOKUP_RCU | flags, &nd);
-	if (unlikely(error == -ECHILD))
-		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
-	if (unlikely(error == -ESTALE)) {
-reval:
-		flags |= LOOKUP_REVAL;
-		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
-	}
+	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
 	if (unlikely(error))
 		goto out_filp;
 	if (unlikely(!audit_dummy_context()))
@@ -2398,8 +2378,6 @@ reval:
 out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
-	if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
-		goto reval;
 	release_open_intent(&nd);
 	return filp;
 
@@ -2412,6 +2390,19 @@ out_filp:
 	goto out;
 }
 
+struct file *do_filp_open(int dfd, const char *pathname,
+		const struct open_flags *op, int flags)
+{
+	struct file *filp;
+
+	filp = path_openat(dfd, pathname, op, flags | LOOKUP_RCU);
+	if (unlikely(filp == ERR_PTR(-ECHILD)))
+		filp = path_openat(dfd, pathname, op, flags);
+	if (unlikely(filp == ERR_PTR(-ESTALE)))
+		filp = path_openat(dfd, pathname, op, flags | LOOKUP_REVAL);
+	return filp;
+}
+
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
-- 
cgit v0.10.2


From 7bc055d1d524f209bf49d8b9cb220712dd7df4ed Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 19:41:31 -0500
Subject: kill out_dput: in link_path_walk()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 01a17dd..fea3636 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1407,22 +1407,19 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
 			break;
-		err = -ENOENT;
-		if (!inode)
-			goto out_dput;
 
-		if (inode->i_op->follow_link) {
+		if (inode && inode->i_op->follow_link) {
 			err = do_follow_link(inode, &next, nd);
 			if (err)
 				goto return_err;
 			nd->inode = nd->path.dentry->d_inode;
-			err = -ENOENT;
-			if (!nd->inode)
-				break;
 		} else {
 			path_to_nameidata(&next, nd);
 			nd->inode = inode;
 		}
+		err = -ENOENT;
+		if (!nd->inode)
+			break;
 		err = -ENOTDIR; 
 		if (!nd->inode->i_op->lookup)
 			break;
@@ -1472,10 +1469,6 @@ lookup_parent:
 		nd->last = this;
 		nd->last_type = type;
 		return 0;
-out_dput:
-		if (!(nd->flags & LOOKUP_RCU))
-			path_put_conditional(&next, nd);
-		break;
 	}
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&nd->path);
-- 
cgit v0.10.2


From 9856fa1b281eccdc9f8d94d716e96818c675e78e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:22:06 -0500
Subject: pull handling of . and .. into inlined helper

getting LOOKUP_RCU checks out of link_path_walk()...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index fea3636..d29f91e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1318,6 +1318,18 @@ static inline int may_lookup(struct nameidata *nd)
 	return exec_permission(nd->inode, 0);
 }
 
+static inline int handle_dots(struct nameidata *nd, int type)
+{
+	if (type == LAST_DOTDOT) {
+		if (nd->flags & LOOKUP_RCU) {
+			if (follow_dotdot_rcu(nd))
+				return -ECHILD;
+		} else
+			follow_dotdot(nd);
+	}
+	return 0;
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1393,13 +1405,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * parent relationships.
 		 */
 		if (unlikely(type != LAST_NORM)) {
-			if (type == LAST_DOTDOT) {
-				if (nd->flags & LOOKUP_RCU) {
-					if (follow_dotdot_rcu(nd))
-						return -ECHILD;
-				} else
-					follow_dotdot(nd);
-			}
+			if (handle_dots(nd, type))
+				return -ECHILD;
 			continue;
 		}
 
@@ -1434,13 +1441,8 @@ last_component:
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
 		if (unlikely(type != LAST_NORM)) {
-			if (type == LAST_DOTDOT) {
-				if (nd->flags & LOOKUP_RCU) {
-					if (follow_dotdot_rcu(nd))
-						return -ECHILD;
-				} else
-					follow_dotdot(nd);
-			}
+			if (handle_dots(nd, type))
+				return -ECHILD;
 			return 0;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
-- 
cgit v0.10.2


From 4455ca6223cc59cbc0a75f4be8bce9e84cc0d6b8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:28:10 -0500
Subject: clear RCU on all failure exits from link_path_walk()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index d29f91e..f09887a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1405,8 +1405,9 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * parent relationships.
 		 */
 		if (unlikely(type != LAST_NORM)) {
-			if (handle_dots(nd, type))
-				return -ECHILD;
+			err = handle_dots(nd, type);
+			if (err)
+				goto return_err;
 			continue;
 		}
 
@@ -1441,8 +1442,9 @@ last_component:
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
 		if (unlikely(type != LAST_NORM)) {
-			if (handle_dots(nd, type))
-				return -ECHILD;
+			err = handle_dots(nd, type);
+			if (err)
+				goto return_err;
 			return 0;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
@@ -1475,6 +1477,12 @@ lookup_parent:
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&nd->path);
 return_err:
+	if (nd->flags & LOOKUP_RCU) {
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
 	return err;
 }
 
@@ -1585,16 +1593,10 @@ static int path_lookupat(int dfd, const char *name,
 	retval = link_path_walk(name, nd);
 
 	if (nd->flags & LOOKUP_RCU) {
-		/* RCU dangling. Cancel it. */
-		if (!retval) {
-			if (nameidata_drop_rcu_last(nd))
-				retval = -ECHILD;
-		} else {
-			nd->flags &= ~LOOKUP_RCU;
-			nd->root.mnt = NULL;
-			rcu_read_unlock();
-			br_read_unlock(vfsmount_lock);
-		}
+		/* went all way through without dropping RCU */
+		BUG_ON(retval);
+		if (nameidata_drop_rcu_last(nd))
+			retval = -ECHILD;
 	}
 
 	if (!retval)
-- 
cgit v0.10.2


From ef7562d5283a91da3ba5c14de3221f47b7f08823 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:35:59 -0500
Subject: make handle_dots() leave RCU mode on error

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index f09887a..ea14bfb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1052,7 +1052,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 
 			seq = read_seqcount_begin(&parent->d_seq);
 			if (read_seqcount_retry(&old->d_seq, nd->seq))
-				return -ECHILD;
+				goto failed;
 			inode = parent->d_inode;
 			nd->path.dentry = parent;
 			nd->seq = seq;
@@ -1065,8 +1065,14 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	}
 	__follow_mount_rcu(nd, &nd->path, &inode, true);
 	nd->inode = inode;
-
 	return 0;
+
+failed:
+	nd->flags &= ~LOOKUP_RCU;
+	nd->root.mnt = NULL;
+	rcu_read_unlock();
+	br_read_unlock(vfsmount_lock);
+	return -ECHILD;
 }
 
 /*
@@ -1405,9 +1411,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * parent relationships.
 		 */
 		if (unlikely(type != LAST_NORM)) {
-			err = handle_dots(nd, type);
-			if (err)
-				goto return_err;
+			if (handle_dots(nd, type))
+				return -ECHILD;
 			continue;
 		}
 
@@ -1441,12 +1446,8 @@ last_component:
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
-		if (unlikely(type != LAST_NORM)) {
-			err = handle_dots(nd, type);
-			if (err)
-				goto return_err;
-			return 0;
-		}
+		if (unlikely(type != LAST_NORM))
+			return handle_dots(nd, type);
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
 			break;
-- 
cgit v0.10.2


From a7472baba22dd5d68580f528374f93421b33667e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:39:30 -0500
Subject: make nameidata_dentry_drop_rcu_maybe() always leave RCU mode

Now we have do_follow_link() guaranteed to leave without dangling RCU
and the next step will get LOOKUP_RCU logics completely out of
link_path_walk().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index ea14bfb..53bba7c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -498,8 +498,15 @@ err_root:
 /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
 static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
 {
-	if (nd->flags & LOOKUP_RCU)
-		return nameidata_dentry_drop_rcu(nd, dentry);
+	if (nd->flags & LOOKUP_RCU) {
+		if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
+			nd->flags &= ~LOOKUP_RCU;
+			nd->root.mnt = NULL;
+			rcu_read_unlock();
+			br_read_unlock(vfsmount_lock);
+			return -ECHILD;
+		}
+	}
 	return 0;
 }
 
@@ -1424,7 +1431,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (inode && inode->i_op->follow_link) {
 			err = do_follow_link(inode, &next, nd);
 			if (err)
-				goto return_err;
+				return err;
 			nd->inode = nd->path.dentry->d_inode;
 		} else {
 			path_to_nameidata(&next, nd);
@@ -1455,7 +1462,7 @@ last_component:
 		    (lookup_flags & LOOKUP_FOLLOW)) {
 			err = do_follow_link(inode, &next, nd);
 			if (err)
-				goto return_err;
+				return err;
 			nd->inode = nd->path.dentry->d_inode;
 		} else {
 			path_to_nameidata(&next, nd);
@@ -1477,7 +1484,6 @@ lookup_parent:
 	}
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&nd->path);
-return_err:
 	if (nd->flags & LOOKUP_RCU) {
 		nd->flags &= ~LOOKUP_RCU;
 		nd->root.mnt = NULL;
-- 
cgit v0.10.2


From 951361f954596bd134d4270df834f47d151f98a6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:44:37 -0500
Subject: get rid of the last LOOKUP_RCU dependencies in link_path_walk()

New helper: terminate_walk().  An error has happened during pathname
resolution and we either drop nd->path or terminate RCU, depending
the mode we had been in.  After that, nd is essentially empty.
Switch link_path_walk() to using that for cleanup.

Now the top-level logics in link_path_walk() is back to sanity.  RCU
dependencies are in the lower-level functions.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 53bba7c..85f6e39 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1343,6 +1343,18 @@ static inline int handle_dots(struct nameidata *nd, int type)
 	return 0;
 }
 
+static void terminate_walk(struct nameidata *nd)
+{
+	if (!(nd->flags & LOOKUP_RCU)) {
+		path_put(&nd->path);
+	} else {
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1482,14 +1494,7 @@ lookup_parent:
 		nd->last_type = type;
 		return 0;
 	}
-	if (!(nd->flags & LOOKUP_RCU))
-		path_put(&nd->path);
-	if (nd->flags & LOOKUP_RCU) {
-		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
-	}
+	terminate_walk(nd);
 	return err;
 }
 
-- 
cgit v0.10.2


From 70e9b3571107b88674cd55ae4bed33f76261e7d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Mar 2011 21:12:22 -0500
Subject: get rid of nd->file

Don't stash the struct file * used as starting point of walk in nameidata;
pass file ** to path_init() instead.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 85f6e39..a260a30 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1498,7 +1498,8 @@ lookup_parent:
 	return err;
 }
 
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags,
+		     struct nameidata *nd, struct file **fp)
 {
 	int retval = 0;
 	int fput_needed;
@@ -1508,7 +1509,6 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 	nd->root.mnt = NULL;
-	nd->file = NULL;
 
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
@@ -1557,7 +1557,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 		nd->path = file->f_path;
 		if (flags & LOOKUP_RCU) {
 			if (fput_needed)
-				nd->file = file;
+				*fp = file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			br_read_lock(vfsmount_lock);
 			rcu_read_lock();
@@ -1580,6 +1580,7 @@ out_fail:
 static int path_lookupat(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
+	struct file *base = NULL;
 	int retval;
 
 	/*
@@ -1596,7 +1597,7 @@ static int path_lookupat(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	retval = path_init(dfd, name, flags, nd);
+	retval = path_init(dfd, name, flags, nd, &base);
 
 	if (unlikely(retval))
 		return retval;
@@ -1614,10 +1615,8 @@ static int path_lookupat(int dfd, const char *name,
 	if (!retval)
 		retval = handle_reval_path(nd);
 
-	if (nd->file) {
-		fput(nd->file);
-		nd->file = NULL;
-	}
+	if (base)
+		fput(base);
 
 	if (nd->root.mnt) {
 		path_put(&nd->root);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 265378a..72ffd62 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -19,7 +19,6 @@ struct nameidata {
 	struct path	path;
 	struct qstr	last;
 	struct path	root;
-	struct file	*file;
 	struct inode	*inode; /* path.dentry.d_inode */
 	unsigned int	flags;
 	unsigned	seq;
-- 
cgit v0.10.2


From fe2d35ff0d18a2c93993b0d7d46f846ff4331b72 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Mar 2011 22:58:25 -0500
Subject: switch non-create side of open() to use of do_last()

Instead of path_lookupat() doing trailing symlink resolution,
use the same scheme as on the O_CREAT side.  Walk with
LOOKUP_PARENT, then (in do_last()) look the final component
up, then either open it or return error or, if it's a symlink,
give the symlink back to path_openat() to be resolved there.

The really messy complication here is RCU.  We don't want to drop
out of RCU mode before the final lookup, since we don't want to
bounce parent directory ->d_count without a good reason.

Result is _not_ pretty; later in the series we'll clean it up.
For now we are roughly back where we'd been before the revert
done by Nick's series - top-level logics of path_openat() is
cleaned up, do_last() does actual opening, symlink resolution is
done uniformly.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index a260a30..9595b4a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2178,13 +2178,14 @@ exit:
 }
 
 /*
- * Handle O_CREAT case for do_filp_open
+ * Handle the last step of open()
  */
 static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
+	struct inode *inode;
 	int error;
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -2192,17 +2193,27 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 	switch (nd->last_type) {
 	case LAST_DOTDOT:
-		follow_dotdot(nd);
-		dir = nd->path.dentry;
 	case LAST_DOT:
+		error = handle_dots(nd, nd->last_type);
+		if (error)
+			return ERR_PTR(error);
 		/* fallthrough */
 	case LAST_ROOT:
+		if (nd->flags & LOOKUP_RCU) {
+			if (nameidata_drop_rcu_last(nd))
+				return ERR_PTR(-ECHILD);
+		}
 		error = handle_reval_path(nd);
 		if (error)
 			goto exit;
-		error = -EISDIR;
-		goto exit;
+		audit_inode(pathname, nd->path.dentry);
+		if (op->open_flag & O_CREAT) {
+			error = -EISDIR;
+			goto exit;
+		}
+		goto ok;
 	case LAST_BIND:
+		/* can't be RCU mode here */
 		error = handle_reval_path(nd);
 		if (error)
 			goto exit;
@@ -2210,6 +2221,51 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		goto ok;
 	}
 
+	if (!(op->open_flag & O_CREAT)) {
+		if (nd->last.name[nd->last.len])
+			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+		/* we _can_ be in RCU mode here */
+		error = do_lookup(nd, &nd->last, path, &inode);
+		if (error) {
+			terminate_walk(nd);
+			return ERR_PTR(error);
+		}
+		if (!inode) {
+			path_to_nameidata(path, nd);
+			terminate_walk(nd);
+			return ERR_PTR(-ENOENT);
+		}
+		if (unlikely(inode->i_op->follow_link)) {
+			/* We drop rcu-walk here */
+			if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
+				return ERR_PTR(-ECHILD);
+			return NULL;
+		}
+		path_to_nameidata(path, nd);
+		nd->inode = inode;
+		/* sayonara */
+		if (nd->flags & LOOKUP_RCU) {
+			if (nameidata_drop_rcu_last(nd))
+				return ERR_PTR(-ECHILD);
+		}
+
+		error = -ENOTDIR;
+		if (nd->flags & LOOKUP_DIRECTORY) {
+			if (!inode->i_op->lookup)
+				goto exit;
+		}
+		audit_inode(pathname, nd->path.dentry);
+		goto ok;
+	}
+
+	/* create side of things */
+
+	if (nd->flags & LOOKUP_RCU) {
+		if (nameidata_drop_rcu_last(nd))
+			return ERR_PTR(-ECHILD);
+	}
+
+	audit_inode(pathname, dir);
 	error = -EISDIR;
 	/* trailing slashes? */
 	if (nd->last.name[nd->last.len])
@@ -2303,6 +2359,7 @@ exit:
 static struct file *path_openat(int dfd, const char *pathname,
 		const struct open_flags *op, int flags)
 {
+	struct file *base = NULL;
 	struct file *filp;
 	struct nameidata nd;
 	struct path path;
@@ -2318,39 +2375,15 @@ static struct file *path_openat(int dfd, const char *pathname,
 	nd.intent.open.flags = open_to_namei_flags(op->open_flag);
 	nd.intent.open.create_mode = op->mode;
 
-	if (op->open_flag & O_CREAT)
-		goto creat;
-
-	/* !O_CREAT, simple open */
-	error = path_lookupat(dfd, pathname, flags | op->intent, &nd);
+	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, &nd, &base);
 	if (unlikely(error))
 		goto out_filp;
-	error = -ELOOP;
-	if (!(nd.flags & LOOKUP_FOLLOW)) {
-		if (nd.inode->i_op->follow_link)
-			goto out_path;
-	}
-	error = -ENOTDIR;
-	if (nd.flags & LOOKUP_DIRECTORY) {
-		if (!nd.inode->i_op->lookup)
-			goto out_path;
-	}
-	audit_inode(pathname, nd.path.dentry);
-	filp = finish_open(&nd, op->open_flag, op->acc_mode);
-	release_open_intent(&nd);
-	return filp;
 
-creat:
-	/* OK, have to create the file. Find the parent. */
-	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
+	current->total_link_count = 0;
+	error = link_path_walk(pathname, &nd);
 	if (unlikely(error))
 		goto out_filp;
-	if (unlikely(!audit_dummy_context()))
-		audit_inode(pathname, nd.path.dentry);
 
-	/*
-	 * We have the parent and last component.
-	 */
 	filp = do_last(&nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
@@ -2386,12 +2419,13 @@ creat:
 out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
+	if (base)
+		fput(base);
 	release_open_intent(&nd);
 	return filp;
 
 exit_dput:
 	path_put_conditional(&path, &nd);
-out_path:
 	path_put(&nd.path);
 out_filp:
 	filp = ERR_PTR(error);
-- 
cgit v0.10.2


From 6a96ba54418be740303765c0f52be028573cb99a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Mar 2011 23:49:20 -0500
Subject: kill __lookup_one_len()

only one caller left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 9595b4a..f6f3ef4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1759,28 +1759,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
 
-static int __lookup_one_len(const char *name, struct qstr *this,
-		struct dentry *base, int len)
-{
-	unsigned long hash;
-	unsigned int c;
-
-	this->name = name;
-	this->len = len;
-	if (!len)
-		return -EACCES;
-
-	hash = init_name_hash();
-	while (len--) {
-		c = *(const unsigned char *)name++;
-		if (c == '/' || c == '\0')
-			return -EACCES;
-		hash = partial_name_hash(c, hash);
-	}
-	this->hash = end_name_hash(hash);
-	return 0;
-}
-
 /**
  * lookup_one_len - filesystem helper to lookup single pathname component
  * @name:	pathname component to lookup
@@ -1794,14 +1772,25 @@ static int __lookup_one_len(const char *name, struct qstr *this,
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
-	int err;
 	struct qstr this;
+	unsigned long hash;
+	unsigned int c;
 
 	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
 
-	err = __lookup_one_len(name, &this, base, len);
-	if (err)
-		return ERR_PTR(err);
+	this.name = name;
+	this.len = len;
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	hash = init_name_hash();
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+		hash = partial_name_hash(c, hash);
+	}
+	this.hash = end_name_hash(hash);
 
 	return __lookup_hash(&this, base, NULL);
 }
-- 
cgit v0.10.2


From 5a202bcd75bbd2397136397961babbd8463416af Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Mar 2011 14:17:44 -0500
Subject: sanitize pathname component hash calculation

Lift it to lookup_one_len() and link_path_walk() resp. into the
same place where we calculated default hash function of the same
name.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index f6f3ef4..d1a5dfe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1217,16 +1217,6 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	int err;
 
 	/*
-	 * See if the low-level filesystem might want
-	 * to use its own hash..
-	 */
-	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-		err = parent->d_op->d_hash(parent, nd->inode, name);
-		if (err < 0)
-			return err;
-	}
-
-	/*
 	 * Rename seqlock is not required here because in the off chance
 	 * of a false negative due to a concurrent rename, we're going to
 	 * do the non-racy lookup, below.
@@ -1414,8 +1404,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			case 1:
 				type = LAST_DOT;
 		}
-		if (likely(type == LAST_NORM))
+		if (likely(type == LAST_NORM)) {
+			struct dentry *parent = nd->path.dentry;
 			nd->flags &= ~LOOKUP_JUMPED;
+			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+				err = parent->d_op->d_hash(parent, nd->inode,
+							   &this);
+				if (err < 0)
+					break;
+			}
+		}
 
 		/* remove trailing slashes? */
 		if (!c)
@@ -1723,17 +1721,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
 		return ERR_PTR(err);
 
 	/*
-	 * See if the low-level filesystem might want
-	 * to use its own hash..
-	 */
-	if (base->d_flags & DCACHE_OP_HASH) {
-		err = base->d_op->d_hash(base, inode, name);
-		dentry = ERR_PTR(err);
-		if (err < 0)
-			goto out;
-	}
-
-	/*
 	 * Don't bother with __d_lookup: callers are for creat as
 	 * well as unlink, so a lot of the time it would cost
 	 * a double lookup.
@@ -1745,7 +1732,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
 
 	if (!dentry)
 		dentry = d_alloc_and_lookup(base, name, nd);
-out:
+
 	return dentry;
 }
 
@@ -1791,6 +1778,15 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 		hash = partial_name_hash(c, hash);
 	}
 	this.hash = end_name_hash(hash);
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, base->d_inode, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
 
 	return __lookup_hash(&this, base, NULL);
 }
-- 
cgit v0.10.2


From 0f9d1a10c341020617e5b1c7f9c16f6a070438ec Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:13:14 -0500
Subject: expand finish_open() in its only caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index d1a5dfe..1f561dc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2111,57 +2111,6 @@ static int open_will_truncate(int flag, struct inode *inode)
 	return (flag & O_TRUNC);
 }
 
-static struct file *finish_open(struct nameidata *nd,
-				int open_flag, int acc_mode)
-{
-	struct file *filp;
-	int will_truncate;
-	int error;
-
-	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
-	if (will_truncate) {
-		error = mnt_want_write(nd->path.mnt);
-		if (error)
-			goto exit;
-	}
-	error = may_open(&nd->path, acc_mode, open_flag);
-	if (error) {
-		if (will_truncate)
-			mnt_drop_write(nd->path.mnt);
-		goto exit;
-	}
-	filp = nameidata_to_filp(nd);
-	if (!IS_ERR(filp)) {
-		error = ima_file_check(filp, acc_mode);
-		if (error) {
-			fput(filp);
-			filp = ERR_PTR(error);
-		}
-	}
-	if (!IS_ERR(filp)) {
-		if (will_truncate) {
-			error = handle_truncate(filp);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-	}
-	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
-	 */
-	if (will_truncate)
-		mnt_drop_write(nd->path.mnt);
-	path_put(&nd->path);
-	return filp;
-
-exit:
-	path_put(&nd->path);
-	return ERR_PTR(error);
-}
-
 /*
  * Handle the last step of open()
  */
@@ -2169,6 +2118,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	int will_truncate;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2329,7 +2279,43 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	filp = finish_open(nd, op->open_flag, op->acc_mode);
+	will_truncate = open_will_truncate(op->open_flag, nd->path.dentry->d_inode);
+	if (will_truncate) {
+		error = mnt_want_write(nd->path.mnt);
+		if (error)
+			goto exit;
+	}
+	error = may_open(&nd->path, op->acc_mode, op->open_flag);
+	if (error) {
+		if (will_truncate)
+			mnt_drop_write(nd->path.mnt);
+		goto exit;
+	}
+	filp = nameidata_to_filp(nd);
+	if (!IS_ERR(filp)) {
+		error = ima_file_check(filp, op->acc_mode);
+		if (error) {
+			fput(filp);
+			filp = ERR_PTR(error);
+		}
+	}
+	if (!IS_ERR(filp)) {
+		if (will_truncate) {
+			error = handle_truncate(filp);
+			if (error) {
+				fput(filp);
+				filp = ERR_PTR(error);
+			}
+		}
+	}
+	/*
+	 * It is now safe to drop the mnt write
+	 * because the filp has had a write taken
+	 * on its behalf.
+	 */
+	if (will_truncate)
+		mnt_drop_write(nd->path.mnt);
+	path_put(&nd->path);
 	return filp;
 
 exit_mutex_unlock:
-- 
cgit v0.10.2


From 9b44f1b3928b6f41532c9a1dc9a6fc665989ad5b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:17:27 -0500
Subject: move may_open() from __open_name_create() to do_last()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 1f561dc..def63e7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2069,11 +2069,7 @@ out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(nd->path.dentry);
 	nd->path.dentry = path->dentry;
-
-	if (error)
-		return error;
-	/* Don't check for write permission, don't truncate */
-	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
+	return error;
 }
 
 /*
@@ -2239,6 +2235,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			mnt_drop_write(nd->path.mnt);
 			goto exit;
 		}
+		/* Don't check for write permission, don't truncate */
+		error = may_open(&nd->path, 0, op->open_flag & ~O_TRUNC);
+		if (error) {
+			mnt_drop_write(nd->path.mnt);
+			goto exit;
+		}
 		filp = nameidata_to_filp(nd);
 		mnt_drop_write(nd->path.mnt);
 		path_put(&nd->path);
-- 
cgit v0.10.2


From ca344a894b41a133dab07dfbbdf652c053f6658c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:36:45 -0500
Subject: do_last: unify may_open() call and everyting after it

We have a bunch of diverging codepaths in do_last(); some of
them converge, but the case of having to create a new file
duplicates large part of common tail of the rest and exits
separately.  Massage them so that they could be merged.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index def63e7..6384477 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2114,7 +2114,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	int open_flag = op->open_flag;
 	int will_truncate;
+	int want_write = 0;
+	int skip_perm = 0;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2138,7 +2141,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (error)
 			goto exit;
 		audit_inode(pathname, nd->path.dentry);
-		if (op->open_flag & O_CREAT) {
+		if (open_flag & O_CREAT) {
 			error = -EISDIR;
 			goto exit;
 		}
@@ -2152,7 +2155,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		goto ok;
 	}
 
-	if (!(op->open_flag & O_CREAT)) {
+	if (!(open_flag & O_CREAT)) {
 		if (nd->last.name[nd->last.len])
 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 		/* we _can_ be in RCU mode here */
@@ -2230,28 +2233,15 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(nd, path, op->open_flag, op->mode);
-		if (error) {
-			mnt_drop_write(nd->path.mnt);
+		want_write = 1;
+		will_truncate = 0;
+		error = __open_namei_create(nd, path, open_flag, op->mode);
+		if (error)
 			goto exit;
-		}
 		/* Don't check for write permission, don't truncate */
-		error = may_open(&nd->path, 0, op->open_flag & ~O_TRUNC);
-		if (error) {
-			mnt_drop_write(nd->path.mnt);
-			goto exit;
-		}
-		filp = nameidata_to_filp(nd);
-		mnt_drop_write(nd->path.mnt);
-		path_put(&nd->path);
-		if (!IS_ERR(filp)) {
-			error = ima_file_check(filp, op->acc_mode);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-		return filp;
+		open_flag &= ~O_TRUNC;
+		skip_perm = 1;
+		goto common;
 	}
 
 	/*
@@ -2261,7 +2251,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (op->open_flag & O_EXCL)
+	if (open_flag & O_EXCL)
 		goto exit_dput;
 
 	error = follow_managed(path, nd->flags);
@@ -2281,18 +2271,17 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	will_truncate = open_will_truncate(op->open_flag, nd->path.dentry->d_inode);
+	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
 	if (will_truncate) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit;
+		want_write = 1;
 	}
-	error = may_open(&nd->path, op->acc_mode, op->open_flag);
-	if (error) {
-		if (will_truncate)
-			mnt_drop_write(nd->path.mnt);
+common:
+	error = may_open(&nd->path, skip_perm ? 0 : op->acc_mode, open_flag);
+	if (error)
 		goto exit;
-	}
 	filp = nameidata_to_filp(nd);
 	if (!IS_ERR(filp)) {
 		error = ima_file_check(filp, op->acc_mode);
@@ -2310,12 +2299,8 @@ ok:
 			}
 		}
 	}
-	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
-	 */
-	if (will_truncate)
+out:
+	if (want_write)
 		mnt_drop_write(nd->path.mnt);
 	path_put(&nd->path);
 	return filp;
@@ -2325,8 +2310,8 @@ exit_mutex_unlock:
 exit_dput:
 	path_put_conditional(path, nd);
 exit:
-	path_put(&nd->path);
-	return ERR_PTR(error);
+	filp = ERR_PTR(error);
+	goto out;
 }
 
 static struct file *path_openat(int dfd, const char *pathname,
-- 
cgit v0.10.2


From 6c0d46c493217cf48999b3f8808910ae534aa085 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:59:59 -0500
Subject: fold __open_namei_create() and open_will_truncate() into do_last()

... and clean up a bit more

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 6384477..441f110 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2049,30 +2049,6 @@ static int handle_truncate(struct file *filp)
 }
 
 /*
- * Be careful about ever adding any more callers of this
- * function.  Its flags must be in the namei format, not
- * what get passed to sys_open().
- */
-static int __open_namei_create(struct nameidata *nd, struct path *path,
-				int open_flag, int mode)
-{
-	int error;
-	struct dentry *dir = nd->path.dentry;
-
-	if (!IS_POSIXACL(dir->d_inode))
-		mode &= ~current_umask();
-	error = security_path_mknod(&nd->path, path->dentry, mode, 0);
-	if (error)
-		goto out_unlock;
-	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
-out_unlock:
-	mutex_unlock(&dir->d_inode->i_mutex);
-	dput(nd->path.dentry);
-	nd->path.dentry = path->dentry;
-	return error;
-}
-
-/*
  * Note that while the flag value (low two bits) for sys_open means:
  *	00 - read-only
  *	01 - write-only
@@ -2096,17 +2072,6 @@ static inline int open_to_namei_flags(int flag)
 	return flag;
 }
 
-static int open_will_truncate(int flag, struct inode *inode)
-{
-	/*
-	 * We'll never write to the fs underlying
-	 * a device file.
-	 */
-	if (special_file(inode->i_mode))
-		return 0;
-	return (flag & O_TRUNC);
-}
-
 /*
  * Handle the last step of open()
  */
@@ -2114,8 +2079,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	struct dentry *dentry;
 	int open_flag = op->open_flag;
-	int will_truncate;
+	int will_truncate = open_flag & O_TRUNC;
 	int want_write = 0;
 	int skip_perm = 0;
 	struct file *filp;
@@ -2207,25 +2173,29 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 	mutex_lock(&dir->d_inode->i_mutex);
 
-	path->dentry = lookup_hash(nd);
-	path->mnt = nd->path.mnt;
-
-	error = PTR_ERR(path->dentry);
-	if (IS_ERR(path->dentry)) {
+	dentry = lookup_hash(nd);
+	error = PTR_ERR(dentry);
+	if (IS_ERR(dentry)) {
 		mutex_unlock(&dir->d_inode->i_mutex);
 		goto exit;
 	}
 
+	path->dentry = dentry;
+	path->mnt = nd->path.mnt;
+
 	if (IS_ERR(nd->intent.open.file)) {
 		error = PTR_ERR(nd->intent.open.file);
 		goto exit_mutex_unlock;
 	}
 
 	/* Negative dentry, just create the file */
-	if (!path->dentry->d_inode) {
+	if (!dentry->d_inode) {
+		int mode = op->mode;
+		if (!IS_POSIXACL(dir->d_inode))
+			mode &= ~current_umask();
 		/*
 		 * This write is needed to ensure that a
-		 * ro->rw transition does not occur between
+		 * rw->ro transition does not occur between
 		 * the time when the file is created and when
 		 * a permanent write count is taken through
 		 * the 'struct file' in nameidata_to_filp().
@@ -2234,13 +2204,19 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (error)
 			goto exit_mutex_unlock;
 		want_write = 1;
-		will_truncate = 0;
-		error = __open_namei_create(nd, path, open_flag, op->mode);
-		if (error)
-			goto exit;
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
+		will_truncate = 0;
 		skip_perm = 1;
+		error = security_path_mknod(&nd->path, dentry, mode, 0);
+		if (error)
+			goto exit_mutex_unlock;
+		error = vfs_create(dir->d_inode, dentry, mode, nd);
+		if (error)
+			goto exit_mutex_unlock;
+		mutex_unlock(&dir->d_inode->i_mutex);
+		dput(nd->path.dentry);
+		nd->path.dentry = dentry;
 		goto common;
 	}
 
@@ -2271,7 +2247,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
+	if (!S_ISREG(nd->inode->i_mode))
+		will_truncate = 0;
+
 	if (will_truncate) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
-- 
cgit v0.10.2


From f374ed5fa8afed8590deaae5dc147422e0e1a6d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 01:34:45 -0500
Subject: do_last: kill a rudiment of old ->d_revalidate() workaround

There used to be time when ->d_revalidate() couldn't return an error.
So intents code had lookup_instantiate_filp() stash ERR_PTR(error)
in nd->intent.open.filp and had it checked after lookup_hash(), to
catch the otherwise silent failures.  That had been introduced by
commit 4af4c52f34606bdaab6930a845550c6fb02078a4.  These days
->d_revalidate() can and does propagate errors back to callers
explicitly, so this check isn't needed anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 441f110..6972e76 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2183,11 +2183,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	path->dentry = dentry;
 	path->mnt = nd->path.mnt;
 
-	if (IS_ERR(nd->intent.open.file)) {
-		error = PTR_ERR(nd->intent.open.file);
-		goto exit_mutex_unlock;
-	}
-
 	/* Negative dentry, just create the file */
 	if (!dentry->d_inode) {
 		int mode = op->mode;
-- 
cgit v0.10.2


From 40b39136f07279fdc868a36cba050f4e84ce0ace Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 16:22:18 -0500
Subject: path_openat: clean ELOOP handling a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 6972e76..ca9a06a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2320,11 +2320,12 @@ static struct file *path_openat(int dfd, const char *pathname,
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		error = -ELOOP;
-		if (!(nd.flags & LOOKUP_FOLLOW))
-			goto exit_dput;
-		if (count++ == 32)
-			goto exit_dput;
+		if (!(nd.flags & LOOKUP_FOLLOW) || count++ == 32) {
+			path_put_conditional(&path, &nd);
+			path_put(&nd.path);
+			filp = ERR_PTR(-ELOOP);
+			break;
+		}
 		/*
 		 * This is subtle. Instead of calling do_follow_link() we do
 		 * the thing by hands. The reason is that this way we have zero
@@ -2355,9 +2356,6 @@ out:
 	release_open_intent(&nd);
 	return filp;
 
-exit_dput:
-	path_put_conditional(&path, &nd);
-	path_put(&nd.path);
 out_filp:
 	filp = ERR_PTR(error);
 	goto out;
-- 
cgit v0.10.2


From 5a18fff2090c3af830d699c8ccb230498a1e37e5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 11 Mar 2011 04:44:53 -0500
Subject: untangle do_lookup()

That thing has devolved into rats nest of gotos; sane use of unlikely()
gets rid of that horror and gives much more readable structure:
	* make a fast attempt to find a dentry; false negatives are OK.
In RCU mode if everything went fine, we are done, otherwise just drop
out of RCU.  If we'd done (RCU) ->d_revalidate() and it had not refused
outright (i.e. didn't give us -ECHILD), remember its result.
	* now we are not in RCU mode and hopefully have a dentry.  If we
do not, lock parent, do full d_lookup() and if that has not found anything,
allocate and call ->lookup().  If we'd done that ->lookup(), remember that
dentry is good and we don't need to revalidate it.
	* now we have a dentry.  If it has ->d_revalidate() and we can't
skip it, call it.
	* hopefully dentry is good; if not, either fail (in case of error)
or try to invalidate it.  If d_invalidate() has succeeded, drop it and
retry everything as if original attempt had not found a dentry.
	* now we can finish it up - deal with mountpoint crossing and
automount.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index ca9a06a..0bebd13 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -589,29 +589,6 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return dentry;
 }
 
-static inline struct dentry *
-do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
-{
-	int status = d_revalidate(dentry, nd);
-	if (likely(status > 0))
-		return dentry;
-	if (status == -ECHILD) {
-		if (nameidata_dentry_drop_rcu(nd, dentry))
-			return ERR_PTR(-ECHILD);
-		return do_revalidate(dentry, nd);
-	}
-	if (status < 0)
-		return ERR_PTR(status);
-	/* Don't d_invalidate in rcu-walk mode */
-	if (nameidata_dentry_drop_rcu(nd, dentry))
-		return ERR_PTR(-ECHILD);
-	if (!d_invalidate(dentry)) {
-		dput(dentry);
-		dentry = NULL;
-	}
-	return dentry;
-}
-
 /*
  * handle_reval_path - force revalidation of a dentry
  *
@@ -1213,7 +1190,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 {
 	struct vfsmount *mnt = nd->path.mnt;
 	struct dentry *dentry, *parent = nd->path.dentry;
-	struct inode *dir;
+	int need_reval = 1;
+	int status = 1;
 	int err;
 
 	/*
@@ -1223,48 +1201,74 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	 */
 	if (nd->flags & LOOKUP_RCU) {
 		unsigned seq;
-
 		*inode = nd->inode;
 		dentry = __d_lookup_rcu(parent, name, &seq, inode);
-		if (!dentry) {
-			if (nameidata_drop_rcu(nd))
-				return -ECHILD;
-			goto need_lookup;
-		}
+		if (!dentry)
+			goto unlazy;
+
 		/* Memory barrier in read_seqcount_begin of child is enough */
 		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
 			return -ECHILD;
-
 		nd->seq = seq;
+
 		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-			dentry = do_revalidate_rcu(dentry, nd);
-			if (!dentry)
-				goto need_lookup;
-			if (IS_ERR(dentry))
-				goto fail;
-			if (!(nd->flags & LOOKUP_RCU))
-				goto done;
+			status = d_revalidate(dentry, nd);
+			if (unlikely(status <= 0)) {
+				if (status != -ECHILD)
+					need_reval = 0;
+				goto unlazy;
+			}
 		}
 		path->mnt = mnt;
 		path->dentry = dentry;
 		if (likely(__follow_mount_rcu(nd, path, inode, false)))
 			return 0;
-		if (nameidata_drop_rcu(nd))
-			return -ECHILD;
-		/* fallthru */
+unlazy:
+		if (dentry) {
+			if (nameidata_dentry_drop_rcu(nd, dentry))
+				return -ECHILD;
+		} else {
+			if (nameidata_drop_rcu(nd))
+				return -ECHILD;
+		}
+	} else {
+		dentry = __d_lookup(parent, name);
 	}
-	dentry = __d_lookup(parent, name);
-	if (!dentry)
-		goto need_lookup;
-found:
-	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-		dentry = do_revalidate(dentry, nd);
-		if (!dentry)
-			goto need_lookup;
-		if (IS_ERR(dentry))
-			goto fail;
+
+retry:
+	if (unlikely(!dentry)) {
+		struct inode *dir = parent->d_inode;
+		BUG_ON(nd->inode != dir);
+
+		mutex_lock(&dir->i_mutex);
+		dentry = d_lookup(parent, name);
+		if (likely(!dentry)) {
+			dentry = d_alloc_and_lookup(parent, name, nd);
+			if (IS_ERR(dentry)) {
+				mutex_unlock(&dir->i_mutex);
+				return PTR_ERR(dentry);
+			}
+			/* known good */
+			need_reval = 0;
+			status = 1;
+		}
+		mutex_unlock(&dir->i_mutex);
+	}
+	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
+		status = d_revalidate(dentry, nd);
+	if (unlikely(status <= 0)) {
+		if (status < 0) {
+			dput(dentry);
+			return status;
+		}
+		if (!d_invalidate(dentry)) {
+			dput(dentry);
+			dentry = NULL;
+			need_reval = 1;
+			goto retry;
+		}
 	}
-done:
+
 	path->mnt = mnt;
 	path->dentry = dentry;
 	err = follow_managed(path, nd->flags);
@@ -1274,39 +1278,6 @@ done:
 	}
 	*inode = path->dentry->d_inode;
 	return 0;
-
-need_lookup:
-	dir = parent->d_inode;
-	BUG_ON(nd->inode != dir);
-
-	mutex_lock(&dir->i_mutex);
-	/*
-	 * First re-do the cached lookup just in case it was created
-	 * while we waited for the directory semaphore, or the first
-	 * lookup failed due to an unrelated rename.
-	 *
-	 * This could use version numbering or similar to avoid unnecessary
-	 * cache lookups, but then we'd have to do the first lookup in the
-	 * non-racy way. However in the common case here, everything should
-	 * be hot in cache, so would it be a big win?
-	 */
-	dentry = d_lookup(parent, name);
-	if (likely(!dentry)) {
-		dentry = d_alloc_and_lookup(parent, name, nd);
-		mutex_unlock(&dir->i_mutex);
-		if (IS_ERR(dentry))
-			goto fail;
-		goto done;
-	}
-	/*
-	 * Uhhuh! Nasty case: the cache was re-populated while
-	 * we waited on the semaphore. Need to revalidate.
-	 */
-	mutex_unlock(&dir->i_mutex);
-	goto found;
-
-fail:
-	return PTR_ERR(dentry);
 }
 
 static inline int may_lookup(struct nameidata *nd)
-- 
cgit v0.10.2


From 5b6ca027d85b7438c84b78a54ccdc2e53f2909cd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 23:04:47 -0500
Subject: reduce vfs_path_lookup() to do_path_lookup()

New lookup flag: LOOKUP_ROOT.  nd->root is set (and held) by caller,
path_init() starts walking from that place and all pathname resolution
machinery never drops nd->root if that flag is set.  That turns
vfs_path_lookup() into a special case of do_path_lookup() *and*
gets us down to 3 callers of link_path_walk(), making it finally
feasible to rip the handling of trailing symlink out of link_path_walk().
That will not only simply the living hell out of it, but make life
much simpler for unionfs merge.  Trailing symlink handling will
become iterative, which is a good thing for stack footprint in
a lot of situations as well.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 0bebd13..8ee7785 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -401,9 +401,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 {
 	struct fs_struct *fs = current->fs;
 	struct dentry *dentry = nd->path.dentry;
+	int want_root = 0;
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	if (nd->root.mnt) {
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		want_root = 1;
 		spin_lock(&fs->lock);
 		if (nd->root.mnt != fs->root.mnt ||
 				nd->root.dentry != fs->root.dentry)
@@ -414,7 +416,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 		goto err;
 	BUG_ON(nd->inode != dentry->d_inode);
 	spin_unlock(&dentry->d_lock);
-	if (nd->root.mnt) {
+	if (want_root) {
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
 	}
@@ -427,7 +429,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 err:
 	spin_unlock(&dentry->d_lock);
 err_root:
-	if (nd->root.mnt)
+	if (want_root)
 		spin_unlock(&fs->lock);
 	return -ECHILD;
 }
@@ -454,9 +456,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 {
 	struct fs_struct *fs = current->fs;
 	struct dentry *parent = nd->path.dentry;
+	int want_root = 0;
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	if (nd->root.mnt) {
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		want_root = 1;
 		spin_lock(&fs->lock);
 		if (nd->root.mnt != fs->root.mnt ||
 				nd->root.dentry != fs->root.dentry)
@@ -476,7 +480,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 	parent->d_count++;
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
-	if (nd->root.mnt) {
+	if (want_root) {
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
 	}
@@ -490,7 +494,7 @@ err:
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
 err_root:
-	if (nd->root.mnt)
+	if (want_root)
 		spin_unlock(&fs->lock);
 	return -ECHILD;
 }
@@ -501,7 +505,8 @@ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct d
 	if (nd->flags & LOOKUP_RCU) {
 		if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
 			nd->flags &= ~LOOKUP_RCU;
-			nd->root.mnt = NULL;
+			if (!(nd->flags & LOOKUP_ROOT))
+				nd->root.mnt = NULL;
 			rcu_read_unlock();
 			br_read_unlock(vfsmount_lock);
 			return -ECHILD;
@@ -525,7 +530,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 	nd->flags &= ~LOOKUP_RCU;
-	nd->root.mnt = NULL;
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
 	spin_lock(&dentry->d_lock);
 	if (!__d_rcu_to_refcount(dentry, nd->seq))
 		goto err_unlock;
@@ -1053,7 +1059,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 
 failed:
 	nd->flags &= ~LOOKUP_RCU;
-	nd->root.mnt = NULL;
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
 	rcu_read_unlock();
 	br_read_unlock(vfsmount_lock);
 	return -ECHILD;
@@ -1310,7 +1317,8 @@ static void terminate_walk(struct nameidata *nd)
 		path_put(&nd->path);
 	} else {
 		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
+		if (!(nd->flags & LOOKUP_ROOT))
+			nd->root.mnt = NULL;
 		rcu_read_unlock();
 		br_read_unlock(vfsmount_lock);
 	}
@@ -1477,6 +1485,25 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
+	if (flags & LOOKUP_ROOT) {
+		struct inode *inode = nd->root.dentry->d_inode;
+		if (!inode->i_op->lookup)
+			return -ENOTDIR;
+		retval = inode_permission(inode, MAY_EXEC);
+		if (retval)
+			return retval;
+		nd->path = nd->root;
+		nd->inode = inode;
+		if (flags & LOOKUP_RCU) {
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+		} else {
+			path_get(&nd->path);
+		}
+		return 0;
+	}
+
 	nd->root.mnt = NULL;
 
 	if (*name=='/') {
@@ -1587,7 +1614,7 @@ static int path_lookupat(int dfd, const char *name,
 	if (base)
 		fput(base);
 
-	if (nd->root.mnt) {
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
@@ -1638,46 +1665,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		    const char *name, unsigned int flags,
 		    struct nameidata *nd)
 {
-	int result;
-
-	/* same as do_path_lookup */
-	nd->last_type = LAST_ROOT;
-	nd->flags = flags | LOOKUP_JUMPED;
-	nd->depth = 0;
-
-	nd->path.dentry = dentry;
-	nd->path.mnt = mnt;
-	path_get(&nd->path);
-	nd->root = nd->path;
-	path_get(&nd->root);
-	nd->inode = nd->path.dentry->d_inode;
-
-	current->total_link_count = 0;
-
-	result = link_path_walk(name, nd);
-	if (!result)
-		result = handle_reval_path(nd);
-	if (result == -ESTALE) {
-		/* nd->path had been dropped */
-		current->total_link_count = 0;
-		nd->path.dentry = dentry;
-		nd->path.mnt = mnt;
-		nd->inode = dentry->d_inode;
-		path_get(&nd->path);
-		nd->flags = flags | LOOKUP_JUMPED | LOOKUP_REVAL;
-
-		result = link_path_walk(name, nd);
-		if (!result)
-			result = handle_reval_path(nd);
-	}
-	if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
-				nd->inode))
-		audit_inode(name, nd->path.dentry);
-
-	path_put(&nd->root);
-	nd->root.mnt = NULL;
-
-	return result;
+	nd->root.dentry = dentry;
+	nd->root.mnt = mnt;
+	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
+	return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
@@ -2320,7 +2311,7 @@ static struct file *path_openat(int dfd, const char *pathname,
 		path_put(&link);
 	}
 out:
-	if (nd.root.mnt)
+	if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
 		path_put(&nd.root);
 	if (base)
 		fput(base);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 72ffd62..83cd6e5 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -63,6 +63,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_RENAME_TARGET	0x0800
 
 #define LOOKUP_JUMPED		0x1000
+#define LOOKUP_ROOT		0x2000
 
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
-- 
cgit v0.10.2


From 73d049a40fc6269189c4e2ba6792cb5dd054883c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 11 Mar 2011 12:08:24 -0500
Subject: open-style analog of vfs_path_lookup()

new function: file_open_root(dentry, mnt, name, flags) opens the file
vfs_path_lookup would arrive to.

Note that name can be empty; in that case the usual requirement that
dentry should be a directory is lifted.

open-coded equivalents switched to it, may_open() got down exactly
one caller and became static.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 975613b..c70e047 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -124,35 +124,18 @@ void mconsole_log(struct mc_request *req)
 #if 0
 void mconsole_proc(struct mc_request *req)
 {
-	struct nameidata nd;
 	struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
 	struct file *file;
-	int n, err;
+	int n;
 	char *ptr = req->request.data, *buf;
 	mm_segment_t old_fs = get_fs();
 
 	ptr += strlen("proc");
 	ptr = skip_spaces(ptr);
 
-	err = vfs_path_lookup(mnt->mnt_root, mnt, ptr, LOOKUP_FOLLOW, &nd);
-	if (err) {
-		mconsole_reply(req, "Failed to look up file", 1, 0);
-		goto out;
-	}
-
-	err = may_open(&nd.path, MAY_READ, O_RDONLY);
-	if (result) {
-		mconsole_reply(req, "Failed to open file", 1, 0);
-		path_put(&nd.path);
-		goto out;
-	}
-
-	file = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY,
-			   current_cred());
-	err = PTR_ERR(file);
+	file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY);
 	if (IS_ERR(file)) {
 		mconsole_reply(req, "Failed to open file", 1, 0);
-		path_put(&nd.path);
 		goto out;
 	}
 
diff --git a/fs/internal.h b/fs/internal.h
index 6fdbdf2..52abc52 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -114,6 +114,8 @@ struct open_flags {
 };
 extern struct file *do_filp_open(int dfd, const char *pathname,
 		const struct open_flags *op, int lookup_flags);
+extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
+		const char *, const struct open_flags *, int lookup_flags);
 
 /*
  * inode.c
diff --git a/fs/namei.c b/fs/namei.c
index 8ee7785..abc8d2d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1487,11 +1487,13 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	nd->depth = 0;
 	if (flags & LOOKUP_ROOT) {
 		struct inode *inode = nd->root.dentry->d_inode;
-		if (!inode->i_op->lookup)
-			return -ENOTDIR;
-		retval = inode_permission(inode, MAY_EXEC);
-		if (retval)
-			return retval;
+		if (*name) {
+			if (!inode->i_op->lookup)
+				return -ENOTDIR;
+			retval = inode_permission(inode, MAY_EXEC);
+			if (retval)
+				return retval;
+		}
 		nd->path = nd->root;
 		nd->inode = inode;
 		if (flags & LOOKUP_RCU) {
@@ -1937,7 +1939,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	return error;
 }
 
-int may_open(struct path *path, int acc_mode, int flag)
+static int may_open(struct path *path, int acc_mode, int flag)
 {
 	struct dentry *dentry = path->dentry;
 	struct inode *inode = dentry->d_inode;
@@ -2250,11 +2252,10 @@ exit:
 }
 
 static struct file *path_openat(int dfd, const char *pathname,
-		const struct open_flags *op, int flags)
+		struct nameidata *nd, const struct open_flags *op, int flags)
 {
 	struct file *base = NULL;
 	struct file *filp;
-	struct nameidata nd;
 	struct path path;
 	int count = 0;
 	int error;
@@ -2264,27 +2265,27 @@ static struct file *path_openat(int dfd, const char *pathname,
 		return ERR_PTR(-ENFILE);
 
 	filp->f_flags = op->open_flag;
-	nd.intent.open.file = filp;
-	nd.intent.open.flags = open_to_namei_flags(op->open_flag);
-	nd.intent.open.create_mode = op->mode;
+	nd->intent.open.file = filp;
+	nd->intent.open.flags = open_to_namei_flags(op->open_flag);
+	nd->intent.open.create_mode = op->mode;
 
-	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, &nd, &base);
+	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
 	if (unlikely(error))
 		goto out_filp;
 
 	current->total_link_count = 0;
-	error = link_path_walk(pathname, &nd);
+	error = link_path_walk(pathname, nd);
 	if (unlikely(error))
 		goto out_filp;
 
-	filp = do_last(&nd, &path, op, pathname);
+	filp = do_last(nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		if (!(nd.flags & LOOKUP_FOLLOW) || count++ == 32) {
-			path_put_conditional(&path, &nd);
-			path_put(&nd.path);
+		if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) {
+			path_put_conditional(&path, nd);
+			path_put(&nd->path);
 			filp = ERR_PTR(-ELOOP);
 			break;
 		}
@@ -2299,23 +2300,23 @@ static struct file *path_openat(int dfd, const char *pathname,
 		 * have to putname() it when we are done. Procfs-like symlinks
 		 * just set LAST_BIND.
 		 */
-		nd.flags |= LOOKUP_PARENT;
-		nd.flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-		error = __do_follow_link(&link, &nd, &cookie);
+		nd->flags |= LOOKUP_PARENT;
+		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+		error = __do_follow_link(&link, nd, &cookie);
 		if (unlikely(error))
 			filp = ERR_PTR(error);
 		else
-			filp = do_last(&nd, &path, op, pathname);
+			filp = do_last(nd, &path, op, pathname);
 		if (!IS_ERR(cookie) && linki->i_op->put_link)
-			linki->i_op->put_link(link.dentry, &nd, cookie);
+			linki->i_op->put_link(link.dentry, nd, cookie);
 		path_put(&link);
 	}
 out:
-	if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
-		path_put(&nd.root);
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
+		path_put(&nd->root);
 	if (base)
 		fput(base);
-	release_open_intent(&nd);
+	release_open_intent(nd);
 	return filp;
 
 out_filp:
@@ -2326,16 +2327,39 @@ out_filp:
 struct file *do_filp_open(int dfd, const char *pathname,
 		const struct open_flags *op, int flags)
 {
+	struct nameidata nd;
 	struct file *filp;
 
-	filp = path_openat(dfd, pathname, op, flags | LOOKUP_RCU);
+	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
 	if (unlikely(filp == ERR_PTR(-ECHILD)))
-		filp = path_openat(dfd, pathname, op, flags);
+		filp = path_openat(dfd, pathname, &nd, op, flags);
 	if (unlikely(filp == ERR_PTR(-ESTALE)))
-		filp = path_openat(dfd, pathname, op, flags | LOOKUP_REVAL);
+		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
 	return filp;
 }
 
+struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+		const char *name, const struct open_flags *op, int flags)
+{
+	struct nameidata nd;
+	struct file *file;
+
+	nd.root.mnt = mnt;
+	nd.root.dentry = dentry;
+
+	flags |= LOOKUP_ROOT;
+
+	if (dentry->d_inode->i_op->follow_link)
+		return ERR_PTR(-ELOOP);
+
+	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
+	if (unlikely(file == ERR_PTR(-ECHILD)))
+		file = path_openat(-1, name, &nd, op, flags);
+	if (unlikely(file == ERR_PTR(-ESTALE)))
+		file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+	return file;
+}
+
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd2..124e8fc 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
 
 static struct file *do_open(char *name, int flags)
 {
-	struct nameidata nd;
 	struct vfsmount *mnt;
-	int error;
+	struct file *file;
 
 	mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
 	if (IS_ERR(mnt))
 		return (struct file *)mnt;
 
-	error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd);
-	mntput(mnt);	/* drop do_kern_mount reference */
-	if (error)
-		return ERR_PTR(error);
-
-	if (flags == O_RDWR)
-		error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
-	else
-		error = may_open(&nd.path, MAY_WRITE, flags);
+	file = file_open_root(mnt->mnt_root, mnt, name, flags);
 
-	if (!error)
-		return dentry_open(nd.path.dentry, nd.path.mnt, flags,
-				   current_cred());
-
-	path_put(&nd.path);
-	return ERR_PTR(error);
+	mntput(mnt);	/* drop do_kern_mount reference */
+	return file;
 }
 
 static struct {
diff --git a/fs/open.c b/fs/open.c
index d05e18c..48afc5c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -959,6 +959,20 @@ struct file *filp_open(const char *filename, int flags, int mode)
 }
 EXPORT_SYMBOL(filp_open);
 
+struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+			    const char *filename, int flags)
+{
+	struct open_flags op;
+	int lookup = build_open_flags(flags, 0, &op);
+	if (flags & O_CREAT)
+		return ERR_PTR(-EINVAL);
+	if (!filename && (flags & O_DIRECTORY))
+		if (!dentry->d_inode->i_op->lookup)
+			return ERR_PTR(-ENOTDIR);
+	return do_file_open_root(dentry, mnt, filename, &op, lookup);
+}
+EXPORT_SYMBOL(file_open_root);
+
 long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 {
 	struct open_flags op;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9c75714..bf5c3c8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1990,6 +1990,8 @@ extern int do_fallocate(struct file *file, int mode, loff_t offset,
 extern long do_sys_open(int dfd, const char __user *filename, int flags,
 			int mode);
 extern struct file *filp_open(const char *, int, int);
+extern struct file *file_open_root(struct dentry *, struct vfsmount *,
+				   const char *, int);
 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
 				 const struct cred *);
 extern int filp_close(struct file *, fl_owner_t id);
@@ -2205,8 +2207,6 @@ extern struct file *create_read_pipe(struct file *f, int flags);
 extern struct file *create_write_pipe(int flags);
 extern void free_write_pipe(struct file *);
 
-extern int may_open(struct path *, int, int);
-
 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
 extern struct file * open_exec(const char *);
  
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bed..3b8e028 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
 {
 	const struct bin_table *table = NULL;
-	struct nameidata nd;
 	struct vfsmount *mnt;
 	struct file *file;
 	ssize_t result;
 	char *pathname;
 	int flags;
-	int acc_mode;
 
 	pathname = sysctl_getname(name, nlen, &table);
 	result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 	/* How should the sysctl be accessed? */
 	if (oldval && oldlen && newval && newlen) {
 		flags = O_RDWR;
-		acc_mode = MAY_READ | MAY_WRITE;
 	} else if (newval && newlen) {
 		flags = O_WRONLY;
-		acc_mode = MAY_WRITE;
 	} else if (oldval && oldlen) {
 		flags = O_RDONLY;
-		acc_mode = MAY_READ;
 	} else {
 		result = 0;
 		goto out_putname;
 	}
 
 	mnt = current->nsproxy->pid_ns->proc_mnt;
-	result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
-	if (result)
-		goto out_putname;
-
-	result = may_open(&nd.path, acc_mode, flags);
-	if (result)
-		goto out_putpath;
-
-	file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
+	file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
 	result = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
 	putname(pathname);
 out:
 	return result;
-
-out_putpath:
-	path_put(&nd.path);
-	goto out_putname;
 }
 
 
-- 
cgit v0.10.2


From c8b91accfa1059d5565443193d89572eca2f5dd6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 12 Mar 2011 10:41:39 -0500
Subject: clean statfs-like syscalls up

New helpers: user_statfs() and fd_statfs(), taking userland pathname and
descriptor resp. and filling struct kstatfs.  Syscalls of statfs family
(native, compat and foreign - osf and hpux on alpha and parisc resp.)
switched to those.  Removes some boilerplate code, simplifies cleanup
on errors...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index fe698b5..376f221 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -230,44 +230,24 @@ linux_to_osf_statfs(struct kstatfs *linux_stat, struct osf_statfs __user *osf_st
 	return copy_to_user(osf_stat, &tmp_stat, bufsiz) ? -EFAULT : 0;
 }
 
-static int
-do_osf_statfs(struct path *path, struct osf_statfs __user *buffer,
-	      unsigned long bufsiz)
+SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
+		struct osf_statfs __user *, buffer, unsigned long, bufsiz)
 {
 	struct kstatfs linux_stat;
-	int error = vfs_statfs(path, &linux_stat);
+	int error = user_statfs(pathname, &linux_stat);
 	if (!error)
 		error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
 	return error;	
 }
 
-SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
-		struct osf_statfs __user *, buffer, unsigned long, bufsiz)
-{
-	struct path path;
-	int retval;
-
-	retval = user_path(pathname, &path);
-	if (!retval) {
-		retval = do_osf_statfs(&path, buffer, bufsiz);
-		path_put(&path);
-	}
-	return retval;
-}
-
 SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd,
 		struct osf_statfs __user *, buffer, unsigned long, bufsiz)
 {
-	struct file *file;
-	int retval;
-
-	retval = -EBADF;
-	file = fget(fd);
-	if (file) {
-		retval = do_osf_statfs(&file->f_path, buffer, bufsiz);
-		fput(file);
-	}
-	return retval;
+	struct kstatfs linux_stat;
+	int error = fd_statfs(fd, &linux_stat);
+	if (!error)
+		error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
+	return error;
 }
 
 /*
diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c
index 3039408..6ab9580 100644
--- a/arch/parisc/hpux/sys_hpux.c
+++ b/arch/parisc/hpux/sys_hpux.c
@@ -185,26 +185,21 @@ struct hpux_statfs {
      int16_t f_pad;
 };
 
-static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf)
+static int do_statfs_hpux(struct kstatfs *st, struct hpux_statfs __user *p)
 {
-	struct kstatfs st;
-	int retval;
-	
-	retval = vfs_statfs(path, &st);
-	if (retval)
-		return retval;
-
-	memset(buf, 0, sizeof(*buf));
-	buf->f_type = st.f_type;
-	buf->f_bsize = st.f_bsize;
-	buf->f_blocks = st.f_blocks;
-	buf->f_bfree = st.f_bfree;
-	buf->f_bavail = st.f_bavail;
-	buf->f_files = st.f_files;
-	buf->f_ffree = st.f_ffree;
-	buf->f_fsid[0] = st.f_fsid.val[0];
-	buf->f_fsid[1] = st.f_fsid.val[1];
-
+	struct hpux_statfs buf;
+	memset(&buf, 0, sizeof(buf));
+	buf.f_type = st->f_type;
+	buf.f_bsize = st->f_bsize;
+	buf.f_blocks = st->f_blocks;
+	buf.f_bfree = st->f_bfree;
+	buf.f_bavail = st->f_bavail;
+	buf.f_files = st->f_files;
+	buf.f_ffree = st->f_ffree;
+	buf.f_fsid[0] = st->f_fsid.val[0];
+	buf.f_fsid[1] = st->f_fsid.val[1];
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
 	return 0;
 }
 
@@ -212,35 +207,19 @@ static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf)
 asmlinkage long hpux_statfs(const char __user *pathname,
 						struct hpux_statfs __user *buf)
 {
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct hpux_statfs tmp;
-		error = do_statfs_hpux(&path, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
+	struct kstatfs st;
+	int error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs_hpux(&st, buf);
 	return error;
 }
 
 asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf)
 {
-	struct file *file;
-	struct hpux_statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs_hpux(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
- out:
+	struct kstatfs st;
+	int error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs_hpux(&st, buf);
 	return error;
 }
 
diff --git a/fs/compat.c b/fs/compat.c
index 691c3fd..a071775 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
  */
 asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
 {
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct kstatfs tmp;
-		error = vfs_statfs(&path, &tmp);
-		if (!error)
-			error = put_compat_statfs(buf, &tmp);
-		path_put(&path);
-	}
+	struct kstatfs tmp;
+	int error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs(buf, &tmp);
 	return error;
 }
 
 asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
 {
-	struct file * file;
 	struct kstatfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs(&file->f_path, &tmp);
+	int error = fd_statfs(fd, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
-	fput(file);
-out:
 	return error;
 }
 
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 
 asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-	struct path path;
+	struct kstatfs tmp;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct kstatfs tmp;
-		error = vfs_statfs(&path, &tmp);
-		if (!error)
-			error = put_compat_statfs64(buf, &tmp);
-		path_put(&path);
-	}
+	error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs64(buf, &tmp);
 	return error;
 }
 
 asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-	struct file * file;
 	struct kstatfs tmp;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs(&file->f_path, &tmp);
+	error = fd_statfs(fd, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
-	fput(file);
-out:
 	return error;
 }
 
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8..8244924 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
 }
 EXPORT_SYMBOL(vfs_statfs);
 
-static int do_statfs_native(struct path *path, struct statfs *buf)
+int user_statfs(const char __user *pathname, struct kstatfs *st)
 {
-	struct kstatfs st;
-	int retval;
+	struct path path;
+	int error = user_path(pathname, &path);
+	if (!error) {
+		error = vfs_statfs(&path, st);
+		path_put(&path);
+	}
+	return error;
+}
 
-	retval = vfs_statfs(path, &st);
-	if (retval)
-		return retval;
+int fd_statfs(int fd, struct kstatfs *st)
+{
+	struct file *file = fget(fd);
+	int error = -EBADF;
+	if (file) {
+		error = vfs_statfs(&file->f_path, st);
+		fput(file);
+	}
+	return error;
+}
 
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
+static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
+{
+	struct statfs buf;
+
+	if (sizeof(buf) == sizeof(*st))
+		memcpy(&buf, st, sizeof(*st));
 	else {
-		if (sizeof buf->f_blocks == 4) {
-			if ((st.f_blocks | st.f_bfree | st.f_bavail |
-			     st.f_bsize | st.f_frsize) &
+		if (sizeof buf.f_blocks == 4) {
+			if ((st->f_blocks | st->f_bfree | st->f_bavail |
+			     st->f_bsize | st->f_frsize) &
 			    0xffffffff00000000ULL)
 				return -EOVERFLOW;
 			/*
 			 * f_files and f_ffree may be -1; it's okay to stuff
 			 * that into 32 bits
 			 */
-			if (st.f_files != -1 &&
-			    (st.f_files & 0xffffffff00000000ULL))
+			if (st->f_files != -1 &&
+			    (st->f_files & 0xffffffff00000000ULL))
 				return -EOVERFLOW;
-			if (st.f_ffree != -1 &&
-			    (st.f_ffree & 0xffffffff00000000ULL))
+			if (st->f_ffree != -1 &&
+			    (st->f_ffree & 0xffffffff00000000ULL))
 				return -EOVERFLOW;
 		}
 
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		buf->f_flags = st.f_flags;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+		buf.f_type = st->f_type;
+		buf.f_bsize = st->f_bsize;
+		buf.f_blocks = st->f_blocks;
+		buf.f_bfree = st->f_bfree;
+		buf.f_bavail = st->f_bavail;
+		buf.f_files = st->f_files;
+		buf.f_ffree = st->f_ffree;
+		buf.f_fsid = st->f_fsid;
+		buf.f_namelen = st->f_namelen;
+		buf.f_frsize = st->f_frsize;
+		buf.f_flags = st->f_flags;
+		memset(buf.f_spare, 0, sizeof(buf.f_spare));
 	}
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
 	return 0;
 }
 
-static int do_statfs64(struct path *path, struct statfs64 *buf)
+static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
 {
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(path, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
+	struct statfs64 buf;
+	if (sizeof(buf) == sizeof(*st))
+		memcpy(&buf, st, sizeof(*st));
 	else {
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		buf->f_flags = st.f_flags;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+		buf.f_type = st->f_type;
+		buf.f_bsize = st->f_bsize;
+		buf.f_blocks = st->f_blocks;
+		buf.f_bfree = st->f_bfree;
+		buf.f_bavail = st->f_bavail;
+		buf.f_files = st->f_files;
+		buf.f_ffree = st->f_ffree;
+		buf.f_fsid = st->f_fsid;
+		buf.f_namelen = st->f_namelen;
+		buf.f_frsize = st->f_frsize;
+		buf.f_flags = st->f_flags;
+		memset(buf.f_spare, 0, sizeof(buf.f_spare));
 	}
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
 	return 0;
 }
 
 SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
 {
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs tmp;
-		error = do_statfs_native(&path, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
+	struct kstatfs st;
+	int error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs_native(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
 {
-	struct path path;
-	long error;
-
+	struct kstatfs st;
+	int error;
 	if (sz != sizeof(*buf))
 		return -EINVAL;
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs64 tmp;
-		error = do_statfs64(&path, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
+	error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs64(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
 {
-	struct file *file;
-	struct statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs_native(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
+	struct kstatfs st;
+	int error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs_native(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
 {
-	struct file *file;
-	struct statfs64 tmp;
+	struct kstatfs st;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs64(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
+	error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs64(&st, buf);
 	return error;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bf5c3c8..b7178b0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1874,6 +1874,8 @@ extern void drop_collected_mounts(struct vfsmount *);
 extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
 			  struct vfsmount *);
 extern int vfs_statfs(struct path *, struct kstatfs *);
+extern int user_statfs(const char __user *, struct kstatfs *);
+extern int fd_statfs(int, struct kstatfs *);
 extern int statfs_by_dentry(struct dentry *, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
-- 
cgit v0.10.2


From 5fe0c2378884e68beb532f5890cc0e3539ac747b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:25 +0530
Subject: exportfs: Return the minimum required handle size

The exportfs encode handle function should return the minimum required
handle size. This helps user to find out the handle size by passing 0
handle size in the first step and then redoing to the call again with
the returned handle size value.

Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ff27d7a..b4ffad8 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 	int len = *max_len;
 	int type;
 
-	if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
-	    (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+	if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+		*max_len = BTRFS_FID_SIZE_CONNECTABLE;
 		return 255;
+	} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+		*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+		return 255;
+	}
 
 	len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
 	type = FILEID_BTRFS_WITHOUT_PARENT;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b68257..cfe5573 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
 	struct inode * inode = dentry->d_inode;
 	int len = *max_len;
 	int type = FILEID_INO32_GEN;
-	
-	if (len < 2 || (connectable && len < 4))
+
+	if (connectable && (len < 4)) {
+		*max_len = 4;
+		return 255;
+	} else if (len < 2) {
+		*max_len = 2;
 		return 255;
+	}
 
 	len = 2;
 	fid->i32.ino = inode->i_ino;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe..0e277ec 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
 	struct inode *inode =  de->d_inode;
 	u32 ipos_h, ipos_m, ipos_l;
 
-	if (len < 5)
+	if (len < 5) {
+		*lenp = 5;
 		return 255; /* no room */
+	}
 
 	ipos_h = MSDOS_I(inode)->i_pos >> 8;
 	ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68c..051b1a0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 	u64 nodeid;
 	u32 generation;
 
-	if (*max_len < len)
+	if (*max_len < len) {
+		*max_len = len;
 		return  255;
+	}
 
 	nodeid = get_fuse_inode(inode)->nodeid;
 	generation = inode->i_generation;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8..b5a5e60 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
 	struct super_block *sb = inode->i_sb;
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	if (*len < GFS2_SMALL_FH_SIZE ||
-	    (connectable && *len < GFS2_LARGE_FH_SIZE))
+	if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
+		*len = GFS2_LARGE_FH_SIZE;
 		return 255;
+	} else if (*len < GFS2_SMALL_FH_SIZE) {
+		*len = GFS2_SMALL_FH_SIZE;
+		return 255;
+	}
 
 	fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
 	fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb..dd4687f 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
 	 * offset of the inode and the upper 16 bits of fh32[1] to
 	 * hold the offset of the parent.
 	 */
-
-	if (len < 3 || (connectable && len < 5))
+	if (connectable && (len < 5)) {
+		*max_len = 5;
+		return 255;
+	} else if (len < 3) {
+		*max_len = 3;
 		return 255;
+	}
 
 	len = 3;
 	fh32[0] = ei->i_iget5_block;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc306..254652a 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 		   dentry->d_name.len, dentry->d_name.name,
 		   fh, len, connectable);
 
-	if (len < 3 || (connectable && len < 6)) {
-		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+	if (connectable && (len < 6)) {
+		*max_len = 6;
+		type = 255;
+		goto bail;
+	} else if (len < 3) {
+		*max_len = 3;
 		type = 255;
 		goto bail;
 	}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036..1bba24b 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
 	struct inode *inode = dentry->d_inode;
 	int maxlen = *lenp;
 
-	if (maxlen < 3)
+	if (need_parent && (maxlen < 5)) {
+		*lenp = 5;
 		return 255;
+	} else if (maxlen < 3) {
+		*lenp = 3;
+		return 255;
+	}
 
 	data[0] = inode->i_ino;
 	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index b7c338d..f1dce84 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1286,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
 	struct fid *fid = (struct fid *)fh;
 	int type = FILEID_UDF_WITHOUT_PARENT;
 
-	if (len < 3 || (connectable && len < 5))
+	if (connectable && (len < 5)) {
+		*lenp = 5;
+		return 255;
+	} else if (len < 3) {
+		*lenp = 3;
 		return 255;
+	}
 
 	*lenp = 3;
 	fid->udf.block = location.logicalBlockNum;
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114d..f4f878f 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
 	 * seven combinations work.  The real answer is "don't use v2".
 	 */
 	len = xfs_fileid_length(fileid_type);
-	if (*max_len < len)
+	if (*max_len < len) {
+		*max_len = len;
 		return 255;
+	}
 	*max_len = len;
 
 	switch (fileid_type) {
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 2802898..65afdfd 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -121,8 +121,10 @@ struct fid {
  *    set, the encode_fh() should store sufficient information so that a good
  *    attempt can be made to find not only the file but also it's place in the
  *    filesystem.   This typically means storing a reference to de->d_parent in
- *    the filehandle fragment.  encode_fh() should return the number of bytes
- *    stored or a negative error code such as %-ENOSPC
+ *    the filehandle fragment.  encode_fh() should return the fileid_type on
+ *    success and on error returns 255 (if the space needed to encode fh is
+ *    greater than @max_len*4 bytes). On error @max_len contains the minimum
+ *    size(in 4 byte unit) needed to encode the file handle.
  *
  * fh_to_dentry:
  *    @fh_to_dentry is given a &struct super_block (@sb) and a file handle
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ee67c9..3437b65 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2144,8 +2144,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
 {
 	struct inode *inode = dentry->d_inode;
 
-	if (*len < 3)
+	if (*len < 3) {
+		*len = 3;
 		return 255;
+	}
 
 	if (inode_unhashed(inode)) {
 		/* Unfortunately insert_inode_hash is not idempotent,
-- 
cgit v0.10.2


From f4cec35b0d4b90d96e3770a3d1e68ea882e7a7c8 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 18 Jan 2011 20:15:21 -0500
Subject: xen/mmu: Add the notion of identity (1-1) mapping.

Our P2M tree structure is a three-level. On the leaf nodes
we set the Machine Frame Number (MFN) of the PFN. What this means
is that when one does: pfn_to_mfn(pfn), which is used when creating
PTE entries, you get the real MFN of the hardware. When Xen sets
up a guest it initially populates a array which has descending
(or ascending) MFN values, as so:

 idx: 0,  1,       2
 [0x290F, 0x290E, 0x290D, ..]

so pfn_to_mfn(2)==0x290D. If you start, restart many guests that list
starts looking quite random.

We graft this structure on our P2M tree structure and stick in
those MFN in the leafs. But for all other leaf entries, or for the top
root, or middle one, for which there is a void entry, we assume it is
"missing". So
 pfn_to_mfn(0xc0000)=INVALID_P2M_ENTRY.

We add the possibility of setting 1-1 mappings on certain regions, so
that:
 pfn_to_mfn(0xc0000)=0xc0000

The benefit of this is, that we can assume for non-RAM regions (think
PCI BARs, or ACPI spaces), we can create mappings easily b/c we
get the PFN value to match the MFN.

For this to work efficiently we introduce one new page p2m_identity and
allocate (via reserved_brk) any other pages we need to cover the sides
(1GB or 4MB boundary violations). All entries in p2m_identity are set to
INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
no other fancy value).

On lookup we spot that the entry points to p2m_identity and return the identity
value instead of dereferencing and returning INVALID_P2M_ENTRY. If the entry
points to an allocated page, we just proceed as before and return the PFN.
If the PFN has IDENTITY_FRAME_BIT set we unmask that in appropriate functions
(pfn_to_mfn).

The reason for having the IDENTITY_FRAME_BIT instead of just returning the
PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
non-identity pfn. To protect ourselves against we elect to set (and get) the
IDENTITY_FRAME_BIT on all identity mapped PFNs.

This simplistic diagram is used to explain the more subtle piece of code.
There is also a digram of the P2M at the end that can help.
Imagine your E820 looking as so:

                   1GB                                           2GB
/-------------------+---------\/----\         /----------\    /---+-----\
| System RAM        | Sys RAM ||ACPI|         | reserved |    | Sys RAM |
\-------------------+---------/\----/         \----------/    \---+-----/
                              ^- 1029MB                       ^- 2001MB

[1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), 2048MB = 524288 (0x80000)]

And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
is actually not present (would have to kick the balloon driver to put it in).

When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
of the PFN and the end PFN (263424 and 512256 respectively). The first step is
to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
covers 512^2 of page estate (1GB) and in case the start or end PFN is not
aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn to
end pfn.  We reserve_brk top leaf pages if they are missing (means they point
to p2m_mid_missing).

With the E820 example above, 263424 is not 1GB aligned so we allocate a
reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
Each entry in the allocate page is "missing" (points to p2m_missing).

Next stage is to determine if we need to do a more granular boundary check
on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
We check if the start pfn and end pfn violate that boundary check, and if
so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
granularity of setting which PFNs are missing and which ones are identity.
In our example 263424 and 512256 both fail the check so we reserve_brk two
pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" values)
and assign them to p2m[1][2] and p2m[1][488] respectively.

At this point we would at minimum reserve_brk one page, but could be up to
three. Each call to set_phys_range_identity has at maximum a three page
cost. If we were to query the P2M at this stage, all those entries from
start PFN through end PFN (so 1029MB -> 2001MB) would return INVALID_P2M_ENTRY
("missing").

The next step is to walk from the start pfn to the end pfn setting
the IDENTITY_FRAME_BIT on each PFN. This is done in 'set_phys_range_identity'.
If we find that the middle leaf is pointing to p2m_missing we can swap it over
to p2m_identity - this way covering 4MB (or 2MB) PFN space.  At this point we
do not need to worry about boundary aligment (so no need to reserve_brk a middle
page, figure out which PFNs are "missing" and which ones are identity), as that
has been done earlier.  If we find that the middle leaf is not occupied by
p2m_identity or p2m_missing, we dereference that page (which covers
512 PFNs) and set the appropriate PFN with IDENTITY_FRAME_BIT. In our example
263424 and 512256 end up there, and we set from p2m[1][2][256->511] and
p2m[1][488][0->256] with IDENTITY_FRAME_BIT set.

All other regions that are void (or not filled) either point to p2m_missing
(considered missing) or have the default value of INVALID_P2M_ENTRY (also
considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
contain the INVALID_P2M_ENTRY value and are considered "missing."

This is what the p2m ends up looking (for the E820 above) with this
fabulous drawing:

   p2m         /--------------\
 /-----\       | &mfn_list[0],|                           /-----------------\
 |  0  |------>| &mfn_list[1],|    /---------------\      | ~0, ~0, ..      |
 |-----|       |  ..., ~0, ~0 |    | ~0, ~0, [x]---+----->| IDENTITY [@256] |
 |  1  |---\   \--------------/    | [p2m_identity]+\     | IDENTITY [@257] |
 |-----|    \                      | [p2m_identity]+\\    | ....            |
 |  2  |--\  \-------------------->|  ...          | \\   \----------------/
 |-----|   \                       \---------------/  \\
 |  3  |\   \                                          \\  p2m_identity
 |-----| \   \-------------------->/---------------\   /-----------------\
 | ..  +->+                        | [p2m_identity]+-->| ~0, ~0, ~0, ... |
 \-----/ /                         | [p2m_identity]+-->| ..., ~0         |
        / /---------------\        | ....          |   \-----------------/
       /  | IDENTITY[@0]  |      /-+-[x], ~0, ~0.. |
      /   | IDENTITY[@256]|<----/  \---------------/
     /    | ~0, ~0, ....  |
    |     \---------------/
    |
    p2m_missing             p2m_missing
/------------------\     /------------\
| [p2m_mid_missing]+---->| ~0, ~0, ~0 |
| [p2m_mid_missing]+---->| ..., ~0    |
\------------------/     \------------/

where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)

Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
[v5: Changed code to use ranges, added ASCII art]
[v6: Rebased on top of xen->p2m code split]
[v4: Squished patches in just this one]
[v7: Added RESERVE_BRK for potentially allocated pages]
[v8: Fixed alignment problem]
[v9: Changed 1<<3X to 1<<BITS_PER_LONG-X]
[v10: Copied git commit description in the p2m code + Add Review tag]
[v11: Title had '2-1' - should be '1-1' mapping]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8ea9772..65fa4f26 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -29,8 +29,10 @@ typedef struct xpaddr {
 
 /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
 #define INVALID_P2M_ENTRY	(~0UL)
-#define FOREIGN_FRAME_BIT	(1UL<<31)
+#define FOREIGN_FRAME_BIT	(1UL<<(BITS_PER_LONG-1))
+#define IDENTITY_FRAME_BIT	(1UL<<(BITS_PER_LONG-2))
 #define FOREIGN_FRAME(m)	((m) | FOREIGN_FRAME_BIT)
+#define IDENTITY_FRAME(m)	((m) | IDENTITY_FRAME_BIT)
 
 /* Maximum amount of memory we can handle in a domain in pages */
 #define MAX_DOMAIN_PAGES						\
@@ -42,6 +44,8 @@ extern unsigned int   machine_to_phys_order;
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern unsigned long set_phys_range_identity(unsigned long pfn_s,
+					     unsigned long pfn_e);
 
 extern int m2p_add_override(unsigned long mfn, struct page *page);
 extern int m2p_remove_override(struct page *page);
@@ -58,7 +62,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
 	mfn = get_phys_to_machine(pfn);
 
 	if (mfn != INVALID_P2M_ENTRY)
-		mfn &= ~FOREIGN_FRAME_BIT;
+		mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
 
 	return mfn;
 }
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index df4e367..809fe35 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -23,6 +23,129 @@
  * P2M_PER_PAGE depends on the architecture, as a mfn is always
  * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
  * 512 and 1024 entries respectively. 
+ *
+ * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
+ *
+ * However not all entries are filled with MFNs. Specifically for all other
+ * leaf entries, or for the top  root, or middle one, for which there is a void
+ * entry, we assume it is  "missing". So (for example)
+ *  pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
+ *
+ * We also have the possibility of setting 1-1 mappings on certain regions, so
+ * that:
+ *  pfn_to_mfn(0xc0000)=0xc0000
+ *
+ * The benefit of this is, that we can assume for non-RAM regions (think
+ * PCI BARs, or ACPI spaces), we can create mappings easily b/c we
+ * get the PFN value to match the MFN.
+ *
+ * For this to work efficiently we have one new page p2m_identity and
+ * allocate (via reserved_brk) any other pages we need to cover the sides
+ * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
+ * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
+ * no other fancy value).
+ *
+ * On lookup we spot that the entry points to p2m_identity and return the
+ * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
+ * If the entry points to an allocated page, we just proceed as before and
+ * return the PFN.  If the PFN has IDENTITY_FRAME_BIT set we unmask that in
+ * appropriate functions (pfn_to_mfn).
+ *
+ * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
+ * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
+ * non-identity pfn. To protect ourselves against we elect to set (and get) the
+ * IDENTITY_FRAME_BIT on all identity mapped PFNs.
+ *
+ * This simplistic diagram is used to explain the more subtle piece of code.
+ * There is also a digram of the P2M at the end that can help.
+ * Imagine your E820 looking as so:
+ *
+ *                    1GB                                           2GB
+ * /-------------------+---------\/----\         /----------\    /---+-----\
+ * | System RAM        | Sys RAM ||ACPI|         | reserved |    | Sys RAM |
+ * \-------------------+---------/\----/         \----------/    \---+-----/
+ *                               ^- 1029MB                       ^- 2001MB
+ *
+ * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
+ *  2048MB = 524288 (0x80000)]
+ *
+ * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
+ * is actually not present (would have to kick the balloon driver to put it in).
+ *
+ * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
+ * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
+ * of the PFN and the end PFN (263424 and 512256 respectively). The first step
+ * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
+ * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
+ * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
+ * to end pfn.  We reserve_brk top leaf pages if they are missing (means they
+ * point to p2m_mid_missing).
+ *
+ * With the E820 example above, 263424 is not 1GB aligned so we allocate a
+ * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
+ * Each entry in the allocate page is "missing" (points to p2m_missing).
+ *
+ * Next stage is to determine if we need to do a more granular boundary check
+ * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
+ * We check if the start pfn and end pfn violate that boundary check, and if
+ * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
+ * granularity of setting which PFNs are missing and which ones are identity.
+ * In our example 263424 and 512256 both fail the check so we reserve_brk two
+ * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
+ * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
+ *
+ * At this point we would at minimum reserve_brk one page, but could be up to
+ * three. Each call to set_phys_range_identity has at maximum a three page
+ * cost. If we were to query the P2M at this stage, all those entries from
+ * start PFN through end PFN (so 1029MB -> 2001MB) would return
+ * INVALID_P2M_ENTRY ("missing").
+ *
+ * The next step is to walk from the start pfn to the end pfn setting
+ * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
+ * If we find that the middle leaf is pointing to p2m_missing we can swap it
+ * over to p2m_identity - this way covering 4MB (or 2MB) PFN space.  At this
+ * point we do not need to worry about boundary aligment (so no need to
+ * reserve_brk a middle page, figure out which PFNs are "missing" and which
+ * ones are identity), as that has been done earlier.  If we find that the
+ * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
+ * that page (which covers 512 PFNs) and set the appropriate PFN with
+ * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
+ * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
+ * IDENTITY_FRAME_BIT set.
+ *
+ * All other regions that are void (or not filled) either point to p2m_missing
+ * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
+ * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
+ * contain the INVALID_P2M_ENTRY value and are considered "missing."
+ *
+ * This is what the p2m ends up looking (for the E820 above) with this
+ * fabulous drawing:
+ *
+ *    p2m         /--------------\
+ *  /-----\       | &mfn_list[0],|                           /-----------------\
+ *  |  0  |------>| &mfn_list[1],|    /---------------\      | ~0, ~0, ..      |
+ *  |-----|       |  ..., ~0, ~0 |    | ~0, ~0, [x]---+----->| IDENTITY [@256] |
+ *  |  1  |---\   \--------------/    | [p2m_identity]+\     | IDENTITY [@257] |
+ *  |-----|    \                      | [p2m_identity]+\\    | ....            |
+ *  |  2  |--\  \-------------------->|  ...          | \\   \----------------/
+ *  |-----|   \                       \---------------/  \\
+ *  |  3  |\   \                                          \\  p2m_identity
+ *  |-----| \   \-------------------->/---------------\   /-----------------\
+ *  | ..  +->+                        | [p2m_identity]+-->| ~0, ~0, ~0, ... |
+ *  \-----/ /                         | [p2m_identity]+-->| ..., ~0         |
+ *         / /---------------\        | ....          |   \-----------------/
+ *        /  | IDENTITY[@0]  |      /-+-[x], ~0, ~0.. |
+ *       /   | IDENTITY[@256]|<----/  \---------------/
+ *      /    | ~0, ~0, ....  |
+ *     |     \---------------/
+ *     |
+ *     p2m_missing             p2m_missing
+ * /------------------\     /------------\
+ * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
+ * | [p2m_mid_missing]+---->| ..., ~0    |
+ * \------------------/     \------------/
+ *
+ * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
  */
 
 #include <linux/init.h>
@@ -59,9 +182,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
 static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
 static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
 
+static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
+
 RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 
+/* We might hit two boundary violations at the start and end, at max each
+ * boundary violation will require three middle nodes. */
+RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
+
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
 	BUG_ON(pfn >= MAX_P2M_PFN);
@@ -221,6 +350,9 @@ void __init xen_build_dynamic_phys_to_machine(void)
 	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
 	p2m_top_init(p2m_top);
 
+	p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	p2m_init(p2m_identity);
+
 	/*
 	 * The domain builder gives us a pre-constructed p2m array in
 	 * mfn_list for all the pages initially given to us, so we just
@@ -272,6 +404,14 @@ unsigned long get_phys_to_machine(unsigned long pfn)
 	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);
 
+	/*
+	 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
+	 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
+	 * would be wrong.
+	 */
+	if (p2m_top[topidx][mididx] == p2m_identity)
+		return IDENTITY_FRAME(pfn);
+
 	return p2m_top[topidx][mididx][idx];
 }
 EXPORT_SYMBOL_GPL(get_phys_to_machine);
@@ -341,9 +481,11 @@ static bool alloc_p2m(unsigned long pfn)
 			p2m_top_mfn_p[topidx] = mid_mfn;
 	}
 
-	if (p2m_top[topidx][mididx] == p2m_missing) {
+	if (p2m_top[topidx][mididx] == p2m_identity ||
+	    p2m_top[topidx][mididx] == p2m_missing) {
 		/* p2m leaf page is missing */
 		unsigned long *p2m;
+		unsigned long *p2m_orig = p2m_top[topidx][mididx];
 
 		p2m = alloc_p2m_page();
 		if (!p2m)
@@ -351,7 +493,7 @@ static bool alloc_p2m(unsigned long pfn)
 
 		p2m_init(p2m);
 
-		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+		if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
 			free_p2m_page(p2m);
 		else
 			mid_mfn[mididx] = virt_to_mfn(p2m);
@@ -360,6 +502,82 @@ static bool alloc_p2m(unsigned long pfn)
 	return true;
 }
 
+bool __early_alloc_p2m(unsigned long pfn)
+{
+	unsigned topidx, mididx, idx;
+
+	topidx = p2m_top_index(pfn);
+	mididx = p2m_mid_index(pfn);
+	idx = p2m_index(pfn);
+
+	/* Pfff.. No boundary cross-over, lets get out. */
+	if (!idx)
+		return false;
+
+	WARN(p2m_top[topidx][mididx] == p2m_identity,
+		"P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
+		topidx, mididx);
+
+	/*
+	 * Could be done by xen_build_dynamic_phys_to_machine..
+	 */
+	if (p2m_top[topidx][mididx] != p2m_missing)
+		return false;
+
+	/* Boundary cross-over for the edges: */
+	if (idx) {
+		unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+		p2m_init(p2m);
+
+		p2m_top[topidx][mididx] = p2m;
+
+	}
+	return idx != 0;
+}
+unsigned long set_phys_range_identity(unsigned long pfn_s,
+				      unsigned long pfn_e)
+{
+	unsigned long pfn;
+
+	if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
+		return 0;
+
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+		return pfn_e - pfn_s;
+
+	if (pfn_s > pfn_e)
+		return 0;
+
+	for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
+		pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
+		pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
+	{
+		unsigned topidx = p2m_top_index(pfn);
+		if (p2m_top[topidx] == p2m_mid_missing) {
+			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+
+			p2m_mid_init(mid);
+
+			p2m_top[topidx] = mid;
+		}
+	}
+
+	__early_alloc_p2m(pfn_s);
+	__early_alloc_p2m(pfn_e);
+
+	for (pfn = pfn_s; pfn < pfn_e; pfn++)
+		if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
+			break;
+
+	if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+		"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
+		(pfn_e - pfn_s) - (pfn - pfn_s)))
+		printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+
+	return pfn - pfn_s;
+}
+
 /* Try to install p2m mapping; fail if intermediate bits missing */
 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
@@ -378,6 +596,20 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	mididx = p2m_mid_index(pfn);
 	idx = p2m_index(pfn);
 
+	/* For sparse holes were the p2m leaf has real PFN along with
+	 * PCI holes, stick in the PFN as the MFN value.
+	 */
+	if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
+		if (p2m_top[topidx][mididx] == p2m_identity)
+			return true;
+
+		/* Swap over from MISSING to IDENTITY if needed. */
+		if (p2m_top[topidx][mididx] == p2m_missing) {
+			p2m_top[topidx][mididx] = p2m_identity;
+			return true;
+		}
+	}
+
 	if (p2m_top[topidx][mididx] == p2m_missing)
 		return mfn == INVALID_P2M_ENTRY;
 
-- 
cgit v0.10.2


From fb38923ead10aa8a28db191548e176e8856614d7 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 5 Jan 2011 15:46:31 -0500
Subject: xen/mmu: Set _PAGE_IOMAP if PFN is an identity PFN.

If we find that the PFN is within the P2M as an identity
PFN make sure to tack on the _PAGE_IOMAP flag.

Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0180ae8..9c9e076 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -416,8 +416,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 	if (val & _PAGE_PRESENT) {
 		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 		pteval_t flags = val & PTE_FLAGS_MASK;
-		unsigned long mfn = pfn_to_mfn(pfn);
+		unsigned long mfn;
 
+		if (!xen_feature(XENFEAT_auto_translated_physmap))
+			mfn = get_phys_to_machine(pfn);
+		else
+			mfn = pfn;
 		/*
 		 * If there's no mfn for the pfn, then just create an
 		 * empty non-present pte.  Unfortunately this loses
@@ -427,8 +431,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 			mfn = 0;
 			flags = 0;
+		} else {
+			/*
+			 * Paramount to do this test _after_ the
+			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
+			 * IDENTITY_FRAME_BIT resolves to true.
+			 */
+			mfn &= ~FOREIGN_FRAME_BIT;
+			if (mfn & IDENTITY_FRAME_BIT) {
+				mfn &= ~IDENTITY_FRAME_BIT;
+				flags |= _PAGE_IOMAP;
+			}
 		}
-
 		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 	}
 
-- 
cgit v0.10.2


From c7617798771ad588d585986d896197c04b737621 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 18 Jan 2011 20:17:10 -0500
Subject: xen/mmu: WARN_ON when racing to swap middle leaf.

The initial bootup code uses set_phys_to_machine quite a lot, and after
bootup it would be used by the balloon driver. The balloon driver does have
mutex lock so this should not be necessary - but just in case, add
a WARN_ON if we do hit this scenario. If we do fail this, it is OK
to continue as there is a backup mechanism (VM_IO) that can bypass
the P2M and still set the _PAGE_IOMAP flags.

[v2: Change from WARN to BUG_ON]
[v3: Rebased on top of xen->p2m code split]
[v4: Change from BUG_ON to WARN]
Reviewed-by: Ian Campbell <Ian.Campbell@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 809fe35..4631cf9 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -605,7 +605,8 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 
 		/* Swap over from MISSING to IDENTITY if needed. */
 		if (p2m_top[topidx][mididx] == p2m_missing) {
-			p2m_top[topidx][mididx] = p2m_identity;
+			WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
+				p2m_identity) != p2m_missing);
 			return true;
 		}
 	}
-- 
cgit v0.10.2


From 68df0da7f42be6ae017fe9f48ac414c43a7b9d32 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 1 Feb 2011 17:15:30 -0500
Subject: xen/setup: Set identity mapping for non-RAM E820 and E820 gaps.

We walk the E820 region and start at 0 (for PV guests we start
at ISA_END_ADDRESS) and skip any E820 RAM regions. For all other
regions and as well the gaps we set them to be identity mappings.

The reasons we do not want to set the identity mapping from 0->
ISA_END_ADDRESS when running as PV is b/c that the kernel would
try to read DMI information and fail (no permissions to read that).
There is a lot of gnarly code to deal with that weird region so
we won't try to do a cleanup in this patch.

This code ends up calling 'set_phys_to_identity' with the start
and end PFN of the the E820 that are non-RAM or have gaps.
On 99% of machines that means one big region right underneath the
4GB mark. Usually starts at 0xc0000 (or 0x80000) and goes to
0x100000.

[v2: Fix for E820 crossing 1MB region and clamp the start]
[v3: Squshed in code that does this over ranges]
[v4: Moved the comment to the correct spot]
[v5: Use the "raw" E820 from the hypervisor]
[v6: Added Review-by tag]

Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 7201800..54d9379 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -143,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
 	return released;
 }
 
+static unsigned long __init xen_set_identity(const struct e820entry *list,
+					     ssize_t map_size)
+{
+	phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
+	phys_addr_t start_pci = last;
+	const struct e820entry *entry;
+	unsigned long identity = 0;
+	int i;
+
+	for (i = 0, entry = list; i < map_size; i++, entry++) {
+		phys_addr_t start = entry->addr;
+		phys_addr_t end = start + entry->size;
+
+		if (start < last)
+			start = last;
+
+		if (end <= start)
+			continue;
+
+		/* Skip over the 1MB region. */
+		if (last > end)
+			continue;
+
+		if (entry->type == E820_RAM) {
+			if (start > start_pci)
+				identity += set_phys_range_identity(
+						PFN_UP(start_pci), PFN_DOWN(start));
+
+			/* Without saving 'last' we would gooble RAM too
+			 * at the end of the loop. */
+			last = end;
+			start_pci = end;
+			continue;
+		}
+		start_pci = min(start, start_pci);
+		last = end;
+	}
+	if (last > start_pci)
+		identity += set_phys_range_identity(
+					PFN_UP(start_pci), PFN_DOWN(last));
+	return identity;
+}
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
 	static struct e820entry map[E820MAX] __initdata;
+	static struct e820entry map_raw[E820MAX] __initdata;
 
 	unsigned long max_pfn = xen_start_info->nr_pages;
 	unsigned long long mem_end;
@@ -156,6 +199,7 @@ char * __init xen_memory_setup(void)
 	struct xen_memory_map memmap;
 	unsigned long extra_pages = 0;
 	unsigned long extra_limit;
+	unsigned long identity_pages = 0;
 	int i;
 	int op;
 
@@ -181,6 +225,7 @@ char * __init xen_memory_setup(void)
 	}
 	BUG_ON(rc);
 
+	memcpy(map_raw, map, sizeof(map));
 	e820.nr_map = 0;
 	xen_extra_mem_start = mem_end;
 	for (i = 0; i < memmap.nr_entries; i++) {
@@ -251,6 +296,13 @@ char * __init xen_memory_setup(void)
 
 	xen_add_extra_mem(extra_pages);
 
+	/*
+	 * Set P2M for all non-RAM pages and E820 gaps to be identity
+	 * type PFNs. We supply it with the non-sanitized version
+	 * of the E820.
+	 */
+	identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
+	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
 	return "Xen";
 }
 
-- 
cgit v0.10.2


From 2222e71bd6eff7b2ad026d4ee663b6327c5a49f5 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 22 Dec 2010 08:57:30 -0500
Subject: xen/debugfs: Add 'p2m' file for printing out the P2M layout.

We walk over the whole P2M tree and construct a simplified view of
which PFN regions belong to what level and what type they are.

Only enabled if CONFIG_XEN_DEBUG_FS is set.

[v2: UNKN->UNKNOWN, use uninitialized_var]
[v3: Rebased on top of mmu->p2m code split]
[v4: Fixed the else if]
Reviewed-by: Ian Campbell <Ian.Campbell@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 65fa4f26..78ebbeb 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -52,6 +52,9 @@ extern int m2p_remove_override(struct page *page);
 extern struct page *m2p_find_override(unsigned long mfn);
 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
 
+#ifdef CONFIG_XEN_DEBUG_FS
+extern int p2m_dump_show(struct seq_file *m, void *v);
+#endif
 static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
 	unsigned long mfn;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 9c9e076..b13b6ca 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -46,6 +46,7 @@
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/seq_file.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -2367,6 +2368,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
 
 #ifdef CONFIG_XEN_DEBUG_FS
 
+static int p2m_dump_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, p2m_dump_show, NULL);
+}
+
+static const struct file_operations p2m_dump_fops = {
+	.open		= p2m_dump_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static struct dentry *d_mmu_debug;
 
 static int __init xen_mmu_debugfs(void)
@@ -2422,6 +2435,7 @@ static int __init xen_mmu_debugfs(void)
 	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
 			   &mmu_stats.prot_commit_batched);
 
+	debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
 	return 0;
 }
 fs_initcall(xen_mmu_debugfs);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 4631cf9..65f21f4 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -153,6 +153,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/sched.h>
+#include <linux/seq_file.h>
 
 #include <asm/cache.h>
 #include <asm/setup.h>
@@ -758,3 +759,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+int p2m_dump_show(struct seq_file *m, void *v)
+{
+	static const char * const level_name[] = { "top", "middle",
+						"entry", "abnormal" };
+	static const char * const type_name[] = { "identity", "missing",
+						"pfn", "abnormal"};
+#define TYPE_IDENTITY 0
+#define TYPE_MISSING 1
+#define TYPE_PFN 2
+#define TYPE_UNKNOWN 3
+	unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
+	unsigned int uninitialized_var(prev_level);
+	unsigned int uninitialized_var(prev_type);
+
+	if (!p2m_top)
+		return 0;
+
+	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
+		unsigned topidx = p2m_top_index(pfn);
+		unsigned mididx = p2m_mid_index(pfn);
+		unsigned idx = p2m_index(pfn);
+		unsigned lvl, type;
+
+		lvl = 4;
+		type = TYPE_UNKNOWN;
+		if (p2m_top[topidx] == p2m_mid_missing) {
+			lvl = 0; type = TYPE_MISSING;
+		} else if (p2m_top[topidx] == NULL) {
+			lvl = 0; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx] == NULL) {
+			lvl = 1; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx] == p2m_identity) {
+			lvl = 1; type = TYPE_IDENTITY;
+		} else if (p2m_top[topidx][mididx] == p2m_missing) {
+			lvl = 1; type = TYPE_MISSING;
+		} else if (p2m_top[topidx][mididx][idx] == 0) {
+			lvl = 2; type = TYPE_UNKNOWN;
+		} else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
+			lvl = 2; type = TYPE_IDENTITY;
+		} else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
+			lvl = 2; type = TYPE_MISSING;
+		} else if (p2m_top[topidx][mididx][idx] == pfn) {
+			lvl = 2; type = TYPE_PFN;
+		} else if (p2m_top[topidx][mididx][idx] != pfn) {
+			lvl = 2; type = TYPE_PFN;
+		}
+		if (pfn == 0) {
+			prev_level = lvl;
+			prev_type = type;
+		}
+		if (pfn == MAX_DOMAIN_PAGES-1) {
+			lvl = 3;
+			type = TYPE_UNKNOWN;
+		}
+		if (prev_type != type) {
+			seq_printf(m, " [0x%lx->0x%lx] %s\n",
+				prev_pfn_type, pfn, type_name[prev_type]);
+			prev_pfn_type = pfn;
+			prev_type = type;
+		}
+		if (prev_level != lvl) {
+			seq_printf(m, " [0x%lx->0x%lx] level %s\n",
+				prev_pfn_level, pfn, level_name[prev_level]);
+			prev_pfn_level = pfn;
+			prev_level = lvl;
+		}
+	}
+	return 0;
+#undef TYPE_IDENTITY
+#undef TYPE_MISSING
+#undef TYPE_PFN
+#undef TYPE_UNKNOWN
+}
+#endif
-- 
cgit v0.10.2


From fc25151d9ac7d809239fe68de0a1490b504bb94a Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 23 Dec 2010 16:25:29 -0500
Subject: xen/debug: WARN_ON when identity PFN has no _PAGE_IOMAP flag set.

Only enabled if XEN_DEBUG is enabled. We print a warning
when:

 pfn_to_mfn(pfn) == pfn, but no VM_IO (_PAGE_IOMAP) flag set
	(and pfn is an identity mapped pfn)
 pfn_to_mfn(pfn) != pfn, and VM_IO flag is set.
	(ditto, pfn is an identity mapped pfn)

[v2: Make it dependent on CONFIG_XEN_DEBUG instead of ..DEBUG_FS]
[v3: Fix compiler warning]

Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 5b54892..e4343fe 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -48,3 +48,11 @@ config XEN_DEBUG_FS
 	help
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
+
+config XEN_DEBUG
+	bool "Enable Xen debug checks"
+	depends on XEN
+	default n
+	help
+	  Enable various WARN_ON checks in the Xen MMU code.
+	  Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index b13b6ca..0c376a2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -547,6 +547,41 @@ pte_t xen_make_pte(pteval_t pte)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 
+#ifdef CONFIG_XEN_DEBUG
+pte_t xen_make_pte_debug(pteval_t pte)
+{
+	phys_addr_t addr = (pte & PTE_PFN_MASK);
+	phys_addr_t other_addr;
+	bool io_page = false;
+	pte_t _pte;
+
+	if (pte & _PAGE_IOMAP)
+		io_page = true;
+
+	_pte = xen_make_pte(pte);
+
+	if (!addr)
+		return _pte;
+
+	if (io_page &&
+	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
+		other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
+		WARN(addr != other_addr,
+			"0x%lx is using VM_IO, but it is 0x%lx!\n",
+			(unsigned long)addr, (unsigned long)other_addr);
+	} else {
+		pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
+		other_addr = (_pte.pte & PTE_PFN_MASK);
+		WARN((addr == other_addr) && (!io_page) && (!iomap_set),
+			"0x%lx is missing VM_IO (and wasn't fixed)!\n",
+			(unsigned long)addr);
+	}
+
+	return _pte;
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
+#endif
+
 pgd_t xen_make_pgd(pgdval_t pgd)
 {
 	pgd = pte_pfn_to_mfn(pgd);
@@ -1957,6 +1992,9 @@ __init void xen_ident_map_ISA(void)
 
 static __init void xen_post_allocator_init(void)
 {
+#ifdef CONFIG_XEN_DEBUG
+	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
+#endif
 	pv_mmu_ops.set_pte = xen_set_pte;
 	pv_mmu_ops.set_pmd = xen_set_pmd;
 	pv_mmu_ops.set_pud = xen_set_pud;
-- 
cgit v0.10.2


From 146c4e511717e581065800938537b276173d8548 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 14 Jan 2011 17:55:44 -0500
Subject: xen/m2p: No need to catch exceptions when we know that there is no
 RAM

.. beyound what we think is the end of memory. However there might
be more System RAM - but assigned to a guest. Hence jump to the
M2P override check and consult.

[v1: Added Review-by tag]

Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 78ebbeb..1957070 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -85,6 +85,10 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return mfn;
 
+	if (unlikely((mfn >> machine_to_phys_order) != 0)) {
+		pfn = ~0;
+		goto try_override;
+	}
 	pfn = 0;
 	/*
 	 * The array access can fail (e.g., device space beyond end of RAM).
@@ -92,7 +96,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 	 * but we must handle the fault without crashing!
 	 */
 	__get_user(pfn, &machine_to_phys_mapping[mfn]);
-
+try_override:
 	/*
 	 * If this appears to be a foreign mfn (because the pfn
 	 * doesn't map back to the mfn), then check the local override
-- 
cgit v0.10.2


From 706cc9d2a4cb9b03217e15b0bb3d117f4d5109ee Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Wed, 2 Feb 2011 18:32:59 +0000
Subject: xen/m2p: Check whether the MFN has IDENTITY_FRAME bit set..

If there is no proper PFN value in the M2P for the MFN
(so we get 0xFFFFF.. or 0x55555, or 0x0), we should
consult the M2P override to see if there is an entry for this.
[Note: we also consult the M2P override if the MFN
is past our machine_to_phys size].

We consult the P2M with the PFN. In case the returned
MFN is one of the special values: 0xFFF.., 0x5555
(which signify that the MFN can be either "missing" or it
belongs to DOMID_IO) or the p2m(m2p(mfn)) != mfn, we check
the M2P override. If we fail the M2P override check, we reset
the PFN value to INVALID_P2M_ENTRY.

Next we try to find the MFN in the P2M using the MFN
value (not the PFN value) and if found, we know
that this MFN is an identity value and return it as so.

Otherwise we have exhausted all the posibilities and we
return the PFN, which at this stage can either be a real
PFN value found in the machine_to_phys.. array, or
INVALID_P2M_ENTRY value.

[v1: Added Review-by tag]

Reviewed-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1957070..c61934f 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -81,6 +81,7 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
 static inline unsigned long mfn_to_pfn(unsigned long mfn)
 {
 	unsigned long pfn;
+	int ret = 0;
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return mfn;
@@ -95,15 +96,29 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 	 * In such cases it doesn't matter what we return (we return garbage),
 	 * but we must handle the fault without crashing!
 	 */
-	__get_user(pfn, &machine_to_phys_mapping[mfn]);
+	ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
 try_override:
-	/*
-	 * If this appears to be a foreign mfn (because the pfn
-	 * doesn't map back to the mfn), then check the local override
-	 * table to see if there's a better pfn to use.
+	/* ret might be < 0 if there are no entries in the m2p for mfn */
+	if (ret < 0)
+		pfn = ~0;
+	else if (get_phys_to_machine(pfn) != mfn)
+		/*
+		 * If this appears to be a foreign mfn (because the pfn
+		 * doesn't map back to the mfn), then check the local override
+		 * table to see if there's a better pfn to use.
+		 *
+		 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
+		 */
+		pfn = m2p_find_override_pfn(mfn, ~0);
+
+	/* 
+	 * pfn is ~0 if there are no entries in the m2p for mfn or if the
+	 * entry doesn't map back to the mfn and m2p_override doesn't have a
+	 * valid entry for it.
 	 */
-	if (get_phys_to_machine(pfn) != mfn)
-		pfn = m2p_find_override_pfn(mfn, pfn);
+	if (pfn == ~0 &&
+			get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
+		pfn = mfn;
 
 	return pfn;
 }
-- 
cgit v0.10.2


From 6e0aa9f8a8190e0879a29bd67aa606b51734a122 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 14 Mar 2011 10:34:35 +0100
Subject: futex: Deobfuscate handle_futex_death()

handle_futex_death() uses futex_atomic_cmpxchg_inatomic() without
disabling page faults. That's ok, but totally non obvious.

We don't hold locks so we actually can and want to fault here, because
the get_user() before futex_atomic_cmpxchg_inatomic() does not
guarantee a R/W mapping.

We could just add a big fat comment to explain this, but actually
changing the code so that the functionality is entirely clear is
better.

Use the helper function which disables page faults around the
futex_atomic_cmpxchg_inatomic() and handle a fault with a call to
fault_in_user_writeable() as all other places in the futex code do as
well.

Pointed-out-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Darren Hart <darren@dvhart.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
LKML-Reference: <alpine.LFD.2.00.1103141126590.2787@localhost6.localdomain6>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/kernel/futex.c b/kernel/futex.c
index c6bef6e..e9251d9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2458,9 +2458,20 @@ retry:
 		 * userspace.
 		 */
 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-		if (futex_atomic_cmpxchg_inatomic(&nval, uaddr, uval, mval))
-			return -1;
-
+		/*
+		 * We are not holding a lock here, but we want to have
+		 * the pagefault_disable/enable() protection because
+		 * we want to handle the fault gracefully. If the
+		 * access fails we try to fault in the futex with R/W
+		 * verification via get_user_pages. get_user() above
+		 * does not guarantee R/W access. If that fails we
+		 * give up and leave the futex locked.
+		 */
+		if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+			if (fault_in_user_writeable(uaddr))
+				return -1;
+			goto retry;
+		}
 		if (nval != uval)
 			goto retry;
 
-- 
cgit v0.10.2


From 07d5ecae2940ddd77746e2fb597dcf57d3c2e277 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 14 Mar 2011 20:00:30 +0100
Subject: arm: Remove bogus comment in futex_atomic_cmpxchg_inatomic()

commit 522d7dec(futex: Remove redundant pagefault_disable in
futex_atomic_cmpxchg_inatomic()) added a bogus comment.

/* Note that preemption is disabled by futex_atomic_cmpxchg_inatomic
 * call sites. */

Bogus in two aspects:

1) pagefault_disable != preempt_disable even if the mechanism we use
   is the same

2) we have a call site which deliberately does not disable pagefaults
   as it wants the possible fault to be handled - though that has been
   changed for consistency reasons now.

Sigh. I really should have seen that when committing the above. :(

Catched-by-and-rightfully-ranted-at-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <alpine.LFD.2.00.1103141126590.2787@localhost6.localdomain6>
Cc: Michel Lespinasse <walken@google.com>
Cc: Darren Hart <darren@dvhart.com>

diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 0e29d8e..199a6b6 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -97,9 +97,6 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
-	/* Note that preemption is disabled by futex_atomic_cmpxchg_inatomic
-	 * call sites. */
-
 	__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
 	"1:	" T(ldr) "	%1, [%4]\n"
 	"	teq	%1, %2\n"
-- 
cgit v0.10.2


From f52e0c11305aa09ed56cad97ffc8f0cdc3d78b5d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 18:56:51 -0400
Subject: New AT_... flag: AT_EMPTY_PATH

For name_to_handle_at(2) we'll want both ...at()-style syscall that
would be usable for non-directory descriptors (with empty relative
pathname).  Introduce new flag (AT_EMPTY_PATH) to deal with that and
corresponding LOOKUP_EMPTY; teach user_path_at() and path_init() to
deal with the latter.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index abc8d2d..83e92ba 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
 	return retval;
 }
 
-char * getname(const char __user * filename)
+static char *getname_flags(const char __user * filename, int flags)
 {
 	char *tmp, *result;
 
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
 
 		result = tmp;
 		if (retval < 0) {
-			__putname(tmp);
-			result = ERR_PTR(retval);
+			if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+				__putname(tmp);
+				result = ERR_PTR(retval);
+			}
 		}
 	}
 	audit_getname(result);
 	return result;
 }
 
+char *getname(const char __user * filename)
+{
+	return getname_flags(filename, 0);
+}
+
 #ifdef CONFIG_AUDITSYSCALL
 void putname(const char *name)
 {
@@ -1544,13 +1551,15 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 
 		dentry = file->f_path.dentry;
 
-		retval = -ENOTDIR;
-		if (!S_ISDIR(dentry->d_inode->i_mode))
-			goto fput_fail;
+		if (*name) {
+			retval = -ENOTDIR;
+			if (!S_ISDIR(dentry->d_inode->i_mode))
+				goto fput_fail;
 
-		retval = file_permission(file, MAY_EXEC);
-		if (retval)
-			goto fput_fail;
+			retval = file_permission(file, MAY_EXEC);
+			if (retval)
+				goto fput_fail;
+		}
 
 		nd->path = file->f_path;
 		if (flags & LOOKUP_RCU) {
@@ -1759,7 +1768,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
 		 struct path *path)
 {
 	struct nameidata nd;
-	char *tmp = getname(name);
+	char *tmp = getname_flags(name, flags);
 	int err = PTR_ERR(tmp);
 	if (!IS_ERR(tmp)) {
 
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index a562fa5..f550f89 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -46,6 +46,7 @@
                                            unlinking file.  */
 #define AT_SYMLINK_FOLLOW	0x400   /* Follow symbolic links.  */
 #define AT_NO_AUTOMOUNT		0x800	/* Suppress terminal automount traversal */
+#define AT_EMPTY_PATH		0x1000	/* Allow empty relative pathname */
 
 #ifdef __KERNEL__
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 83cd6e5..9c86038 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -64,6 +64,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 
 #define LOOKUP_JUMPED		0x1000
 #define LOOKUP_ROOT		0x2000
+#define LOOKUP_EMPTY		0x4000
 
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
-- 
cgit v0.10.2


From 990d6c2d7aee921e3bce22b2d6a750fd552262be Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:26 +0530
Subject: vfs: Add name to file handle conversion support

The syscall also return mount id which can be used
to lookup file system specific information such as uuid
in /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa..7cb53aa 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
 	def_bool n
 
 config EXPORTFS
-	tristate
+	bool
 
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef..ba01202 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 
+obj-$(CONFIG_FHANDLE)		+= fhandle.o
+
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 0000000..9f79e74
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,107 @@
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/exportfs.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static long do_sys_name_to_handle(struct path *path,
+				  struct file_handle __user *ufh,
+				  int __user *mnt_id)
+{
+	long retval;
+	struct file_handle f_handle;
+	int handle_dwords, handle_bytes;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * We need t make sure wether the file system
+	 * support decoding of the file handle
+	 */
+	if (!path->mnt->mnt_sb->s_export_op ||
+	    !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+		return -EFAULT;
+
+	if (f_handle.handle_bytes > MAX_HANDLE_SZ)
+		return -EINVAL;
+
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	/* convert handle size to  multiple of sizeof(u32) */
+	handle_dwords = f_handle.handle_bytes >> 2;
+
+	/* we ask for a non connected handle */
+	retval = exportfs_encode_fh(path->dentry,
+				    (struct fid *)handle->f_handle,
+				    &handle_dwords,  0);
+	handle->handle_type = retval;
+	/* convert handle size to bytes */
+	handle_bytes = handle_dwords * sizeof(u32);
+	handle->handle_bytes = handle_bytes;
+	if ((handle->handle_bytes > f_handle.handle_bytes) ||
+	    (retval == 255) || (retval == -ENOSPC)) {
+		/* As per old exportfs_encode_fh documentation
+		 * we could return ENOSPC to indicate overflow
+		 * But file system returned 255 always. So handle
+		 * both the values
+		 */
+		/*
+		 * set the handle size to zero so we copy only
+		 * non variable part of the file_handle
+		 */
+		handle_bytes = 0;
+		retval = -EOVERFLOW;
+	} else
+		retval = 0;
+	/* copy the mount id */
+	if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
+	    copy_to_user(ufh, handle,
+			 sizeof(struct file_handle) + handle_bytes))
+		retval = -EFAULT;
+	kfree(handle);
+	return retval;
+}
+
+/**
+ * sys_name_to_handle_at: convert name to handle
+ * @dfd: directory relative to which name is interpreted if not absolute
+ * @name: name that should be converted to handle.
+ * @handle: resulting file handle
+ * @mnt_id: mount id of the file system containing the file
+ * @flag: flag value to indicate whether to follow symlink or not
+ *
+ * @handle->handle_size indicate the space available to store the
+ * variable part of the file handle in bytes. If there is not
+ * enough space, the field is updated to return the minimum
+ * value required.
+ */
+SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
+		struct file_handle __user *, handle, int __user *, mnt_id,
+		int, flag)
+{
+	struct path path;
+	int lookup_flags;
+	int err;
+
+	if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	err = user_path_at(dfd, name, lookup_flags, &path);
+	if (!err) {
+		err = do_sys_name_to_handle(&path, handle, mnt_id);
+		path_put(&path);
+	}
+	return err;
+}
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 65afdfd..33a42f2 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -8,6 +8,9 @@ struct inode;
 struct super_block;
 struct vfsmount;
 
+/* limit the handle size to NFSv4 handle size now */
+#define MAX_HANDLE_SZ 128
+
 /*
  * The fileid_type identifies how the file within the filesystem is encoded.
  * In theory this is freely set and parsed by the filesystem, but we try to
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b7178b0..3f64630 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -978,6 +978,13 @@ struct file {
 #endif
 };
 
+struct file_handle {
+	__u32 handle_bytes;
+	int handle_type;
+	/* file identifier */
+	unsigned char f_handle[0];
+};
+
 #define get_file(x)	atomic_long_inc(&(x)->f_count)
 #define fput_atomic(x)	atomic_long_add_unless(&(x)->f_count, -1, 1)
 #define file_count(x)	atomic_long_read(&(x)->f_count)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 98664db..9701126 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -62,6 +62,7 @@ struct robust_list_head;
 struct getcpu_cache;
 struct old_linux_dirent;
 struct perf_event_attr;
+struct file_handle;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -832,5 +833,7 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
 			unsigned long prot, unsigned long flags,
 			unsigned long fd, unsigned long pgoff);
 asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
-
+asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
+				      struct file_handle __user *handle,
+				      int __user *mnt_id, int flag);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index be788c0..e72fa17 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -287,6 +287,18 @@ config BSD_PROCESS_ACCT_V3
 	  for processing it. A preliminary version of these tools is available
 	  at <http://www.gnu.org/software/acct/>.
 
+config FHANDLE
+	bool "open by fhandle syscalls"
+	select EXPORTFS
+	help
+	  If you say Y here, a user level program will be able to map
+	  file names to handle and then later use the handle for
+	  different file system operations. This is useful in implementing
+	  userspace file servers, which now track files using handles instead
+	  of names. The handle would remain the same even if file names
+	  get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
+	  syscalls.
+
 config TASKSTATS
 	bool "Export task/process statistics through netlink (EXPERIMENTAL)"
 	depends on NET
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9..4e01343 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,3 +186,6 @@ cond_syscall(sys_perf_event_open);
 /* fanotify! */
 cond_syscall(sys_fanotify_init);
 cond_syscall(sys_fanotify_mark);
+
+/* open by handle */
+cond_syscall(sys_name_to_handle_at);
-- 
cgit v0.10.2


From becfd1f37544798cbdfd788f32c827160fab98c1 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:26 +0530
Subject: vfs: Add open by file handle support

[AV: duplicate of open() guts removed; file_open_root() used instead]

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/compat.c b/fs/compat.c
index a071775..c6d31a3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -2284,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
 }
 
 #endif /* CONFIG_TIMERFD */
+
+#ifdef CONFIG_FHANDLE
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open_by_handle_at(int mountdirfd,
+			     struct file_handle __user *handle, int flags)
+{
+	return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index cfe5573..b05acb7 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -374,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	/*
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
+	if (!nop || !nop->fh_to_dentry)
+		return ERR_PTR(-ESTALE);
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
 	if (!result)
 		result = ERR_PTR(-ESTALE);
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 9f79e74..bf93ad2 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -5,6 +5,8 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/exportfs.h>
+#include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -105,3 +107,159 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 	}
 	return err;
 }
+
+static struct vfsmount *get_vfsmount_from_fd(int fd)
+{
+	struct path path;
+
+	if (fd == AT_FDCWD) {
+		struct fs_struct *fs = current->fs;
+		spin_lock(&fs->lock);
+		path = fs->pwd;
+		mntget(path.mnt);
+		spin_unlock(&fs->lock);
+	} else {
+		int fput_needed;
+		struct file *file = fget_light(fd, &fput_needed);
+		if (!file)
+			return ERR_PTR(-EBADF);
+		path = file->f_path;
+		mntget(path.mnt);
+		fput_light(file, fput_needed);
+	}
+	return path.mnt;
+}
+
+static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
+{
+	return 1;
+}
+
+static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
+			     struct path *path)
+{
+	int retval = 0;
+	int handle_dwords;
+
+	path->mnt = get_vfsmount_from_fd(mountdirfd);
+	if (IS_ERR(path->mnt)) {
+		retval = PTR_ERR(path->mnt);
+		goto out_err;
+	}
+	/* change the handle size to multiple of sizeof(u32) */
+	handle_dwords = handle->handle_bytes >> 2;
+	path->dentry = exportfs_decode_fh(path->mnt,
+					  (struct fid *)handle->f_handle,
+					  handle_dwords, handle->handle_type,
+					  vfs_dentry_acceptable, NULL);
+	if (IS_ERR(path->dentry)) {
+		retval = PTR_ERR(path->dentry);
+		goto out_mnt;
+	}
+	return 0;
+out_mnt:
+	mntput(path->mnt);
+out_err:
+	return retval;
+}
+
+static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
+		   struct path *path)
+{
+	int retval = 0;
+	struct file_handle f_handle;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * With handle we don't look at the execute bit on the
+	 * the directory. Ideally we would like CAP_DAC_SEARCH.
+	 * But we don't have that
+	 */
+	if (!capable(CAP_DAC_READ_SEARCH)) {
+		retval = -EPERM;
+		goto out_err;
+	}
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
+		retval = -EFAULT;
+		goto out_err;
+	}
+	if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+	    (f_handle.handle_bytes == 0)) {
+		retval = -EINVAL;
+		goto out_err;
+	}
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle) {
+		retval = -ENOMEM;
+		goto out_err;
+	}
+	/* copy the full handle */
+	if (copy_from_user(handle, ufh,
+			   sizeof(struct file_handle) +
+			   f_handle.handle_bytes)) {
+		retval = -EFAULT;
+		goto out_handle;
+	}
+
+	retval = do_handle_to_path(mountdirfd, handle, path);
+
+out_handle:
+	kfree(handle);
+out_err:
+	return retval;
+}
+
+long do_handle_open(int mountdirfd,
+		    struct file_handle __user *ufh, int open_flag)
+{
+	long retval = 0;
+	struct path path;
+	struct file *file;
+	int fd;
+
+	retval = handle_to_path(mountdirfd, ufh, &path);
+	if (retval)
+		return retval;
+
+	fd = get_unused_fd_flags(open_flag);
+	if (fd < 0) {
+		path_put(&path);
+		return fd;
+	}
+	file = file_open_root(path.dentry, path.mnt, "", open_flag);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		retval =  PTR_ERR(file);
+	} else {
+		retval = fd;
+		fsnotify_open(file);
+		fd_install(fd, file);
+	}
+	path_put(&path);
+	return retval;
+}
+
+/**
+ * sys_open_by_handle_at: Open the file handle
+ * @mountdirfd: directory file descriptor
+ * @handle: file handle to be opened
+ * @flag: open flags.
+ *
+ * @mountdirfd indicate the directory file descriptor
+ * of the mount point. file handle is decoded relative
+ * to the vfsmount pointed by the @mountdirfd. @flags
+ * value is same as the open(2) flags.
+ */
+SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+		struct file_handle __user *, handle,
+		int, flags)
+{
+	long ret;
+
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	ret = do_handle_open(mountdirfd, handle, flags);
+	return ret;
+}
diff --git a/fs/internal.h b/fs/internal.h
index 52abc52..f3d15de 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -117,6 +117,9 @@ extern struct file *do_filp_open(int dfd, const char *pathname,
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *, int lookup_flags);
 
+extern long do_handle_open(int mountdirfd,
+			   struct file_handle __user *ufh, int open_flag);
+
 /*
  * inode.c
  */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 9701126..2d9b79c0 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -836,4 +836,7 @@ asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
 asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
 				      struct file_handle __user *handle,
 				      int __user *mnt_id, int flag);
+asmlinkage long sys_open_by_handle_at(int mountdirfd,
+				      struct file_handle __user *handle,
+				      int flags);
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 4e01343..25cc41c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -189,3 +189,5 @@ cond_syscall(sys_fanotify_mark);
 
 /* open by handle */
 cond_syscall(sys_name_to_handle_at);
+cond_syscall(sys_open_by_handle_at);
+cond_syscall(compat_sys_open_by_handle_at);
-- 
cgit v0.10.2


From aae8a97d3ec30788790d1720b71d76fd8eb44b73 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:27 +0530
Subject: fs: Don't allow to create hardlink for deleted file

Add inode->i_nlink == 0 check in VFS. Some of the file systems
do this internally. A followup patch will remove those instance.
This is needed to ensure that with link by handle we don't allow
to create hardlink of an unlinked file. The check also prevent a race
between unlink and link

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 83e92ba..33be51a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2906,7 +2906,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		return error;
 
 	mutex_lock(&inode->i_mutex);
-	error = dir->i_op->link(old_dentry, dir, new_dentry);
+	/* Make sure we don't allow creating hardlink to an unlinked file */
+	if (inode->i_nlink == 0)
+		error =  -ENOENT;
+	else
+		error = dir->i_op->link(old_dentry, dir, new_dentry);
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
 		fsnotify_link(dir, inode, new_dentry);
-- 
cgit v0.10.2


From f17b6042073e7000a90063f7edbca59a5bd1caa2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:30 +0530
Subject: fs: Remove i_nlink check from file system link callback

Now that VFS check for inode->i_nlink == 0 and returns proper
error, remove similar check from file system

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0efdb65..c23f050 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4806,9 +4806,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	int err;
 	int drop_inode = 0;
 
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 	/* do not allow sys_link's with other subvols of the same device */
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EPERM;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71..561f692 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
 
 	dquot_initialize(dir);
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 */
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390..e781b7e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry,
 
 	dquot_initialize(dir);
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 */
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 5a2b269..3f04a18 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
 	if (ip->i_nlink == JFS_LINK_MAX)
 		return -EMLINK;
 
-	if (ip->i_nlink == 0)
-		return -ENOENT;
-
 	dquot_initialize(dir);
 
 	tid = txBegin(ip->i_sb, 0);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 68fdf45..4b2eb56 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1122,10 +1122,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 		reiserfs_write_unlock(dir->i_sb);
 		return -EMLINK;
 	}
-	if (inode->i_nlink == 0) {
-		reiserfs_write_unlock(dir->i_sb);
-		return -ENOENT;
-	}
 
 	/* inc before scheduling so reiserfs_unlink knows we are here */
 	inc_nlink(inode);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b6..7217d67 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 *
-	 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
-	 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
-	 * lock 'dirA->i_mutex', so this is possible. Both of the functions
-	 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
-	 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
-	 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
-	 * to the list of orphans. After this, 'vfs_link()' will link
-	 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
-	 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
-	 * to the list of orphans.
-	 */
-	 if (inode->i_nlink == 0)
-		 return -ENOENT;
-
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
-- 
cgit v0.10.2


From 7dadb755b082c259f7dd4a95a3a6eb21646a28d5 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:35 +0530
Subject: x86: Add new syscalls for x86_32

This patch adds new syscalls to x86_32

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index b766a5e..f4c4973 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -346,10 +346,12 @@
 #define __NR_fanotify_init	338
 #define __NR_fanotify_mark	339
 #define __NR_prlimit64		340
+#define __NR_name_to_handle_at	341
+#define __NR_open_by_handle_at  342
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 341
+#define NR_syscalls 343
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index b35786d..c314b21 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,3 +340,5 @@ ENTRY(sys_call_table)
 	.long sys_fanotify_init
 	.long sys_fanotify_mark
 	.long sys_prlimit64		/* 340 */
+	.long sys_name_to_handle_at
+	.long sys_open_by_handle_at
-- 
cgit v0.10.2


From 6aae5f2b2085c5c90964bb78676ea8a6a336e037 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:37 +0530
Subject: x86: Add new syscalls for x86_64

This patch add new syscalls to x86_64

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99..98d353e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -851,4 +851,6 @@ ia32_sys_call_table:
 	.quad sys_fanotify_init
 	.quad sys32_fanotify_mark
 	.quad sys_prlimit64		/* 340 */
+	.quad sys_name_to_handle_at
+	.quad compat_sys_open_by_handle_at
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 363e9b8..81a3d5b 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -669,6 +669,10 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
 __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
 #define __NR_prlimit64				302
 __SYSCALL(__NR_prlimit64, sys_prlimit64)
+#define __NR_name_to_handle_at			303
+__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
+#define __NR_open_by_handle_at			304
+__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
-- 
cgit v0.10.2


From a51571ccb8be1b88aea502ebba8350519682c16d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:38 +0530
Subject: unistd.h: Add new syscalls numbers to asm-generic

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index b969770..57af033 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -646,9 +646,13 @@ __SYSCALL(__NR_prlimit64, sys_prlimit64)
 __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
 #define __NR_fanotify_mark 263
 __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
+#define __NR_name_to_handle_at		264
+__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
+#define __NR_open_by_handle_at		265
+__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
 
 #undef __NR_syscalls
-#define __NR_syscalls 264
+#define __NR_syscalls 266
 
 /*
  * All syscalls below here should go away really,
-- 
cgit v0.10.2


From 93f1c20bc8cdb757be50566eff88d65c3b26881f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:38 +0530
Subject: vfs: Export file system uuid via /proc/<pid>/mountinfo

We add a per superblock uuid field. File systems should
update the uuid in the fill_super callback

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index d1edf26..dffe6f4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1002,6 +1002,18 @@ const struct seq_operations mounts_op = {
 	.show	= show_vfsmnt
 };
 
+static int uuid_is_nil(u8 *uuid)
+{
+	int i;
+	u8  *cp = (u8 *)uuid;
+
+	for (i = 0; i < 16; i++) {
+		if (*cp++)
+			return 0;
+	}
+	return 1;
+}
+
 static int show_mountinfo(struct seq_file *m, void *v)
 {
 	struct proc_mounts *p = m->private;
@@ -1040,6 +1052,10 @@ static int show_mountinfo(struct seq_file *m, void *v)
 	if (IS_MNT_UNBINDABLE(mnt))
 		seq_puts(m, " unbindable");
 
+	if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
+		/* print the uuid */
+		seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
+
 	/* Filesystem specific data */
 	seq_puts(m, " - ");
 	show_type(m, sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f64630..f2143e0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1408,6 +1408,7 @@ struct super_block {
 	wait_queue_head_t	s_wait_unfrozen;
 
 	char s_id[32];				/* Informational name */
+	u8 s_uuid[16];				/* UUID */
 
 	void 			*s_fs_info;	/* Filesystem private info */
 	fmode_t			s_mode;
-- 
cgit v0.10.2


From 03cb5f03dcb26846fcad345d8c15aae91579a53d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:39 +0530
Subject: ext3: Copy fs UUID to superblock.

File system UUID is made available to application
via  /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8..9cc19a1d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1936,6 +1936,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sb->s_qcop = &ext3_qctl_operations;
 	sb->dq_op = &ext3_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 	mutex_init(&sbi->s_resize_lock);
-- 
cgit v0.10.2


From f2fa2ffc2046fdc35f96366d1ec8675f4d578522 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:40 +0530
Subject: ext4: Copy fs UUID to superblock

File system UUID is made available to application
via  /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f6a318f..5977b35 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_qcop = &ext4_qctl_operations;
 	sb->dq_op = &ext4_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 	mutex_init(&sbi->s_resize_lock);
-- 
cgit v0.10.2


From 1abf0c718f15a56a0a435588d1b104c7a37dc9bd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 03:51:11 -0400
Subject: New kind of open files - "location only".

New flag for open(2) - O_PATH.  Semantics:
	* pathname is resolved, but the file itself is _NOT_ opened
as far as filesystem is concerned.
	* almost all operations on the resulting descriptors shall
fail with -EBADF.  Exceptions are:
	1) operations on descriptors themselves (i.e.
		close(), dup(), dup2(), dup3(), fcntl(fd, F_DUPFD),
		fcntl(fd, F_DUPFD_CLOEXEC, ...), fcntl(fd, F_GETFD),
		fcntl(fd, F_SETFD, ...))
	2) fcntl(fd, F_GETFL), for a common non-destructive way to
		check if descriptor is open
	3) "dfd" arguments of ...at(2) syscalls, i.e. the starting
		points of pathname resolution
	* closing such descriptor does *NOT* affect dnotify or
posix locks.
	* permissions are checked as usual along the way to file;
no permission checks are applied to the file itself.  Of course,
giving such thing to syscall will result in permission checks (at
the moment it means checking that starting point of ....at() is
a directory and caller has exec permissions on it).

fget() and fget_light() return NULL on such descriptors; use of
fget_raw() and fget_raw_light() is needed to get them.  That protects
existing code from dealing with those things.

There are two things still missing (they come in the next commits):
one is handling of symlinks (right now we refuse to open them that
way; see the next commit for semantics related to those) and another
is descriptor passing via SCM_RIGHTS datagrams.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/fcntl.c b/fs/fcntl.c
index cb10261..6c82e5b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
 	int ret = -EBADF;
-	struct file *file = fget(fildes);
+	struct file *file = fget_raw(fildes);
 
 	if (file) {
 		ret = get_unused_fd();
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	return err;
 }
 
+static int check_fcntl_cmd(unsigned cmd)
+{
+	switch (cmd) {
+	case F_DUPFD:
+	case F_DUPFD_CLOEXEC:
+	case F_GETFD:
+	case F_SETFD:
+	case F_GETFL:
+		return 1;
+	}
+	return 0;
+}
+
 SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {	
 	struct file *filp;
 	long err = -EBADF;
 
-	filp = fget(fd);
+	filp = fget_raw(fd);
 	if (!filp)
 		goto out;
 
+	if (unlikely(filp->f_mode & FMODE_PATH)) {
+		if (!check_fcntl_cmd(cmd)) {
+			fput(filp);
+			goto out;
+		}
+	}
+
 	err = security_file_fcntl(filp, cmd, arg);
 	if (err) {
 		fput(filp);
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 	long err;
 
 	err = -EBADF;
-	filp = fget(fd);
+	filp = fget_raw(fd);
 	if (!filp)
 		goto out;
 
+	if (unlikely(filp->f_mode & FMODE_PATH)) {
+		if (!check_fcntl_cmd(cmd)) {
+			fput(filp);
+			goto out;
+		}
+	}
+
 	err = security_file_fcntl(filp, cmd, arg);
 	if (err) {
 		fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+	BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
 		O_RDONLY	| O_WRONLY	| O_RDWR	|
 		O_CREAT		| O_EXCL	| O_NOCTTY	|
 		O_TRUNC		| O_APPEND	| /* O_NONBLOCK	| */
 		__O_SYNC	| O_DSYNC	| FASYNC	|
 		O_DIRECT	| O_LARGEFILE	| O_DIRECTORY	|
 		O_NOFOLLOW	| O_NOATIME	| O_CLOEXEC	|
-		__FMODE_EXEC
+		__FMODE_EXEC	| O_PATH
 		));
 
 	fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/file_table.c b/fs/file_table.c
index eb36b6b..3c16e1c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -276,11 +276,10 @@ struct file *fget(unsigned int fd)
 	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (file) {
-		if (!atomic_long_inc_not_zero(&file->f_count)) {
-			/* File object ref couldn't be taken */
-			rcu_read_unlock();
-			return NULL;
-		}
+		/* File object ref couldn't be taken */
+		if (file->f_mode & FMODE_PATH ||
+		    !atomic_long_inc_not_zero(&file->f_count))
+			file = NULL;
 	}
 	rcu_read_unlock();
 
@@ -289,6 +288,23 @@ struct file *fget(unsigned int fd)
 
 EXPORT_SYMBOL(fget);
 
+struct file *fget_raw(unsigned int fd)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	rcu_read_lock();
+	file = fcheck_files(files, fd);
+	if (file) {
+		/* File object ref couldn't be taken */
+		if (!atomic_long_inc_not_zero(&file->f_count))
+			file = NULL;
+	}
+	rcu_read_unlock();
+
+	return file;
+}
+
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
  *
@@ -313,6 +329,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
 	*fput_needed = 0;
 	if (atomic_read(&files->count) == 1) {
 		file = fcheck_files(files, fd);
+		if (file && (file->f_mode & FMODE_PATH))
+			file = NULL;
+	} else {
+		rcu_read_lock();
+		file = fcheck_files(files, fd);
+		if (file) {
+			if (!(file->f_mode & FMODE_PATH) &&
+			    atomic_long_inc_not_zero(&file->f_count))
+				*fput_needed = 1;
+			else
+				/* Didn't get the reference, someone's freed */
+				file = NULL;
+		}
+		rcu_read_unlock();
+	}
+
+	return file;
+}
+
+struct file *fget_raw_light(unsigned int fd, int *fput_needed)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	*fput_needed = 0;
+	if (atomic_read(&files->count) == 1) {
+		file = fcheck_files(files, fd);
 	} else {
 		rcu_read_lock();
 		file = fcheck_files(files, fd);
diff --git a/fs/namei.c b/fs/namei.c
index 33be51a..e1d9f90 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1544,7 +1544,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	} else {
 		struct dentry *dentry;
 
-		file = fget_light(dfd, &fput_needed);
+		file = fget_raw_light(dfd, &fput_needed);
 		retval = -EBADF;
 		if (!file)
 			goto out_fail;
diff --git a/fs/open.c b/fs/open.c
index 48afc5c..14a51de 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -669,11 +669,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 					int (*open)(struct inode *, struct file *),
 					const struct cred *cred)
 {
+	static const struct file_operations empty_fops = {};
 	struct inode *inode;
 	int error;
 
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
+
+	if (unlikely(f->f_flags & O_PATH))
+		f->f_mode = FMODE_PATH;
+
 	inode = dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
 		error = __get_file_write_access(inode, mnt);
@@ -687,9 +692,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	f->f_path.dentry = dentry;
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
-	f->f_op = fops_get(inode->i_fop);
 	file_sb_list_add(f, inode->i_sb);
 
+	if (unlikely(f->f_mode & FMODE_PATH)) {
+		f->f_op = &empty_fops;
+		return f;
+	}
+
+	f->f_op = fops_get(inode->i_fop);
+
 	error = security_dentry_open(f, cred);
 	if (error)
 		goto cleanup_all;
@@ -911,9 +922,18 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op)
 	if (flags & __O_SYNC)
 		flags |= O_DSYNC;
 
-	op->open_flag = flags;
+	/*
+	 * If we have O_PATH in the open flag. Then we
+	 * cannot have anything other than the below set of flags
+	 */
+	if (flags & O_PATH) {
+		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
+		acc_mode = 0;
+	} else {
+		acc_mode = MAY_OPEN | ACC_MODE(flags);
+	}
 
-	acc_mode = MAY_OPEN | ACC_MODE(flags);
+	op->open_flag = flags;
 
 	/* O_TRUNC implies we need access checks for write permissions */
 	if (flags & O_TRUNC)
@@ -926,7 +946,8 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op)
 
 	op->acc_mode = acc_mode;
 
-	op->intent = LOOKUP_OPEN;
+	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
+
 	if (flags & O_CREAT) {
 		op->intent |= LOOKUP_CREATE;
 		if (flags & O_EXCL)
@@ -1053,8 +1074,10 @@ int filp_close(struct file *filp, fl_owner_t id)
 	if (filp->f_op && filp->f_op->flush)
 		retval = filp->f_op->flush(filp, id);
 
-	dnotify_flush(filp, id);
-	locks_remove_posix(filp, id);
+	if (likely(!(filp->f_mode & FMODE_PATH))) {
+		dnotify_flush(filp, id);
+		locks_remove_posix(filp, id);
+	}
 	fput(filp);
 	return retval;
 }
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index 0fc16e3..84793c70 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -80,6 +80,10 @@
 #define O_SYNC		(__O_SYNC|O_DSYNC)
 #endif
 
+#ifndef O_PATH
+#define O_PATH		010000000
+#endif
+
 #ifndef O_NDELAY
 #define O_NDELAY	O_NONBLOCK
 #endif
diff --git a/include/linux/file.h b/include/linux/file.h
index e85baeb..21a7995 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -29,6 +29,8 @@ static inline void fput_light(struct file *file, int fput_needed)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_light(unsigned int fd, int *fput_needed);
+extern struct file *fget_raw(unsigned int fd);
+extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);
 extern void set_close_on_exec(unsigned int fd, int flag);
 extern void put_filp(struct file *);
 extern int alloc_fd(unsigned start, unsigned flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f2143e0..13df14e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -102,6 +102,9 @@ struct inodes_stat_t {
 /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
 #define FMODE_UNSIGNED_OFFSET	((__force fmode_t)0x2000)
 
+/* File is opened with O_PATH; almost nothing can be done with it */
+#define FMODE_PATH		((__force fmode_t)0x4000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
-- 
cgit v0.10.2


From bcda76524cd1fa32af748536f27f674a13e56700 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 16:42:14 -0400
Subject: Allow O_PATH for symlinks

At that point we can't do almost nothing with them.  They can be opened
with O_PATH, we can manipulate such descriptors with dup(), etc. and
we can see them in /proc/*/{fd,fdinfo}/*.

We can't (and won't be able to) follow /proc/*/fd/* symlinks for those;
there's simply not enough information for pathname resolution to go on
from such point - to resolve a symlink we need to know which directory
does it live in.

We will be able to do useful things with them after the next commit, though -
readlinkat() and fchownat() will be possible to use with dfd being an
O_PATH-opened symlink and empty relative pathname.  Combined with
open_by_handle() it'll give us a way to do realink-by-handle and
lchown-by-handle without messing with more redundant syscalls.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index e1d9f90..9d4f327 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -766,8 +766,14 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 		error = 0;
 		if (s)
 			error = __vfs_follow_link(nd, s);
-		else if (nd->last_type == LAST_BIND)
+		else if (nd->last_type == LAST_BIND) {
 			nd->flags |= LOOKUP_JUMPED;
+			if (nd->path.dentry->d_inode->i_op->follow_link) {
+				/* stepped on a _really_ weird one */
+				path_put(&nd->path);
+				error = -ELOOP;
+			}
+		}
 	}
 	return error;
 }
@@ -1954,6 +1960,10 @@ static int may_open(struct path *path, int acc_mode, int flag)
 	struct inode *inode = dentry->d_inode;
 	int error;
 
+	/* O_PATH? */
+	if (!acc_mode)
+		return 0;
+
 	if (!inode)
 		return -ENOENT;
 
@@ -2056,7 +2066,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	int open_flag = op->open_flag;
 	int will_truncate = open_flag & O_TRUNC;
 	int want_write = 0;
-	int skip_perm = 0;
+	int acc_mode = op->acc_mode;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2095,8 +2105,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	}
 
 	if (!(open_flag & O_CREAT)) {
+		int symlink_ok = 0;
 		if (nd->last.name[nd->last.len])
 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
+			symlink_ok = 1;
 		/* we _can_ be in RCU mode here */
 		error = do_lookup(nd, &nd->last, path, &inode);
 		if (error) {
@@ -2108,7 +2121,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			terminate_walk(nd);
 			return ERR_PTR(-ENOENT);
 		}
-		if (unlikely(inode->i_op->follow_link)) {
+		if (unlikely(inode->i_op->follow_link && !symlink_ok)) {
 			/* We drop rcu-walk here */
 			if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
 				return ERR_PTR(-ECHILD);
@@ -2175,7 +2188,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
 		will_truncate = 0;
-		skip_perm = 1;
+		acc_mode = MAY_OPEN;
 		error = security_path_mknod(&nd->path, dentry, mode, 0);
 		if (error)
 			goto exit_mutex_unlock;
@@ -2225,7 +2238,7 @@ ok:
 		want_write = 1;
 	}
 common:
-	error = may_open(&nd->path, skip_perm ? 0 : op->acc_mode, open_flag);
+	error = may_open(&nd->path, acc_mode, open_flag);
 	if (error)
 		goto exit;
 	filp = nameidata_to_filp(nd);
@@ -2358,7 +2371,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 
 	flags |= LOOKUP_ROOT;
 
-	if (dentry->d_inode->i_op->follow_link)
+	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
 
 	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
-- 
cgit v0.10.2


From 65cfc6722361570bfe255698d9cd4dccaf47570d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 15:56:26 -0400
Subject: readlinkat(), fchownat() and fstatat() with empty relative pathnames

For readlinkat() we simply allow empty pathname; it will fail unless
we have dfd equal to O_PATH-opened symlink, so we are outside of
POSIX scope here.  For fchownat() and fstatat() we allow AT_EMPTY_PATH;
let the caller explicitly ask for such behaviour.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/open.c b/fs/open.c
index 14a51de..3cac0bd 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -573,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
 {
 	struct path path;
 	int error = -EINVAL;
-	int follow;
+	int lookup_flags;
 
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
 		goto out;
 
-	follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
-	error = user_path_at(dfd, filename, follow, &path);
+	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
 		goto out;
 	error = mnt_want_write(path.mnt);
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf..9610391 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
 	int error = -EINVAL;
 	int lookup_flags = 0;
 
-	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+		      AT_EMPTY_PATH)) != 0)
 		goto out;
 
 	if (!(flag & AT_SYMLINK_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
 	if (flag & AT_NO_AUTOMOUNT)
 		lookup_flags |= LOOKUP_NO_AUTOMOUNT;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
 
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
 	if (bufsiz <= 0)
 		return -EINVAL;
 
-	error = user_path_at(dfd, pathname, 0, &path);
+	error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
 	if (!error) {
 		struct inode *inode = path.dentry->d_inode;
 
-- 
cgit v0.10.2


From 326be7b484843988afe57566b627fb7a70beac56 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 17:08:22 -0400
Subject: Allow passing O_PATH descriptors via SCM_RIGHTS datagrams

Just need to make sure that AF_UNIX garbage collector won't
confuse O_PATHed socket on filesystem for real AF_UNIX opened
socket.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/file_table.c b/fs/file_table.c
index 3c16e1c..74a9544 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -305,6 +305,8 @@ struct file *fget_raw(unsigned int fd)
 	return file;
 }
 
+EXPORT_SYMBOL(fget_raw);
+
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
  *
diff --git a/net/core/scm.c b/net/core/scm.c
index bbe4544..4c1ef02 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -95,7 +95,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		int fd = fdp[i];
 		struct file *file;
 
-		if (fd < 0 || !(file = fget(fd)))
+		if (fd < 0 || !(file = fget_raw(fd)))
 			return -EBADF;
 		*fpp++ = file;
 		fpl->count++;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index f89f83b..b6f4b99 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -104,7 +104,7 @@ struct sock *unix_get_socket(struct file *filp)
 	/*
 	 *	Socket ?
 	 */
-	if (S_ISSOCK(inode->i_mode)) {
+	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
 		struct socket *sock = SOCKET_I(inode);
 		struct sock *s = sock->sk;
 
-- 
cgit v0.10.2


From 25542c646afbf14c43fa7d2b443055cadb73b07a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 15 Mar 2011 09:57:37 +0800
Subject: x86, tlb, UV: Do small micro-optimization for
 native_flush_tlb_others()

native_flush_tlb_others() is called from:

 flush_tlb_current_task()
 flush_tlb_mm()
 flush_tlb_page()

All these functions disable preemption explicitly, so we can use
smp_processor_id() instead of get_cpu() and put_cpu().

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: Cliff Wickman <cpw@sgi.com>
LKML-Reference: <4D7EC791.4040003@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 55272d7..d6c0418 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -208,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 	if (is_uv_system()) {
 		unsigned int cpu;
 
-		cpu = get_cpu();
+		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
 		if (cpumask)
 			flush_tlb_others_ipi(cpumask, mm, va);
-		put_cpu();
 		return;
 	}
 	flush_tlb_others_ipi(cpumask, mm, va);
-- 
cgit v0.10.2


From 5e814dd597c42daeb8d2a276e64a6ec986ad0e2a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 15 Mar 2011 20:51:09 +0100
Subject: perf probe: Clean up probe_point_lazy_walker() return value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Newer compilers (gcc 4.6) complains about:

        return ret < 0 ?: 0;

For the following reason:

  util/probe-finder.c: In function ‘probe_point_lazy_walker’:
  util/probe-finder.c:1331:18: error: the omitted middle operand in ?: will always be ‘true’, suggest explicit middle operand [-Werror=parentheses]

And indeed the return value is a somewhat obscure (but correct) value
of 'true', so return 'ret' instead - this is cleaner and unconfuses
GCC as well.

Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 17f9c4a..194f9e2 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1328,7 +1328,7 @@ static int probe_point_lazy_walker(const char *fname, int lineno,
 	 * Continue if no error, because the lazy pattern will match
 	 * to other lines
 	 */
-	return ret < 0 ?: 0;
+	return ret < 0 ? ret : 0;
 }
 
 /* Find probe points from lazy pattern  */
-- 
cgit v0.10.2


From 11a7b371b64ef39fc5fb1b6f2218eef7c4d035e3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:42 +0530
Subject: fs: allow AT_EMPTY_PATH in linkat(), limit that to
 CAP_DAC_READ_SEARCH

We don't want to allow creation of private hardlinks by different application
using the fd passed to them via SCM_RIGHTS. So limit the null relative name
usage in linkat syscall to CAP_DAC_READ_SEARCH

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

diff --git a/fs/namei.c b/fs/namei.c
index 9d4f327..c9b7f5b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2945,15 +2945,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	struct dentry *new_dentry;
 	struct nameidata nd;
 	struct path old_path;
+	int how = 0;
 	int error;
 	char *to;
 
-	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
+	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
 		return -EINVAL;
+	/*
+	 * To use null names we require CAP_DAC_READ_SEARCH
+	 * This ensures that not everyone will be able to create
+	 * handlink using the passed filedescriptor.
+	 */
+	if (flags & AT_EMPTY_PATH) {
+		if (!capable(CAP_DAC_READ_SEARCH))
+			return -ENOENT;
+		how = LOOKUP_EMPTY;
+	}
+
+	if (flags & AT_SYMLINK_FOLLOW)
+		how |= LOOKUP_FOLLOW;
 
-	error = user_path_at(olddfd, oldname,
-			     flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
-			     &old_path);
+	error = user_path_at(olddfd, oldname, how, &old_path);
 	if (error)
 		return error;
 
-- 
cgit v0.10.2


From ce57dfc1791221ef58b6d6b8f5437fccefc4e187 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 19:58:58 -0400
Subject: pull handling of one pathname component into a helper

new helper: walk_component().  Handles everything except symlinks;
returns negative on error, 0 on success and 1 on symlinks we decided
to follow.  Drops out of RCU mode on such symlinks.

link_path_walk() and do_last() switched to using that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index c9b7f5b..549bbe2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -785,16 +785,11 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
  * Without that kind of total limit, nasty chains of consecutive
  * symlinks can cause almost arbitrarily long lookups. 
  */
-static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
+static inline int do_follow_link(struct path *path, struct nameidata *nd)
 {
 	void *cookie;
 	int err = -ELOOP;
 
-	/* We drop rcu-walk here */
-	if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-		return -ECHILD;
-	BUG_ON(inode != path->dentry->d_inode);
-
 	if (current->link_count >= MAX_NESTED_LINKS)
 		goto loop;
 	if (current->total_link_count >= 40)
@@ -1337,6 +1332,39 @@ static void terminate_walk(struct nameidata *nd)
 	}
 }
 
+static inline int walk_component(struct nameidata *nd, struct path *path,
+		struct qstr *name, int type, int follow)
+{
+	struct inode *inode;
+	int err;
+	/*
+	 * "." and ".." are special - ".." especially so because it has
+	 * to be able to know about the current root directory and
+	 * parent relationships.
+	 */
+	if (unlikely(type != LAST_NORM))
+		return handle_dots(nd, type);
+	err = do_lookup(nd, name, path, &inode);
+	if (unlikely(err)) {
+		terminate_walk(nd);
+		return err;
+	}
+	if (!inode) {
+		path_to_nameidata(path, nd);
+		terminate_walk(nd);
+		return -ENOENT;
+	}
+	if (unlikely(inode->i_op->follow_link) && follow) {
+		if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
+			return -ECHILD;
+		BUG_ON(inode != path->dentry->d_inode);
+		return 1;
+	}
+	path_to_nameidata(path, nd);
+	nd->inode = inode;
+	return 0;
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1361,7 +1389,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
-		struct inode *inode;
 		unsigned long hash;
 		struct qstr this;
 		unsigned int c;
@@ -1414,34 +1441,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (!*name)
 			goto last_with_slashes;
 
-		/*
-		 * "." and ".." are special - ".." especially so because it has
-		 * to be able to know about the current root directory and
-		 * parent relationships.
-		 */
-		if (unlikely(type != LAST_NORM)) {
-			if (handle_dots(nd, type))
-				return -ECHILD;
-			continue;
-		}
-
-		/* This does the actual lookups.. */
-		err = do_lookup(nd, &this, &next, &inode);
-		if (err)
-			break;
+		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
+		if (err < 0)
+			return err;
 
-		if (inode && inode->i_op->follow_link) {
-			err = do_follow_link(inode, &next, nd);
+		if (err) {
+			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
 			nd->inode = nd->path.dentry->d_inode;
-		} else {
-			path_to_nameidata(&next, nd);
-			nd->inode = inode;
 		}
-		err = -ENOENT;
-		if (!nd->inode)
-			break;
 		err = -ENOTDIR; 
 		if (!nd->inode->i_op->lookup)
 			break;
@@ -1453,36 +1462,27 @@ last_with_slashes:
 last_component:
 		/* Clear LOOKUP_CONTINUE iff it was previously unset */
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-		if (lookup_flags & LOOKUP_PARENT)
-			goto lookup_parent;
-		if (unlikely(type != LAST_NORM))
-			return handle_dots(nd, type);
-		err = do_lookup(nd, &this, &next, &inode);
-		if (err)
-			break;
-		if (inode && unlikely(inode->i_op->follow_link) &&
-		    (lookup_flags & LOOKUP_FOLLOW)) {
-			err = do_follow_link(inode, &next, nd);
+		if (lookup_flags & LOOKUP_PARENT) {
+			nd->last = this;
+			nd->last_type = type;
+			return 0;
+		}
+		err = walk_component(nd, &next, &this, type,
+					lookup_flags & LOOKUP_FOLLOW);
+		if (err < 0)
+			return err;
+		if (err) {
+			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
 			nd->inode = nd->path.dentry->d_inode;
-		} else {
-			path_to_nameidata(&next, nd);
-			nd->inode = inode;
 		}
-		err = -ENOENT;
-		if (!nd->inode)
-			break;
 		if (lookup_flags & LOOKUP_DIRECTORY) {
 			err = -ENOTDIR; 
 			if (!nd->inode->i_op->lookup)
 				break;
 		}
 		return 0;
-lookup_parent:
-		nd->last = this;
-		nd->last_type = type;
-		return 0;
 	}
 	terminate_walk(nd);
 	return err;
@@ -2068,7 +2068,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	int want_write = 0;
 	int acc_mode = op->acc_mode;
 	struct file *filp;
-	struct inode *inode;
 	int error;
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -2111,24 +2110,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
 			symlink_ok = 1;
 		/* we _can_ be in RCU mode here */
-		error = do_lookup(nd, &nd->last, path, &inode);
-		if (error) {
-			terminate_walk(nd);
+		error = walk_component(nd, path, &nd->last, LAST_NORM,
+					!symlink_ok);
+		if (error < 0)
 			return ERR_PTR(error);
-		}
-		if (!inode) {
-			path_to_nameidata(path, nd);
-			terminate_walk(nd);
-			return ERR_PTR(-ENOENT);
-		}
-		if (unlikely(inode->i_op->follow_link && !symlink_ok)) {
-			/* We drop rcu-walk here */
-			if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-				return ERR_PTR(-ECHILD);
+		if (error) /* symlink */
 			return NULL;
-		}
-		path_to_nameidata(path, nd);
-		nd->inode = inode;
 		/* sayonara */
 		if (nd->flags & LOOKUP_RCU) {
 			if (nameidata_drop_rcu_last(nd))
@@ -2137,7 +2124,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 		error = -ENOTDIR;
 		if (nd->flags & LOOKUP_DIRECTORY) {
-			if (!inode->i_op->lookup)
+			if (!nd->inode->i_op->lookup)
 				goto exit;
 		}
 		audit_inode(pathname, nd->path.dentry);
-- 
cgit v0.10.2


From b21041d0f72899ed815bd2cbf7275339c74737b6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 20:01:51 -0400
Subject: update nd->inode in __do_follow_link() instead of after
 do_follow_link()

... and note that we only need to do it for LAST_BIND symlinks

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 549bbe2..9e7b18a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -768,7 +768,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 			error = __vfs_follow_link(nd, s);
 		else if (nd->last_type == LAST_BIND) {
 			nd->flags |= LOOKUP_JUMPED;
-			if (nd->path.dentry->d_inode->i_op->follow_link) {
+			nd->inode = nd->path.dentry->d_inode;
+			if (nd->inode->i_op->follow_link) {
 				/* stepped on a _really_ weird one */
 				path_put(&nd->path);
 				error = -ELOOP;
@@ -1449,7 +1450,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
-			nd->inode = nd->path.dentry->d_inode;
 		}
 		err = -ENOTDIR; 
 		if (!nd->inode->i_op->lookup)
@@ -1475,7 +1475,6 @@ last_component:
 			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
-			nd->inode = nd->path.dentry->d_inode;
 		}
 		if (lookup_flags & LOOKUP_DIRECTORY) {
 			err = -ENOTDIR; 
-- 
cgit v0.10.2


From bd92d7fed877ed1e6997e4f3f13dbcd872947653 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 19:54:59 -0400
Subject: Make trailing symlink resolution in path_lookupat() iterative

Now the only caller of link_path_walk() that does *not* pass
LOOKUP_PARENT is do_follow_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 9e7b18a..a343163 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1588,12 +1588,23 @@ out_fail:
 	return retval;
 }
 
+static inline int lookup_last(struct nameidata *nd, struct path *path)
+{
+	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
+		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+	nd->flags &= ~LOOKUP_PARENT;
+	return walk_component(nd, path, &nd->last, nd->last_type,
+					nd->flags & LOOKUP_FOLLOW);
+}
+
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
 static int path_lookupat(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
 	struct file *base = NULL;
-	int retval;
+	struct path path;
+	int err;
 
 	/*
 	 * Path walking is largely split up into 2 different synchronisation
@@ -1609,23 +1620,55 @@ static int path_lookupat(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	retval = path_init(dfd, name, flags, nd, &base);
+	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
 
-	if (unlikely(retval))
-		return retval;
+	if (unlikely(err))
+		return err;
 
 	current->total_link_count = 0;
-	retval = link_path_walk(name, nd);
+	err = link_path_walk(name, nd);
+
+	if (!err && !(flags & LOOKUP_PARENT)) {
+		int count = 0;
+		err = lookup_last(nd, &path);
+		while (err > 0) {
+			void *cookie;
+			struct path link = path;
+			struct inode *inode = link.dentry->d_inode;
+
+			if (count++ > 32) {
+				path_put_conditional(&path, nd);
+				path_put(&nd->path);
+				err = -ELOOP;
+				break;
+			}
+			cond_resched();
+			nd->flags |= LOOKUP_PARENT;
+			err = __do_follow_link(&link, nd, &cookie);
+			if (!err)
+				err = lookup_last(nd, &path);
+			if (!IS_ERR(cookie) && inode->i_op->put_link)
+				inode->i_op->put_link(link.dentry, nd, cookie);
+			path_put(&link);
+		}
+	}
 
 	if (nd->flags & LOOKUP_RCU) {
 		/* went all way through without dropping RCU */
-		BUG_ON(retval);
+		BUG_ON(err);
 		if (nameidata_drop_rcu_last(nd))
-			retval = -ECHILD;
+			err = -ECHILD;
 	}
 
-	if (!retval)
-		retval = handle_reval_path(nd);
+	if (!err)
+		err = handle_reval_path(nd);
+
+	if (!err && nd->flags & LOOKUP_DIRECTORY) {
+		if (!nd->inode->i_op->lookup) {
+			path_put(&nd->path);
+			return -ENOTDIR;
+		}
+	}
 
 	if (base)
 		fput(base);
@@ -1634,7 +1677,7 @@ static int path_lookupat(int dfd, const char *name,
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
-	return retval;
+	return err;
 }
 
 static int do_path_lookup(int dfd, const char *name,
-- 
cgit v0.10.2


From ce0525449da56444948c368f52e10f3db0465338 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 21:28:04 -0400
Subject: simplify link_path_walk() tail

Now that link_path_walk() is called without LOOKUP_PARENT
only from do_follow_link(), we can simplify the checks in
last component handling.  First of all, checking if we'd
arrived to a directory is not needed - the caller will check
it anyway.  And LOOKUP_FOLLOW is guaranteed to be there,
since we only get to that place with nd->depth > 0.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index a343163..9575d00 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1467,8 +1467,7 @@ last_component:
 			nd->last_type = type;
 			return 0;
 		}
-		err = walk_component(nd, &next, &this, type,
-					lookup_flags & LOOKUP_FOLLOW);
+		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
 		if (err < 0)
 			return err;
 		if (err) {
@@ -1476,11 +1475,6 @@ last_component:
 			if (err)
 				return err;
 		}
-		if (lookup_flags & LOOKUP_DIRECTORY) {
-			err = -ENOTDIR; 
-			if (!nd->inode->i_op->lookup)
-				break;
-		}
 		return 0;
 	}
 	terminate_walk(nd);
-- 
cgit v0.10.2


From b356379a020bb7197603118bb1cbc903963aa198 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 21:54:55 -0400
Subject: Turn resolution of trailing symlinks iterative everywhere

The last remaining place (resolution of nested symlink) converted
to the loop of the same kind we have in path_lookupat() and
path_openat().

Note that we still *do* have a recursion in pathname resolution;
can't avoid it, really.  However, it's strictly for nested symlinks
now - i.e. ones in the middle of a pathname.

link_path_walk() has lost the tail now - it always walks everything
except the last component.

do_follow_link() renamed to nested_symlink() and moved down.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 9575d00..017c3fa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -779,40 +779,6 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 	return error;
 }
 
-/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups. 
- */
-static inline int do_follow_link(struct path *path, struct nameidata *nd)
-{
-	void *cookie;
-	int err = -ELOOP;
-
-	if (current->link_count >= MAX_NESTED_LINKS)
-		goto loop;
-	if (current->total_link_count >= 40)
-		goto loop;
-	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-	cond_resched();
-	current->link_count++;
-	current->total_link_count++;
-	nd->depth++;
-	err = __do_follow_link(path, nd, &cookie);
-	if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
-		path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
-	path_put(path);
-	current->link_count--;
-	nd->depth--;
-	return err;
-loop:
-	path_put_conditional(path, nd);
-	path_put(&nd->path);
-	return err;
-}
-
 static int follow_up_rcu(struct path *path)
 {
 	struct vfsmount *parent;
@@ -1367,6 +1333,52 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 }
 
 /*
+ * This limits recursive symlink follows to 8, while
+ * limiting consecutive symlinks to 40.
+ *
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+static inline int nested_symlink(struct path *path, struct nameidata *nd)
+{
+	int res;
+
+	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
+	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
+		path_put_conditional(path, nd);
+		path_put(&nd->path);
+		return -ELOOP;
+	}
+
+	nd->depth++;
+	current->link_count++;
+
+	do {
+		struct path link = *path;
+		void *cookie;
+		if (unlikely(current->total_link_count >= 40)) {
+			path_put_conditional(path, nd);
+			path_put(&nd->path);
+			res = -ELOOP;
+			break;
+		}
+		cond_resched();
+		current->total_link_count++;
+		res = __do_follow_link(&link, nd, &cookie);
+		if (!res)
+			res = walk_component(nd, path, &nd->last,
+					     nd->last_type, LOOKUP_FOLLOW);
+		if (!IS_ERR(cookie) && link.dentry->d_inode->i_op->put_link)
+			link.dentry->d_inode->i_op->put_link(link.dentry, nd, cookie);
+		path_put(&link);
+	} while (res > 0);
+
+	current->link_count--;
+	nd->depth--;
+	return res;
+}
+
+/*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
  * the final dentry. We expect 'base' to be positive and a directory.
@@ -1385,9 +1397,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	if (!*name)
 		return 0;
 
-	if (nd->depth)
-		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
-
 	/* At this point we know we have a real path component. */
 	for(;;) {
 		unsigned long hash;
@@ -1440,14 +1449,14 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			goto last_component;
 		while (*++name == '/');
 		if (!*name)
-			goto last_with_slashes;
+			goto last_component;
 
 		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
 		if (err < 0)
 			return err;
 
 		if (err) {
-			err = do_follow_link(&next, nd);
+			err = nested_symlink(&next, nd);
 			if (err)
 				return err;
 		}
@@ -1457,24 +1466,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		continue;
 		/* here ends the main loop */
 
-last_with_slashes:
-		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 last_component:
 		/* Clear LOOKUP_CONTINUE iff it was previously unset */
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-		if (lookup_flags & LOOKUP_PARENT) {
-			nd->last = this;
-			nd->last_type = type;
-			return 0;
-		}
-		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
-		if (err < 0)
-			return err;
-		if (err) {
-			err = do_follow_link(&next, nd);
-			if (err)
-				return err;
-		}
+		nd->last = this;
+		nd->last_type = type;
 		return 0;
 	}
 	terminate_walk(nd);
-- 
cgit v0.10.2


From 574197e0de46a8a4db5c54ef7b65e43ffa8873a7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 22:20:34 -0400
Subject: tidy the trailing symlinks traversal up

* pull the handling of current->total_link_count into
__do_follow_link()
* put the common "do ->put_link() if needed and path_put() the link"
  stuff into a helper (put_link(nd, link, cookie))
* rename __do_follow_link() to follow_link(), while we are at it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 017c3fa..0a601ca 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -737,14 +737,31 @@ static inline void path_to_nameidata(const struct path *path,
 	nd->path.dentry = path->dentry;
 }
 
+static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+{
+	struct inode *inode = link->dentry->d_inode;
+	if (!IS_ERR(cookie) && inode->i_op->put_link)
+		inode->i_op->put_link(link->dentry, nd, cookie);
+	path_put(link);
+}
+
 static __always_inline int
-__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
+follow_link(struct path *link, struct nameidata *nd, void **p)
 {
 	int error;
 	struct dentry *dentry = link->dentry;
 
 	BUG_ON(nd->flags & LOOKUP_RCU);
 
+	if (unlikely(current->total_link_count >= 40)) {
+		*p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
+		path_put_conditional(link, nd);
+		path_put(&nd->path);
+		return -ELOOP;
+	}
+	cond_resched();
+	current->total_link_count++;
+
 	touch_atime(link->mnt, dentry);
 	nd_set_link(nd, NULL);
 
@@ -1356,21 +1373,12 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
 	do {
 		struct path link = *path;
 		void *cookie;
-		if (unlikely(current->total_link_count >= 40)) {
-			path_put_conditional(path, nd);
-			path_put(&nd->path);
-			res = -ELOOP;
-			break;
-		}
-		cond_resched();
-		current->total_link_count++;
-		res = __do_follow_link(&link, nd, &cookie);
+
+		res = follow_link(&link, nd, &cookie);
 		if (!res)
 			res = walk_component(nd, path, &nd->last,
 					     nd->last_type, LOOKUP_FOLLOW);
-		if (!IS_ERR(cookie) && link.dentry->d_inode->i_op->put_link)
-			link.dentry->d_inode->i_op->put_link(link.dentry, nd, cookie);
-		path_put(&link);
+		put_link(nd, &link, cookie);
 	} while (res > 0);
 
 	current->link_count--;
@@ -1619,27 +1627,15 @@ static int path_lookupat(int dfd, const char *name,
 	err = link_path_walk(name, nd);
 
 	if (!err && !(flags & LOOKUP_PARENT)) {
-		int count = 0;
 		err = lookup_last(nd, &path);
 		while (err > 0) {
 			void *cookie;
 			struct path link = path;
-			struct inode *inode = link.dentry->d_inode;
-
-			if (count++ > 32) {
-				path_put_conditional(&path, nd);
-				path_put(&nd->path);
-				err = -ELOOP;
-				break;
-			}
-			cond_resched();
 			nd->flags |= LOOKUP_PARENT;
-			err = __do_follow_link(&link, nd, &cookie);
+			err = follow_link(&link, nd, &cookie);
 			if (!err)
 				err = lookup_last(nd, &path);
-			if (!IS_ERR(cookie) && inode->i_op->put_link)
-				inode->i_op->put_link(link.dentry, nd, cookie);
-			path_put(&link);
+			put_link(nd, &link, cookie);
 		}
 	}
 
@@ -2298,7 +2294,6 @@ static struct file *path_openat(int dfd, const char *pathname,
 	struct file *base = NULL;
 	struct file *filp;
 	struct path path;
-	int count = 0;
 	int error;
 
 	filp = get_empty_filp();
@@ -2322,35 +2317,21 @@ static struct file *path_openat(int dfd, const char *pathname,
 	filp = do_last(nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
-		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) {
+		if (!(nd->flags & LOOKUP_FOLLOW)) {
 			path_put_conditional(&path, nd);
 			path_put(&nd->path);
 			filp = ERR_PTR(-ELOOP);
 			break;
 		}
-		/*
-		 * This is subtle. Instead of calling do_follow_link() we do
-		 * the thing by hands. The reason is that this way we have zero
-		 * link_count and path_walk() (called from ->follow_link)
-		 * honoring LOOKUP_PARENT.  After that we have the parent and
-		 * last component, i.e. we are in the same situation as after
-		 * the first path_walk().  Well, almost - if the last component
-		 * is normal we get its copy stored in nd->last.name and we will
-		 * have to putname() it when we are done. Procfs-like symlinks
-		 * just set LAST_BIND.
-		 */
 		nd->flags |= LOOKUP_PARENT;
 		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-		error = __do_follow_link(&link, nd, &cookie);
+		error = follow_link(&link, nd, &cookie);
 		if (unlikely(error))
 			filp = ERR_PTR(error);
 		else
 			filp = do_last(nd, &path, op, pathname);
-		if (!IS_ERR(cookie) && linki->i_op->put_link)
-			linki->i_op->put_link(link.dentry, nd, cookie);
-		path_put(&link);
+		put_link(nd, &link, cookie);
 	}
 out:
 	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
-- 
cgit v0.10.2


From c826cb7dfce80512c26c984350077a25046bd215 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 15 Mar 2011 15:29:21 -0700
Subject: dcache.c: create helper function for duplicated functionality

This creates a helper function for he "try to ascend into the parent
directory" case, which was written out in triplicate before.  With all
the locking and subtle sequence number stuff, we really don't want to
duplicate that kind of code.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/dcache.c b/fs/dcache.c
index 611ffe9..361882a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1012,6 +1012,34 @@ void shrink_dcache_for_umount(struct super_block *sb)
 }
 
 /*
+ * This tries to ascend one level of parenthood, but
+ * we can race with renaming, so we need to re-check
+ * the parenthood after dropping the lock and check
+ * that the sequence number still matches.
+ */
+static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
+{
+	struct dentry *new = old->d_parent;
+
+	rcu_read_lock();
+	spin_unlock(&old->d_lock);
+	spin_lock(&new->d_lock);
+
+	/*
+	 * might go back up the wrong parent if we have had a rename
+	 * or deletion
+	 */
+	if (new != old->d_parent ||
+		 (!locked && read_seqretry(&rename_lock, seq))) {
+		spin_unlock(&new->d_lock);
+		new = NULL;
+	}
+	rcu_read_unlock();
+	return new;
+}
+
+
+/*
  * Search for at least 1 mount point in the dentry's subdirs.
  * We descend to the next level whenever the d_subdirs
  * list is non-empty and continue searching.
@@ -1066,24 +1094,10 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-			 (!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		struct dentry *child = this_parent;
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
@@ -1181,24 +1195,10 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-			(!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		struct dentry *child = this_parent;
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
@@ -2942,28 +2942,14 @@ resume:
 		spin_unlock(&dentry->d_lock);
 	}
 	if (this_parent != root) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
+		struct dentry *child = this_parent;
 		if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
 			this_parent->d_flags |= DCACHE_GENOCIDE;
 			this_parent->d_count--;
 		}
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-			 (!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
-- 
cgit v0.10.2


From c83ce989cb5ff86575821992ea82c4df5c388ebc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 15 Mar 2011 13:36:43 -0400
Subject: VFS: Fix the nfs sillyrename regression in kernel 2.6.38

The new vfs locking scheme introduced in 2.6.38 breaks NFS sillyrename
because the latter relies on being able to determine the parent
directory of the dentry in the ->iput() callback in order to send the
appropriate unlink rpc call.

Looking at the code that cares about races with dput(), there doesn't
seem to be anything that specifically uses d_parent as a test for
whether or not there is a race:
  - __d_lookup_rcu(), __d_lookup() all test for d_hashed() after d_parent
  - shrink_dcache_for_umount() is safe since nothing else can rearrange
    the dentries in that super block.
  - have_submount(), select_parent() and d_genocide() can test for a
    deletion if we set the DCACHE_DISCONNECTED flag when the dentry
    is removed from the parent's d_subdirs list.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org (2.6.38, needs commit c826cb7dfce8 "dcache.c:
	create helper function for duplicated functionality" )
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/dcache.c b/fs/dcache.c
index 361882a..a39fe47 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	__releases(parent->d_lock)
 	__releases(dentry->d_inode->i_lock)
 {
-	dentry->d_parent = NULL;
 	list_del(&dentry->d_u.d_child);
+	/*
+	 * Inform try_to_ascend() that we are no longer attached to the
+	 * dentry tree
+	 */
+	dentry->d_flags |= DCACHE_DISCONNECTED;
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	dentry_iput(dentry);
@@ -1030,6 +1034,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
 	 * or deletion
 	 */
 	if (new != old->d_parent ||
+		 (old->d_flags & DCACHE_DISCONNECTED) ||
 		 (!locked && read_seqretry(&rename_lock, seq))) {
 		spin_unlock(&new->d_lock);
 		new = NULL;
-- 
cgit v0.10.2


From 0e794589e588a88d34e339feee50c72606fb21a7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 02:45:02 -0400
Subject: fix follow_link() breakage

commit 574197e0de46a8a4db5c54ef7b65e43ffa8873a7 had a missing
piece, breaking the loop detection ;-/

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 0a601ca..b912b7a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -753,9 +753,11 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 
 	BUG_ON(nd->flags & LOOKUP_RCU);
 
+	if (link->mnt == nd->path.mnt)
+		mntget(link->mnt);
+
 	if (unlikely(current->total_link_count >= 40)) {
 		*p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
-		path_put_conditional(link, nd);
 		path_put(&nd->path);
 		return -ELOOP;
 	}
@@ -765,9 +767,6 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 	touch_atime(link->mnt, dentry);
 	nd_set_link(nd, NULL);
 
-	if (link->mnt == nd->path.mnt)
-		mntget(link->mnt);
-
 	error = security_inode_follow_link(link->dentry, nd);
 	if (error) {
 		*p = ERR_PTR(error); /* no ->put_link(), please */
-- 
cgit v0.10.2


From 5229645bdc35f1cc43eb8b25b6993c8fa58b4b43 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 16 Mar 2011 18:09:27 +1100
Subject: vfs: add nonconflicting values for O_PATH

[AV: on architectures where default conflicts with existing
flags, that is]

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/arch/alpha/include/asm/fcntl.h b/arch/alpha/include/asm/fcntl.h
index 70145cb..1b71ca7 100644
--- a/arch/alpha/include/asm/fcntl.h
+++ b/arch/alpha/include/asm/fcntl.h
@@ -31,6 +31,8 @@
 #define __O_SYNC	020000000
 #define O_SYNC		(__O_SYNC|O_DSYNC)
 
+#define O_PATH		040000000
+
 #define F_GETLK		7
 #define F_SETLK		8
 #define F_SETLKW	9
diff --git a/arch/parisc/include/asm/fcntl.h b/arch/parisc/include/asm/fcntl.h
index f357fc6..0304b92 100644
--- a/arch/parisc/include/asm/fcntl.h
+++ b/arch/parisc/include/asm/fcntl.h
@@ -19,6 +19,8 @@
 #define O_NOFOLLOW	000000200 /* don't follow links */
 #define O_INVISIBLE	004000000 /* invisible I/O, for DMAPI/XDSM */
 
+#define O_PATH		020000000
+
 #define F_GETLK64	8
 #define F_SETLK64	9
 #define F_SETLKW64	10
diff --git a/arch/sparc/include/asm/fcntl.h b/arch/sparc/include/asm/fcntl.h
index 38f37b33..d0b83f6 100644
--- a/arch/sparc/include/asm/fcntl.h
+++ b/arch/sparc/include/asm/fcntl.h
@@ -34,6 +34,8 @@
 #define __O_SYNC	0x800000
 #define O_SYNC		(__O_SYNC|O_DSYNC)
 
+#define O_PATH		0x1000000
+
 #define F_GETOWN	5	/*  for sockets. */
 #define F_SETOWN	6	/*  for sockets. */
 #define F_GETLK		7
-- 
cgit v0.10.2


From bab1d9444d9a147f1dc3478dd06c16f490227f3e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 15 Mar 2011 21:51:24 +0100
Subject: prune back iprune_sem

iprune_sem is continously giving us lockdep warnings because we do take it in
read mode in the reclaim path, but we're also doing non-NOFS allocations under
it taken in write mode.

Taking a bit deeper look at it I think it's fixable quite trivially:

 - for invalidate_inodes we do not need iprune_sem at all.  We have an active
   reference on the superblock, so the filesystem is not going away until it
   has finished.
 - for evict_inodes we do need it, to make sure prune_icache has done it's
   work before we tear down the superblock.  But there is no reason to
   hold it over the actual reclaim operation - it's enough to cycle through
   it after the actual reclaim to make sure we wait for any pending
   prune_icache to complete.  We just have to remove the WARN_ON for
   otherwise busy inodes as they can actually happen now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/inode.c b/fs/inode.c
index 0647d80..9910c03 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -84,16 +84,13 @@ static struct hlist_head *inode_hashtable __read_mostly;
 DEFINE_SPINLOCK(inode_lock);
 
 /*
- * iprune_sem provides exclusion between the kswapd or try_to_free_pages
- * icache shrinking path, and the umount path.  Without this exclusion,
- * by the time prune_icache calls iput for the inode whose pages it has
- * been invalidating, or by the time it calls clear_inode & destroy_inode
- * from its final dispose_list, the struct super_block they refer to
- * (for inode->i_sb->s_op) may already have been freed and reused.
+ * iprune_sem provides exclusion between the icache shrinking and the
+ * umount path.
  *
- * We make this an rwsem because the fastpath is icache shrinking. In
- * some cases a filesystem may be doing a significant amount of work in
- * its inode reclaim code, so this should improve parallelism.
+ * We don't actually need it to protect anything in the umount path,
+ * but only need to cycle through it to make sure any inode that
+ * prune_icache took off the LRU list has been fully torn down by the
+ * time we are past evict_inodes.
  */
 static DECLARE_RWSEM(iprune_sem);
 
@@ -516,17 +513,12 @@ void evict_inodes(struct super_block *sb)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	down_write(&iprune_sem);
-
 	spin_lock(&inode_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (atomic_read(&inode->i_count))
 			continue;
-
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-			WARN_ON(1);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
 			continue;
-		}
 
 		inode->i_state |= I_FREEING;
 
@@ -542,6 +534,13 @@ void evict_inodes(struct super_block *sb)
 	spin_unlock(&inode_lock);
 
 	dispose_list(&dispose);
+
+	/*
+	 * Cycle through iprune_sem to make sure any inode that prune_icache
+	 * moved off the list before we took the lock has been fully torn
+	 * down.
+	 */
+	down_write(&iprune_sem);
 	up_write(&iprune_sem);
 }
 
@@ -561,8 +560,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	down_write(&iprune_sem);
-
 	spin_lock(&inode_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
@@ -590,7 +587,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	spin_unlock(&inode_lock);
 
 	dispose_list(&dispose);
-	up_write(&iprune_sem);
 
 	return busy;
 }
-- 
cgit v0.10.2


From 34d211a2d5df4984a35b18d8ccacbe1d10abb067 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 16 Mar 2011 08:04:07 -0700
Subject: Increase OSF partition limit from 8 to 18

It turns out that while a maximum of 8 partitions may be what people
"should" have had, you can actually fit up to 18 entries(*) in a sector.

And some people clearly were taking advantage of that, like Michael
Cree, who had ten partitions on one of his OSF disks.

(*) The OSF partition data starts at byte offset 64 in the first sector,
    and the array of 16-byte partition entries start at offset 148 in
    the on-disk partition structure.

Reported-by: Michael Cree <mcree@orcon.net.nz>
Cc: stable@kernel.org (v2.6.38)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index be03a0b..764b86a 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
 #include "check.h"
 #include "osf.h"
 
-#define MAX_OSF_PARTITIONS 8
+#define MAX_OSF_PARTITIONS 18
 
 int osf_partition(struct parsed_partitions *state)
 {
-- 
cgit v0.10.2