From 87121ca504fd1d963a66b3fb0c72054b0fd9a177 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 7 Oct 2011 16:31:46 +0200 Subject: oprofile: Fix crash when unloading module (hr timer mode) Oprofile may crash in a KVM guest while unlaoding modules. This happens if oprofile_arch_init() fails and oprofile switches to the hr timer mode as a fallback. In this case oprofile_arch_exit() is called, but it never was initialized properly which causes the crash. This patch fixes this. oprofile: using timer interrupt. BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] unregister_syscore_ops+0x41/0x58 PGD 41da3f067 PUD 41d80e067 PMD 0 Oops: 0002 [#1] PREEMPT SMP CPU 5 Modules linked in: oprofile(-) Pid: 2382, comm: modprobe Not tainted 3.1.0-rc7-00018-g709a39d #18 Advanced Micro Device Anaheim/Anaheim RIP: 0010:[] [] unregister_syscore_ops+0x41/0x58 RSP: 0018:ffff88041de1de98 EFLAGS: 00010296 RAX: 0000000000000000 RBX: ffffffffa00060e0 RCX: dead000000200200 RDX: 0000000000000000 RSI: dead000000100100 RDI: ffffffff8178c620 RBP: ffff88041de1dea8 R08: 0000000000000001 R09: 0000000000000082 R10: 0000000000000000 R11: ffff88041de1dde8 R12: 0000000000000080 R13: fffffffffffffff5 R14: 0000000000000001 R15: 0000000000610210 FS: 00007f9ae5bef700(0000) GS:ffff88042fd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 000000041ca44000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 2382, threadinfo ffff88041de1c000, task ffff88042db6d040) Stack: ffff88041de1deb8 ffffffffa0006770 ffff88041de1deb8 ffffffffa000251e ffff88041de1dec8 ffffffffa00022c2 ffff88041de1ded8 ffffffffa0004993 ffff88041de1df78 ffffffff81073115 656c69666f72706f 0000000000610200 Call Trace: [] op_nmi_exit+0x15/0x17 [oprofile] [] oprofile_arch_exit+0xe/0x10 [oprofile] [] oprofile_exit+0x13/0x15 [oprofile] [] sys_delete_module+0x1c3/0x22f [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] system_call_fastpath+0x16/0x1b Code: 20 c6 78 81 e8 c5 cc 23 00 48 8b 13 48 8b 43 08 48 be 00 01 10 00 00 00 ad de 48 b9 00 02 20 00 00 00 ad de 48 c7 c7 20 c6 78 81 89 42 08 48 89 10 48 89 33 48 89 4b 08 e8 a6 c0 23 00 5a 5b RIP [] unregister_syscore_ops+0x41/0x58 RSP CR2: 0000000000000008 ---[ end trace 06d4e95b6aa3b437 ]--- CC: stable@kernel.org # 2.6.37+ Signed-off-by: Robert Richter diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index dccd863..f8c752e 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -239,26 +239,45 @@ int oprofile_set_ulong(unsigned long *addr, unsigned long val) return err; } +static int timer_mode; + static int __init oprofile_init(void) { int err; + /* always init architecture to setup backtrace support */ err = oprofile_arch_init(&oprofile_ops); - if (err < 0 || timer) { - printk(KERN_INFO "oprofile: using timer interrupt.\n"); + + timer_mode = err || timer; /* fall back to timer mode on errors */ + if (timer_mode) { + if (!err) + oprofile_arch_exit(); err = oprofile_timer_init(&oprofile_ops); if (err) return err; } - return oprofilefs_register(); + + err = oprofilefs_register(); + if (!err) + return 0; + + /* failed */ + if (timer_mode) + oprofile_timer_exit(); + else + oprofile_arch_exit(); + + return err; } static void __exit oprofile_exit(void) { - oprofile_timer_exit(); oprofilefs_unregister(); - oprofile_arch_exit(); + if (timer_mode) + oprofile_timer_exit(); + else + oprofile_arch_exit(); } diff --git a/drivers/oprofile/timer_int.c b/drivers/oprofile/timer_int.c index 3ef4462..878fba1 100644 --- a/drivers/oprofile/timer_int.c +++ b/drivers/oprofile/timer_int.c @@ -110,6 +110,7 @@ int oprofile_timer_init(struct oprofile_operations *ops) ops->start = oprofile_hrtimer_start; ops->stop = oprofile_hrtimer_stop; ops->cpu_type = "timer"; + printk(KERN_INFO "oprofile: using timer interrupt.\n"); return 0; } -- cgit v0.10.2 From 97f7f8189fe54e3cfe324ef9ad35064f3d2d3bff Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 10 Oct 2011 16:21:10 +0200 Subject: oprofile, x86: Fix crash when unloading module (nmi timer mode) If oprofile uses the nmi timer interrupt there is a crash while unloading the module. The bug can be triggered with oprofile build as module and kernel parameter nolapic set. This patch fixes this. oprofile: using NMI timer interrupt. BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] unregister_syscore_ops+0x41/0x58 PGD 42dbca067 PUD 41da6a067 PMD 0 Oops: 0002 [#1] PREEMPT SMP CPU 5 Modules linked in: oprofile(-) [last unloaded: oprofile] Pid: 2518, comm: modprobe Not tainted 3.1.0-rc7-00019-gb2fb49d #19 Advanced Micro Device Anaheim/Anaheim RIP: 0010:[] [] unregister_syscore_ops+0x41/0x58 RSP: 0018:ffff88041ef71e98 EFLAGS: 00010296 RAX: 0000000000000000 RBX: ffffffffa0017100 RCX: dead000000200200 RDX: 0000000000000000 RSI: dead000000100100 RDI: ffffffff8178c620 RBP: ffff88041ef71ea8 R08: 0000000000000001 R09: 0000000000000082 R10: 0000000000000000 R11: ffff88041ef71de8 R12: 0000000000000080 R13: fffffffffffffff5 R14: 0000000000000001 R15: 0000000000610210 FS: 00007fc902f20700(0000) GS:ffff88042fd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 000000041cdb6000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 2518, threadinfo ffff88041ef70000, task ffff88041d348040) Stack: ffff88041ef71eb8 ffffffffa0017790 ffff88041ef71eb8 ffffffffa0013532 ffff88041ef71ec8 ffffffffa00132d6 ffff88041ef71ed8 ffffffffa00159b2 ffff88041ef71f78 ffffffff81073115 656c69666f72706f 0000000000610200 Call Trace: [] op_nmi_exit+0x15/0x17 [oprofile] [] oprofile_arch_exit+0xe/0x10 [oprofile] [] oprofile_exit+0x1e/0x20 [oprofile] [] sys_delete_module+0x1c3/0x22f [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] system_call_fastpath+0x16/0x1b Code: 20 c6 78 81 e8 c5 cc 23 00 48 8b 13 48 8b 43 08 48 be 00 01 10 00 00 00 ad de 48 b9 00 02 20 00 00 00 ad de 48 c7 c7 20 c6 78 81 89 42 08 48 89 10 48 89 33 48 89 4b 08 e8 a6 c0 23 00 5a 5b RIP [] unregister_syscore_ops+0x41/0x58 RSP CR2: 0000000000000008 ---[ end trace 43a541a52956b7b0 ]--- CC: stable@kernel.org # 2.6.37+ Signed-off-by: Robert Richter diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c index cdfe4c5..f148cf6 100644 --- a/arch/x86/oprofile/init.c +++ b/arch/x86/oprofile/init.c @@ -21,6 +21,7 @@ extern int op_nmi_timer_init(struct oprofile_operations *ops); extern void op_nmi_exit(void); extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); +static int nmi_timer; int __init oprofile_arch_init(struct oprofile_operations *ops) { @@ -31,8 +32,9 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) #ifdef CONFIG_X86_LOCAL_APIC ret = op_nmi_init(ops); #endif + nmi_timer = (ret != 0); #ifdef CONFIG_X86_IO_APIC - if (ret < 0) + if (nmi_timer) ret = op_nmi_timer_init(ops); #endif ops->backtrace = x86_backtrace; @@ -44,6 +46,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) void oprofile_arch_exit(void) { #ifdef CONFIG_X86_LOCAL_APIC - op_nmi_exit(); + if (!nmi_timer) + op_nmi_exit(); #endif } -- cgit v0.10.2 From 57d1c0c03c6b48b2b96870d831b9ce6b917f53ac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Oct 2011 13:36:40 +0200 Subject: perf/x86: Fix PEBS instruction unwind Masami spotted that we always try to decode the instruction stream as 64bit instructions when running a 64bit kernel, this doesn't work for ia32-compat proglets. Use TIF_IA32 to detect if we need to use the 32bit instruction decoder. Reported-by: Masami Hiramatsu Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index c0d238f..73da6b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -493,6 +493,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) unsigned long from = cpuc->lbr_entries[0].from; unsigned long old_to, to = cpuc->lbr_entries[0].to; unsigned long ip = regs->ip; + int is_64bit = 0; /* * We don't need to fixup if the PEBS assist is fault like @@ -544,7 +545,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) } else kaddr = (void *)to; - kernel_insn_init(&insn, kaddr); +#ifdef CONFIG_X86_64 + is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, kaddr, is_64bit); insn_get_length(&insn); to += insn.length; } while (to < ip); -- cgit v0.10.2 From 1d5f003f5a964711853514b04ddc872eec0fdc7b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 23 Oct 2011 19:10:33 +0200 Subject: perf: Do not set task_ctx pointer in cpuctx if there are no events in the context Do not set task_ctx pointer during sched_in if there are no events associated with the context. Otherwise if during task execution total number of events in the system will become zero perf_event_context_sched_out() will not be called and cpuctx->task_ctx will be left with a stale value. Signed-off-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111023171033.GI17571@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e8457d..b0c1186 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2173,7 +2173,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_event_sched_in(cpuctx, ctx, task); - cpuctx->task_ctx = ctx; + if (ctx->nr_events) + cpuctx->task_ctx = ctx; perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); -- cgit v0.10.2 From aa2bc1ade59003a379ffc485d6da2d92ea3370a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Nov 2011 17:56:37 +0100 Subject: perf: Don't use -ENOSPC for out of PMU resources People (Linus) objected to using -ENOSPC to signal not having enough resources on the PMU to satisfy the request. Use -EINVAL. Requested-by: Linus Torvalds Cc: Stephane Eranian Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: David Daney Cc: Ralf Baechle Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-xv8geaz2zpbjhlx0svmpp28n@git.kernel.org [ merged to newer kernel, fixed up MIPS impact ] Signed-off-by: Ingo Molnar diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 24e2347..ff17b17 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -347,15 +347,15 @@ validate_group(struct perf_event *event) memset(&fake_pmu, 0, sizeof(fake_pmu)); if (!validate_event(&fake_pmu, leader)) - return -ENOSPC; + return -EINVAL; list_for_each_entry(sibling, &leader->sibling_list, group_entry) { if (!validate_event(&fake_pmu, sibling)) - return -ENOSPC; + return -EINVAL; } if (!validate_event(&fake_pmu, event)) - return -ENOSPC; + return -EINVAL; return 0; } diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 4f2971b..315fc0b 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -623,7 +623,7 @@ static int mipspmu_event_init(struct perf_event *event) if (!atomic_inc_not_zero(&active_events)) { if (atomic_read(&active_events) > MIPS_MAX_HWEVENTS) { atomic_dec(&active_events); - return -ENOSPC; + return -EINVAL; } mutex_lock(&pmu_reserve_mutex); @@ -732,15 +732,15 @@ static int validate_group(struct perf_event *event) memset(&fake_cpuc, 0, sizeof(fake_cpuc)); if (!validate_event(&fake_cpuc, leader)) - return -ENOSPC; + return -EINVAL; list_for_each_entry(sibling, &leader->sibling_list, group_entry) { if (!validate_event(&fake_cpuc, sibling)) - return -ENOSPC; + return -EINVAL; } if (!validate_event(&fake_cpuc, event)) - return -ENOSPC; + return -EINVAL; return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6408910..ff0e8d4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -588,7 +588,7 @@ done: x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); } } - return num ? -ENOSPC : 0; + return num ? -EINVAL : 0; } /* @@ -607,7 +607,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, if (is_x86_event(leader)) { if (n >= max_count) - return -ENOSPC; + return -EINVAL; cpuc->event_list[n] = leader; n++; } @@ -620,7 +620,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, continue; if (n >= max_count) - return -ENOSPC; + return -EINVAL; cpuc->event_list[n] = event; n++; @@ -1316,7 +1316,7 @@ static int validate_event(struct perf_event *event) c = x86_pmu.get_event_constraints(fake_cpuc, event); if (!c || !c->weight) - ret = -ENOSPC; + ret = -EINVAL; if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); @@ -1341,7 +1341,7 @@ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; - int ret = -ENOSPC, n; + int ret = -EINVAL, n; fake_cpuc = allocate_fake_cpuc(); if (IS_ERR(fake_cpuc)) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 492bf13..ef484d9 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1268,7 +1268,7 @@ reserve: } done: - return num ? -ENOSPC : 0; + return num ? -EINVAL : 0; } static __initconst const struct x86_pmu p4_pmu = { -- cgit v0.10.2 From ed13ec58bfe0d5dc95f748e6118432cb0fa283cb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Nov 2011 10:03:25 +0100 Subject: perf/x86: Enable raw event access to Intel offcore events Now that the core offcore support is fixed up (thanks Stephane) and we have sane generic events utilizing them, re-enable the raw access to the feature as well. Note that it doesn't matter if you use event 0x1b7 or 0x1bb to specify an offcore event, either one works and neither guarantees you'll end up on a particular offcore MSR. Based on original patch from: Vince Weaver . Signed-off-by: Peter Zijlstra Cc: Vince Weaver . Cc: Stephane Eranian Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1108031200390.703@cl320.eecs.utk.edu Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ff0e8d4..2bda212 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -312,12 +312,8 @@ int x86_setup_perfctr(struct perf_event *event) return -EOPNOTSUPP; } - /* - * Do not allow config1 (extended registers) to propagate, - * there's no sane user-space generalization yet: - */ if (attr->type == PERF_TYPE_RAW) - return 0; + return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); -- cgit v0.10.2 From 0e2a5f10fb550835e199a3b56a80ed88232188e9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 4 Nov 2011 08:16:58 -0200 Subject: perf python: Fix undefined symbol problem Recently we made perf_evsel__init call hists__init, which broke the perf python binding: [root@emilia linux]# ./tools/perf/python/twatch.py Traceback (most recent call last): File "./tools/perf/python/twatch.py", line 16, in import perf ImportError: /home/acme/git/build/perf/python/perf.so: undefined symbol: hists__init Fix it by moving the hists__init function to its only caller, evsel.c. This way we avoid dragging in other parts of tools/perf/util/ to the perf python binding. Cc: David Ahern Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-5nffmdt5mu6ozxgj54oi4qon@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index e426264..d7915d4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -34,6 +34,16 @@ int __perf_evsel__sample_size(u64 sample_type) return size; } +static void hists__init(struct hists *hists) +{ + memset(hists, 0, sizeof(*hists)); + hists->entries_in_array[0] = hists->entries_in_array[1] = RB_ROOT; + hists->entries_in = &hists->entries_in_array[0]; + hists->entries_collapsed = RB_ROOT; + hists->entries = RB_ROOT; + pthread_mutex_init(&hists->lock, NULL); +} + void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr, int idx) { diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index a36a3fa..abef270 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1211,13 +1211,3 @@ size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp) return ret; } - -void hists__init(struct hists *hists) -{ - memset(hists, 0, sizeof(*hists)); - hists->entries_in_array[0] = hists->entries_in_array[1] = RB_ROOT; - hists->entries_in = &hists->entries_in_array[0]; - hists->entries_collapsed = RB_ROOT; - hists->entries = RB_ROOT; - pthread_mutex_init(&hists->lock, NULL); -} diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index c86c1d2..89289c8 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -63,8 +63,6 @@ struct hists { struct callchain_cursor callchain_cursor; }; -void hists__init(struct hists *hists); - struct hist_entry *__hists__add_entry(struct hists *self, struct addr_location *al, struct symbol *parent, u64 period); -- cgit v0.10.2 From 47fbe53bef3b219a365ebf3eca949d6cd4c5291c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 13 Nov 2011 10:45:27 -0700 Subject: perf session: Fix crash with invalid CPU list commit 5d67be9 added the option to specify a range of CPUs of interest, but does not catch an invalid CPU list: $ perf script -c foo Segmentation fault (core dumped) Cc: Anton Blanchard Link: http://lkml.kernel.org/r/1321206327-5881-1-git-send-email-dsahern@gmail.com Signed-off-by: David Ahern Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 85c1e6b7..0f4555c 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1333,6 +1333,10 @@ int perf_session__cpu_bitmap(struct perf_session *session, } map = cpu_map__new(cpu_list); + if (map == NULL) { + pr_err("Invalid cpu_list\n"); + return -1; + } for (i = 0; i < map->nr; i++) { int cpu = map->map[i]; -- cgit v0.10.2 From d3d9acf646679c1981032b0985b386d12fccc60c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Nov 2011 08:49:49 -0800 Subject: trace_events_filter: Use rcu_assign_pointer() when setting ftrace_event_call->filter ftrace_event_call->filter is sched RCU protected but didn't use rcu_assign_pointer(). Use it. TODO: Add proper __rcu annotation to call->filter and all its users. -v2: Use RCU_INIT_POINTER() for %NULL clearing as suggested by Eric. Link: http://lkml.kernel.org/r/20111123164949.GA29639@google.com Cc: Eric Dumazet Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: stable@kernel.org # (2.6.39+) Signed-off-by: Tejun Heo Signed-off-by: Steven Rostedt diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 816d3d0..d6e7926 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1686,7 +1686,7 @@ static int replace_system_preds(struct event_subsystem *system, * replace the filter for the call. */ filter = call->filter; - call->filter = filter_item->filter; + rcu_assign_pointer(call->filter, filter_item->filter); filter_item->filter = filter; fail = false; @@ -1741,7 +1741,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) filter = call->filter; if (!filter) goto out_unlock; - call->filter = NULL; + RCU_INIT_POINTER(call->filter, NULL); /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); @@ -1782,7 +1782,7 @@ out: * string */ tmp = call->filter; - call->filter = filter; + rcu_assign_pointer(call->filter, filter); if (tmp) { /* Make sure the call is done with the filter */ synchronize_sched(); -- cgit v0.10.2 From 6a600a8b8749566a7d81ad75dcb8bf5342b5a39a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Nov 2011 10:51:15 +0100 Subject: perf, x86: Disable PEBS on SandyBridge chips Cc: Stephane Eranian Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2be5ebe..8d601b1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1545,6 +1545,13 @@ static void intel_clovertown_quirks(void) x86_pmu.pebs_constraints = NULL; } +static void intel_sandybridge_quirks(void) +{ + printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + x86_pmu.pebs = 0; + x86_pmu.pebs_constraints = NULL; +} + __init int intel_pmu_init(void) { union cpuid10_edx edx; @@ -1694,6 +1701,7 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ + x86_pmu.quirks = intel_sandybridge_quirks; case 45: /* SandyBridge, "Romely-EP" */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v0.10.2 From 16e5294e5f8303756a179cf218e37dfb9ed34417 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 8 Nov 2011 19:20:44 +0100 Subject: perf, x86: Force IBS LVT offset assignment for family 10h On AMD family 10h we see firmware bug messages like the following: [Firmware Bug]: cpu 6, try to use APIC500 (LVT offset 0) for vector 0x10400, but the register is already in use for vector 0xf9 on another cpu [Firmware Bug]: cpu 6, IBS interrupt offset 0 not available (MSRC001103A=0x0000000000000100) [Firmware Bug]: using offset 1 for IBS interrupts [Firmware Bug]: workaround enabled for IBS LVT offset perf: AMD IBS detected (0x00000007) We always see this, since the offsets are not assigned by the BIOS for this family. Force LVT offset assignment in this case. If the OS assignment fails, fallback to BIOS settings and try to setup this. The fallback to BIOS settings weakens the family check since force_ibs_eilvt_setup() may fail e.g. in case of virtual machines. But setup may still succeed if BIOS offsets are correct. Other families don't have a workaround implemented that assigns LVT offsets. It's ok, to drop calling force_ibs_eilvt_setup() for that families. With the patch the [Firmware Bug] messages vanish. We see now: IBS: LVT offset 1 assigned perf: AMD IBS detected (0x00000007) Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111109162225.GO12451@erda.amd.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index ab6343d..3b8a2d3 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -199,8 +199,7 @@ static int force_ibs_eilvt_setup(void) goto out; } - pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); - pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); + pr_info("IBS: LVT offset %d assigned\n", offset); return 0; out: @@ -265,19 +264,23 @@ perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *h static __init int amd_ibs_init(void) { u32 caps; - int ret; + int ret = -EINVAL; caps = __get_ibs_caps(); if (!caps) return -ENODEV; /* ibs not supported by the cpu */ - if (!ibs_eilvt_valid()) { - ret = force_ibs_eilvt_setup(); - if (ret) { - pr_err("Failed to setup IBS, %d\n", ret); - return ret; - } - } + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); + + if (!ibs_eilvt_valid()) + goto out; get_online_cpus(); ibs_caps = caps; @@ -287,7 +290,11 @@ static __init int amd_ibs_init(void) smp_call_function(setup_APIC_ibs, NULL, 1); put_online_cpus(); - return perf_event_ibs_init(); + ret = perf_event_ibs_init(); +out: + if (ret) + pr_err("Failed to setup IBS, %d\n", ret); + return ret; } /* Since we need the pci subsystem to init ibs we can't do this earlier: */ -- cgit v0.10.2 From 10c6db110d0eb4466b59812c49088ab56218fc2e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 26 Nov 2011 02:47:31 +0100 Subject: perf: Fix loss of notification with multi-event When you do: $ perf record -e cycles,cycles,cycles noploop 10 You expect about 10,000 samples for each event, i.e., 10s at 1000samples/sec. However, this is not what's happening. You get much fewer samples, maybe 3700 samples/event: $ perf report -D | tail -15 Aggregated stats: TOTAL events: 10998 MMAP events: 66 COMM events: 2 SAMPLE events: 10930 cycles stats: TOTAL events: 3644 SAMPLE events: 3644 cycles stats: TOTAL events: 3642 SAMPLE events: 3642 cycles stats: TOTAL events: 3644 SAMPLE events: 3644 On a Intel Nehalem or even AMD64, there are 4 counters capable of measuring cycles, so there is plenty of space to measure those events without multiplexing (even with the NMI watchdog active). And even with multiplexing, we'd expect roughly the same number of samples per event. The root of the problem was that when the event that caused the buffer to become full was not the first event passed on the cmdline, the user notification would get lost. The notification was sent to the file descriptor of the overflowed event but the perf tool was not polling on it. The perf tool aggregates all samples into a single buffer, i.e., the buffer of the first event. Consequently, it assumes notifications for any event will come via that descriptor. The seemingly straight forward solution of moving the waitq into the ringbuffer object doesn't work because of life-time issues. One could perf_event_set_output() on a fd that you're also blocking on and cause the old rb object to be freed while its waitq would still be referenced by the blocked thread -> FAIL. Therefore link all events to the ringbuffer and broadcast the wakeup from the ringbuffer object to all possible events that could be waited upon. This is rather ugly, and we're open to better solutions but it works for now. Reported-by: Stephane Eranian Finished-by: Stephane Eranian Reviewed-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111126014731.GA7030@quad Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e9ebe5..b1f8912 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -822,6 +822,7 @@ struct perf_event { int mmap_locked; struct user_struct *mmap_user; struct ring_buffer *rb; + struct list_head rb_entry; /* poll related */ wait_queue_head_t waitq; diff --git a/kernel/events/core.c b/kernel/events/core.c index b0c1186..600c162 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -185,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); + void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -3191,12 +3194,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) struct ring_buffer *rb; unsigned int events = POLL_HUP; + /* + * Race between perf_event_set_output() and perf_poll(): perf_poll() + * grabs the rb reference but perf_event_set_output() overrides it. + * Here is the timeline for two threads T1, T2: + * t0: T1, rb = rcu_dereference(event->rb) + * t1: T2, old_rb = event->rb + * t2: T2, event->rb = new rb + * t3: T2, ring_buffer_detach(old_rb) + * t4: T1, ring_buffer_attach(rb1) + * t5: T1, poll_wait(event->waitq) + * + * To avoid this problem, we grab mmap_mutex in perf_poll() + * thereby ensuring that the assignment of the new ring buffer + * and the detachment of the old buffer appear atomic to perf_poll() + */ + mutex_lock(&event->mmap_mutex); + rcu_read_lock(); rb = rcu_dereference(event->rb); - if (rb) + if (rb) { + ring_buffer_attach(event, rb); events = atomic_xchg(&rb->poll, 0); + } rcu_read_unlock(); + mutex_unlock(&event->mmap_mutex); + poll_wait(file, &event->waitq, wait); return events; @@ -3497,6 +3521,49 @@ unlock: return ret; } +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb) +{ + unsigned long flags; + + if (!list_empty(&event->rb_entry)) + return; + + spin_lock_irqsave(&rb->event_lock, flags); + if (!list_empty(&event->rb_entry)) + goto unlock; + + list_add(&event->rb_entry, &rb->event_list); +unlock: + spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_detach(struct perf_event *event, + struct ring_buffer *rb) +{ + unsigned long flags; + + if (list_empty(&event->rb_entry)) + return; + + spin_lock_irqsave(&rb->event_lock, flags); + list_del_init(&event->rb_entry); + wake_up_all(&event->waitq); + spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_wakeup(struct perf_event *event) +{ + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + wake_up_all(&event->waitq); + } + rcu_read_unlock(); +} + static void rb_free_rcu(struct rcu_head *rcu_head) { struct ring_buffer *rb; @@ -3522,9 +3589,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) static void ring_buffer_put(struct ring_buffer *rb) { + struct perf_event *event, *n; + unsigned long flags; + if (!atomic_dec_and_test(&rb->refcount)) return; + spin_lock_irqsave(&rb->event_lock, flags); + list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { + list_del_init(&event->rb_entry); + wake_up_all(&event->waitq); + } + spin_unlock_irqrestore(&rb->event_lock, flags); + call_rcu(&rb->rcu_head, rb_free_rcu); } @@ -3547,6 +3624,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); mutex_unlock(&event->mmap_mutex); ring_buffer_put(rb); @@ -3701,7 +3779,7 @@ static const struct file_operations perf_fops = { void perf_event_wakeup(struct perf_event *event) { - wake_up_all(&event->waitq); + ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(&event->fasync, SIGIO, event->pending_kill); @@ -5823,6 +5901,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + INIT_LIST_HEAD(&event->rb_entry); + init_waitqueue_head(&event->waitq); init_irq_work(&event->pending, perf_pending_event); @@ -6029,6 +6109,8 @@ set: old_rb = event->rb; rcu_assign_pointer(event->rb, rb); + if (old_rb) + ring_buffer_detach(event, old_rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09097dd..64568a6 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -22,6 +22,9 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + /* poll crap */ + spinlock_t event_lock; + struct list_head event_list; struct perf_event_mmap_page *user_page; void *data_pages[0]; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a2a2920..7f3011c 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->writable = 1; atomic_set(&rb->refcount, 1); + + INIT_LIST_HEAD(&rb->event_list); + spin_lock_init(&rb->event_lock); } #ifndef CONFIG_PERF_USE_VMALLOC -- cgit v0.10.2