From bcdc09af3ef30ef071677544ce23a1c8873a2dda Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Wed, 3 Aug 2016 02:47:49 +0000 Subject: perf script: Add 'bpf-output' field to usage message This adds the 'bpf-output' field to the perf script usage message, and docs. Signed-off-by: Brendan Gregg Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1470192469-11910-4-git-send-email-bgregg@netflix.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 1f6c705..053bbbd 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -116,8 +116,8 @@ OPTIONS --fields:: Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, - srcline, period, iregs, brstack, brstacksym, flags. - Field list can be prepended with the type, trace, sw or hw, + srcline, period, iregs, brstack, brstacksym, flags, bpf-output, + callindent. Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 971ff91..9c640a8 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -2116,7 +2116,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) "Valid types: hw,sw,trace,raw. " "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," "addr,symoff,period,iregs,brstack,brstacksym,flags," - "callindent", parse_output_fields), + "bpf-output,callindent", parse_output_fields), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]", -- cgit v0.10.2 From 887fa86d6fd7a45cee2d0f9d5f75026786d61df2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 5 Aug 2016 12:37:21 -0300 Subject: perf hists: Trim libtraceevent trace_seq buffers When we use libtraceevent to format trace event fields into printable strings for use in hist entries, it is important to trim the buffer from the default 4 KiB it starts with down to what is really used, to reduce the memory footprint. So use realloc(seq.buffer, seq.len + 1) when returning the seq.buffer formatted with the fields' contents. Reported-and-Tested-by: Wang Nan Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/n/tip-t3hl7uxmilrkigzmc90rlhk2@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 947d21f..3d3cb83 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -588,7 +588,11 @@ static char *get_trace_output(struct hist_entry *he) } else { pevent_event_info(&seq, evsel->tp_format, &rec); } - return seq.buffer; + /* + * Trim the buffer, it starts at 4KB and we're not going to + * add anything more to this buffer. + */ + return realloc(seq.buffer, seq.len + 1); } static int64_t -- cgit v0.10.2 From 8e34189b347d76acf48ce05831176582201b664d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 6 Aug 2016 19:29:48 +0900 Subject: perf probe: Adjust map->reloc offset when finding kernel symbol from map Adjust the unmapped address by the map->reloc offset when finding an alternative symbol address from the map, because KASLR can relocate the kernel symbol address.
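A minimal sketch of the address arithmetic involved (illustrative only; the actual change is the one-liner to find_alternative_probe_point() shown below):

    /*
     * With KASLR the running kernel is shifted by a random offset,
     * which perf records in map->reloc. Debuginfo only knows
     * link-time addresses, so a symbol's runtime address must be
     * unmapped and then de-relocated before matching it against
     * DWARF:
     */
    u64 addr = map->unmap_ip(map, sym->start); /* runtime (kallsyms) address */
    addr -= map->reloc;                        /* back to the link-time address */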
The same adjustment is already done when finding the appropriate kernel symbol address from the map, as introduced by commit f90acac75713 ("perf probe: Find given address from offline dwarf") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: Wang Nan Link: http://lkml.kernel.org/r/20160806192948.e366f3fbc4b194de600f8326@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 953dc1a..d5ccb65 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -385,7 +385,7 @@ static int find_alternative_probe_point(struct debuginfo *dinfo, if (uprobes) address = sym->start; else - address = map->unmap_ip(map, sym->start); + address = map->unmap_ip(map, sym->start) - map->reloc; break; } if (!address) { -- cgit v0.10.2 From cb3f3378cd09aa3fe975b4ad5ee0229dc76315bb Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 5 Aug 2016 15:22:36 +0300 Subject: perf probe: Fix module name matching If the module is "module" then dso->short_name is "[module]". Substring comparison isn't enough: "raid10" matches "[raid1]". This patch also checks the terminating zero in the module name. Signed-off-by: Konstantin Khlebnikov Acked-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/147039975648.715620.12985971832789032159.stgit@buzz Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index d5ccb65..1201f73 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -170,8 +170,10 @@ static struct map *kernel_get_module_map(const char *module) module = "kernel"; for (pos = maps__first(maps); pos; pos = map__next(pos)) { + /* short_name is "[module]" */ if (strncmp(pos->dso->short_name + 1, module, - pos->dso->short_name_len - 2) == 0) { + pos->dso->short_name_len - 2) == 0 && + module[pos->dso->short_name_len - 2] == '\0') { return pos; } } -- cgit v0.10.2 From 3df33eff2ba96be4f1535db4f672013d756dc9b1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 9 Aug 2016 14:04:29 +0100 Subject: perf stat: Avoid skew when reading events When we don't have a tracee (i.e. we're attaching to a task or CPU), counters can still be running after our workload finishes, and can still be running as we read their values. As we read events one-by-one, there can be arbitrary skew between values of events, even within a group. This means that ratios within an event group are not reliable. This skew can be seen if measuring a group of identical events, e.g.: # perf stat -a -C0 -e '{cycles,cycles}' sleep 1 To avoid this, we must stop groups from counting before we read the values of any constituent events. This patch adds and makes use of a new disable_counters() helper, which disables group leaders (and thus each group as a whole). This mirrors the use of enable_counters() for starting event groups in the absence of a tracee. Closing a group leader splits the group, and without a disabled group leader the newly split events will begin counting. Thus to ensure counts are reliable we must defer closing group leaders until all counts have been read. To do so, this patch removes the event closing logic from the read_counters() helper and explicitly closes the events using perf_evlist__close(), which also aids legibility.
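The resulting ordering can be condensed into a short sketch (names as in builtin-stat.c; control flow simplified, error handling omitted):

    enable_counters();              /* start group leaders (no tracee) */
    /* ... workload runs ... */
    disable_counters();             /* stop the leaders => whole groups stop */
    read_counters();                /* values are now mutually consistent */
    perf_evlist__close(evsel_list); /* close only after all reads, since
                                     * closing a leader splits its group */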
Signed-off-by: Mark Rutland Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1470747869-3567-1-git-send-email-mark.rutland@arm.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0c16d20..3c7452b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -331,7 +331,7 @@ static int read_counter(struct perf_evsel *counter) return 0; } -static void read_counters(bool close_counters) +static void read_counters(void) { struct perf_evsel *counter; @@ -341,11 +341,6 @@ static void read_counters(bool close_counters) if (perf_stat_process_counter(&stat_config, counter)) pr_warning("failed to process counter %s\n", counter->name); - - if (close_counters) { - perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), - thread_map__nr(evsel_list->threads)); - } } } @@ -353,7 +348,7 @@ static void process_interval(void) { struct timespec ts, rs; - read_counters(false); + read_counters(); clock_gettime(CLOCK_MONOTONIC, &ts); diff_timespec(&rs, &ts, &ref_time); @@ -380,6 +375,17 @@ static void enable_counters(void) perf_evlist__enable(evsel_list); } +static void disable_counters(void) +{ + /* + * If we don't have tracee (attaching to task or cpu), counters may + * still be running. To get accurate group ratios, we must stop groups + * from counting before reading their constituent counters. + */ + if (!target__none(&target)) + perf_evlist__disable(evsel_list); +} + static volatile int workload_exec_errno; /* @@ -657,11 +663,20 @@ try_again: } } + disable_counters(); + t1 = rdclock(); update_stats(&walltime_nsecs_stats, t1 - t0); - read_counters(true); + /* + * Closing a group leader splits the group, and as we only disable + * group leaders, results in remaining events becoming enabled. To + * avoid arbitrary skew, we must read all counters before closing any + * group leaders. + */ + read_counters(); + perf_evlist__close(evsel_list); return WEXITSTATUS(status); } -- cgit v0.10.2 From 19f00b011729417f69e4df53cc3fe5ecc25134a4 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 9 Aug 2016 11:40:08 +0900 Subject: perf probe: Support signedness casting The 'perf probe' tool detects a variable's type and uses the detected type to add a new probe. Then, kprobes prints its variable in hexadecimal format if the variable is unsigned and prints in decimal if it is signed. We sometimes want to see an unsigned variable in decimal format (e.g. sector_t or size_t). In that case, we need to investigate the variable's size manually to specify just the signedness. This patch adds signedness casting support. By specifying "s" or "u" as a type, perf-probe will investigate the variable size as usual and use the specified signedness. E.g. without this: $ perf probe -a 'submit_bio bio->bi_iter.bi_sector' Added new event: probe:submit_bio (on submit_bio with bi_sector=bio->bi_iter.bi_sector) You can now use it in all perf tools, such as: perf record -e probe:submit_bio -aR sleep 1 $ cat trace_pipe|head dbench-9692 [003] d..1 971.096633: submit_bio: (submit_bio+0x0/0x140) bi_sector=0x3a3d00 dbench-9692 [003] d..1 971.096685: submit_bio: (submit_bio+0x0/0x140) bi_sector=0x1a3d80 dbench-9692 [003] d..1 971.096687: submit_bio: (submit_bio+0x0/0x140) bi_sector=0x3a3d80 ...
// need to investigate the variable size $ perf probe -a 'submit_bio bio->bi_iter.bi_sector:s64' Added new event: probe:submit_bio (on submit_bio with bi_sector=bio->bi_iter.bi_sector:s64) You can now use it in all perf tools, such as: perf record -e probe:submit_bio -aR sleep 1 With this: // just use "s" to cast its signedness $ perf probe -v -a 'submit_bio bio->bi_iter.bi_sector:s' Added new event: probe:submit_bio (on submit_bio with bi_sector=bio->bi_iter.bi_sector:s) You can now use it in all perf tools, such as: perf record -e probe:submit_bio -aR sleep 1 $ cat trace_pipe|head dbench-9689 [001] d..1 1212.391237: submit_bio: (submit_bio+0x0/0x140) bi_sector=128 dbench-9689 [001] d..1 1212.391252: submit_bio: (submit_bio+0x0/0x140) bi_sector=131072 dbench-9697 [006] d..1 1212.398611: submit_bio: (submit_bio+0x0/0x140) bi_sector=30208 This commit also updates perf-probe.txt to describe "types". Most parts are based on existing documentation: Documentation/trace/kprobetrace.txt Committer note: Testing using 'perf trace': # perf probe -a 'submit_bio bio->bi_iter.bi_sector' Added new event: probe:submit_bio (on submit_bio with bi_sector=bio->bi_iter.bi_sector) You can now use it in all perf tools, such as: perf record -e probe:submit_bio -aR sleep 1 # trace --no-syscalls --ev probe:submit_bio 0.000 probe:submit_bio:(ffffffffac3aee00) bi_sector=0xc133c0) 3181.861 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x6cffb8) 3181.881 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x6cffc0) 3184.488 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x6cffc8) 4717.927 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x4dc7a88) 4717.970 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x4dc7880) ^C[root@jouet ~]# Now, using this new feature: [root@jouet ~]# perf probe -a 'submit_bio bio->bi_iter.bi_sector:s' Added new event: probe:submit_bio (on submit_bio with bi_sector=bio->bi_iter.bi_sector:s) You can now use it in all perf tools, such as: perf record -e probe:submit_bio -aR sleep 1 [root@jouet ~]# trace --no-syscalls --ev probe:submit_bio 0.000 probe:submit_bio:(ffffffffac3aee00) bi_sector=7145704) 0.017 probe:submit_bio:(ffffffffac3aee00) bi_sector=7145712) 0.019 probe:submit_bio:(ffffffffac3aee00) bi_sector=7145720) 2.567 probe:submit_bio:(ffffffffac3aee00) bi_sector=7145728) 5631.919 probe:submit_bio:(ffffffffac3aee00) bi_sector=0) 5631.941 probe:submit_bio:(ffffffffac3aee00) bi_sector=8) 5631.945 probe:submit_bio:(ffffffffac3aee00) bi_sector=16) 5631.948 probe:submit_bio:(ffffffffac3aee00) bi_sector=24) ^C# With callchains: # trace --no-syscalls --ev probe:submit_bio/max-stack=10/ 0.000 probe:submit_bio:(ffffffffac3aee00) bi_sector=50662544) submit_bio+0xa8200001 ([kernel.kallsyms]) submit_bh+0xa8200013 ([kernel.kallsyms]) jbd2_journal_commit_transaction+0xa8200691 ([kernel.kallsyms]) kjournald2+0xa82000ca ([kernel.kallsyms]) kthread+0xa82000d8 ([kernel.kallsyms]) ret_from_fork+0xa820001f ([kernel.kallsyms]) 0.023 probe:submit_bio:(ffffffffac3aee00) bi_sector=50662552) submit_bio+0xa8200001 ([kernel.kallsyms]) submit_bh+0xa8200013 ([kernel.kallsyms]) jbd2_journal_commit_transaction+0xa8200691 ([kernel.kallsyms]) kjournald2+0xa82000ca ([kernel.kallsyms]) kthread+0xa82000d8 ([kernel.kallsyms]) ret_from_fork+0xa820001f ([kernel.kallsyms]) 0.027 probe:submit_bio:(ffffffffac3aee00) bi_sector=50662560) submit_bio+0xa8200001 ([kernel.kallsyms]) submit_bh+0xa8200013 ([kernel.kallsyms]) jbd2_journal_commit_transaction+0xa8200691 ([kernel.kallsyms]) kjournald2+0xa82000ca ([kernel.kallsyms]) kthread+0xa82000d8
([kernel.kallsyms]) ret_from_fork+0xa820001f ([kernel.kallsyms]) 2.593 probe:submit_bio:(ffffffffac3aee00) bi_sector=50662568) submit_bio+0xa8200001 ([kernel.kallsyms]) submit_bh+0xa8200013 ([kernel.kallsyms]) journal_submit_commit_record+0xa82001ac ([kernel.kallsyms]) jbd2_journal_commit_transaction+0xa82012e8 ([kernel.kallsyms]) kjournald2+0xa82000ca ([kernel.kallsyms]) kthread+0xa82000d8 ([kernel.kallsyms]) ret_from_fork+0xa820001f ([kernel.kallsyms]) ^C# Signed-off-by: Naohiro Aota Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Hemant Kumar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1470710408-23515-1-git-send-email-naohiro.aota@hgst.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index 736da44..b303bcd 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -176,10 +176,18 @@ Each probe argument follows below syntax. 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.) '$vars' and '$params' special arguments are also available for NAME, '$vars' is expanded to the local variables (including function parameters) which can access at given probe point. '$params' is expanded to only the function parameters. -'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type. +'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), signedness casting (u/s), "string" and bitfield are supported. (see TYPES for detail) On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid. +TYPES +----- +Basic types (u8/u16/u32/u64/s8/s16/s32/s64) are integer types. Prefix 's' and 'u' means those types are signed and unsigned respectively. Traced arguments are shown in decimal (signed) or hex (unsigned). You can also use 's' or 'u' to specify only signedness and leave its size auto-detected by perf probe. +String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container has been paged out. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type. +Bitfield is another special type, which takes 3 parameters, bit-width, bit-offset, and container-size (usually 32). The syntax is; + + b<bit-width>@<bit-offset>/<container-size> + LINE SYNTAX ----------- Line range is described by following syntax.
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index f2d9ff0..5c290c6 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -297,10 +297,13 @@ static int convert_variable_type(Dwarf_Die *vr_die, char sbuf[STRERR_BUFSIZE]; int bsize, boffs, total; int ret; + char sign; /* TODO: check all types */ - if (cast && strcmp(cast, "string") != 0) { + if (cast && strcmp(cast, "string") != 0 && + strcmp(cast, "s") != 0 && strcmp(cast, "u") != 0) { /* Non string type is OK */ + /* and respect signedness cast */ tvar->type = strdup(cast); return (tvar->type == NULL) ? -ENOMEM : 0; } @@ -361,6 +364,13 @@ static int convert_variable_type(Dwarf_Die *vr_die, return (tvar->type == NULL) ? -ENOMEM : 0; } + if (cast && (strcmp(cast, "u") == 0)) + sign = 'u'; + else if (cast && (strcmp(cast, "s") == 0)) + sign = 's'; + else + sign = die_is_signed_type(&type) ? 's' : 'u'; + ret = dwarf_bytesize(&type); if (ret <= 0) /* No size ... try to use default type */ @@ -373,8 +383,7 @@ static int convert_variable_type(Dwarf_Die *vr_die, dwarf_diename(&type), MAX_BASIC_TYPE_BITS); ret = MAX_BASIC_TYPE_BITS; } - ret = snprintf(buf, 16, "%c%d", - die_is_signed_type(&type) ? 's' : 'u', ret); + ret = snprintf(buf, 16, "%c%d", sign, ret); formatted: if (ret < 0 || ret >= 16) { -- cgit v0.10.2 From bebfb730125863aac74607dff24f1bdb9fb02a90 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2016 11:21:57 -0300 Subject: tools: Sync cpufeatures.h and vmx.h with the kernel There were changes related to the deprecation of the "pcommit" instruction: fd1d961dd681 ("x86/insn: remove pcommit") dfa169bbee00 ("Revert "KVM: x86: add pcommit support"") No need to update anything in the tools, as "pcommit" wasn't being listed on the VMX_EXIT_REASONS in the tools/perf/arch/x86/util/kvm-stat.c file. 
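For context, a hedged sketch of how such a table is typically consumed on the tooling side (the exact macro plumbing in tools/perf/arch/x86/util/kvm-stat.c may differ):

    /* VMX_EXIT_REASONS expands to { code, "NAME" } pairs, so a table
     * mapping exit codes to printable names can be built as: */
    struct exit_reasons_table {
        unsigned long exit_code;
        const char *reason;
    };

    static struct exit_reasons_table vmx_exit_reasons[] = {
        VMX_EXIT_REASONS    /* from uapi/asm/vmx.h */
    };

    /* an entry dropped from the macro (like PCOMMIT) simply stops
     * resolving to a name; no other tool change is needed */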
Just grab fresh copies of these files to silence the file cache coherency detector: $ make -C tools/perf O=/tmp/build/perf install-bin make: Entering directory '/home/acme/git/linux/tools/perf' BUILD: Doing 'make -j4' parallel build Warning: tools/arch/x86/include/asm/cpufeatures.h differs from kernel Warning: tools/arch/x86/include/uapi/asm/vmx.h differs from kernel INSTALL GTK UI # Cc: Adrian Hunter Cc: Dan Williams Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Ross Zwisler Cc: Wang Nan Cc: Xiao Guangrong Link: http://lkml.kernel.org/n/tip-07pmcc1ysydhyyxbmp1vt0l4@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 4a41348..92a8308 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -225,7 +225,6 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ @@ -301,10 +300,6 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ -#define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */ - - #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional @@ -312,5 +307,7 @@ */ #define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif - +#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ +#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index 5b15d94..37fee27 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -78,7 +78,6 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 -#define EXIT_REASON_PCOMMIT 65 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -127,8 +126,7 @@ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" }, \ - { EXIT_REASON_PCOMMIT, "PCOMMIT" } + { EXIT_REASON_XRSTORS, "XRSTORS" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 -- cgit v0.10.2 From 791cceb89f7987c0375ff2e8971928a47f62ccae Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2016 11:48:07 -0300 Subject: tools: Sync tools/include/uapi/linux/bpf.h with the kernel The way we're using kernel headers in tools/ now, with a copy that is made to the same path prefixed by "tools/" plus checking if that copy got stale, i.e. if the kernel counterpart changed, helps in keeping track of new features that may be useful for tools to exploit.
For instance, looking at all the changes to bpf.h since it was last copied to tools/include brings this to toolers' attention: Need to investigate this one to check how to run a program via perf, setting up a BPF event, that will take advantage of the way perf already calls clang/LLVM, sets up the event and runs the workload in a single command line, helping in debugging such semi-cooperative programs: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") This one needs further investigation about using the feature it improves in 'perf trace' to do some tcpdumpin' mixed with syscalls, tracepoints, probe points, callgraphs, etc: 555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for event output") Add tracing just packets that are related to some container to that mix: 4a482f34afcc ("cgroup: bpf: Add bpf_skb_in_cgroup_proto") 4ed8ec521ed5 ("cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY") Definitely needs to have example programs accessing task_struct from a bpf proggie started from 'perf trace': 606274c5abd8 ("bpf: introduce bpf_get_current_task() helper") Core networking related, XDP: 6ce96ca348a9 ("bpf: add XDP_TX xdp_action for direct forwarding") 6a773a15a1e8 ("bpf: add XDP prog type for early driver filter") 13c5c240f789 ("bpf: add bpf_get_hash_recalc helper") d2485c4242a8 ("bpf: add bpf_skb_change_type helper") 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper") Changes detected by the tools build system: $ make -C tools/perf O=/tmp/build/perf install-bin make: Entering directory '/home/acme/git/linux/tools/perf' BUILD: Doing 'make -j4' parallel build Warning: tools/include/uapi/linux/bpf.h differs from kernel INSTALL GTK UI CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o $ Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brenden Blanco Cc: Daniel Borkmann Cc: David Ahern Cc: David S. Miller Cc: Jiri Olsa Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Sargun Dhillon Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-difq4ts1xvww6eyfs9e7zlft@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 406459b..da218fe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -84,6 +84,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_STACK_TRACE, + BPF_MAP_TYPE_CGROUP_ARRAY, }; enum bpf_prog_type { @@ -93,6 +94,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SCHED_CLS, BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_TRACEPOINT, + BPF_PROG_TYPE_XDP, }; #define BPF_PSEUDO_MAP_FD 1 @@ -313,6 +315,66 @@ enum bpf_func_id { */ BPF_FUNC_skb_get_tunnel_opt, BPF_FUNC_skb_set_tunnel_opt, + + /** + * bpf_skb_change_proto(skb, proto, flags) + * Change protocol of the skb. Currently supported is + * v4 -> v6, v6 -> v4 transitions. The helper will also + * resize the skb. eBPF program is expected to fill the + * new headers via skb_store_bytes and lX_csum_replace. + * @skb: pointer to skb + * @proto: new skb->protocol type + * @flags: reserved + * Return: 0 on success or negative error + */ + BPF_FUNC_skb_change_proto, + + /** + * bpf_skb_change_type(skb, type) + * Change packet type of skb.
+ * @skb: pointer to skb + * @type: new skb->pkt_type type + * Return: 0 on success or negative error + */ + BPF_FUNC_skb_change_type, + + /** + * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb + * @skb: pointer to skb + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 skb failed the cgroup2 descendant test + * == 1 skb succeeded the cgroup2 descendant test + * < 0 error + */ + BPF_FUNC_skb_in_cgroup, + + /** + * bpf_get_hash_recalc(skb) + * Retrieve and possibly recalculate skb->hash. + * @skb: pointer to skb + * Return: hash + */ + BPF_FUNC_get_hash_recalc, + + /** + * u64 bpf_get_current_task(void) + * Returns current task_struct + * Return: current + */ + BPF_FUNC_get_current_task, + + /** + * bpf_probe_write_user(void *dst, void *src, int len) + * safely attempt to write to a location + * @dst: destination address in userspace + * @src: source address on stack + * @len: number of bytes to copy + * Return: 0 on success or negative error + */ + BPF_FUNC_probe_write_user, + __BPF_FUNC_MAX_ID, }; @@ -347,9 +409,11 @@ enum bpf_func_id { #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) -/* BPF_FUNC_perf_event_output flags. */ +/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +/* BPF_FUNC_perf_event_output for sk_buff input context. */ +#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure @@ -386,4 +450,24 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* User return codes for XDP prog type. + * A valid XDP program must return one of these defined values. All other + * return codes are reserved for future use. Unknown return codes will result + * in packet drop. + */ +enum xdp_action { + XDP_ABORTED = 0, + XDP_DROP, + XDP_PASS, + XDP_TX, +}; + +/* user accessible metadata for XDP packet hook + * new fields must be added to the end of this structure + */ +struct xdp_md { + __u32 data; + __u32 data_end; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v0.10.2 From 840b49ba554b40fa8301ad2716abd2fe3d9e382a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2016 11:56:33 -0300 Subject: tools: Sync cpufeatures headers with the kernel Due to: 1e61f78baf89 ("x86/cpufeature: Make sure DISABLED/REQUIRED macros are updated") No changes to tools using those headers (tools/arch/x86/lib/mem{set,cpu}_64.S) seem necessary.
Detected by the tools build header drift checker: $ make -C tools/perf O=/tmp/build/perf make: Entering directory '/home/acme/git/linux/tools/perf' BUILD: Doing 'make -j4' parallel build GEN /tmp/build/perf/common-cmds.h Warning: tools/arch/x86/include/asm/disabled-features.h differs from kernel Warning: tools/arch/x86/include/asm/required-features.h differs from kernel Warning: tools/arch/x86/include/asm/cpufeatures.h differs from kernel CC /tmp/build/perf/util/probe-finder.o CC /tmp/build/perf/builtin-help.o ^C$ Cc: Adrian Hunter Cc: Dave Hansen Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-ja75m7zk8j0jkzmrv16i5ehw@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 911e935..85599ad 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -56,5 +56,7 @@ #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) +#define DISABLED_MASK17 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index 4916144..fac9a5c 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -99,5 +99,7 @@ #define REQUIRED_MASK14 0 #define REQUIRED_MASK15 0 #define REQUIRED_MASK16 0 +#define REQUIRED_MASK17 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ -- cgit v0.10.2 From d820456dc70b231d62171ba46b43db0045e9bd57 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 9 Aug 2016 01:23:24 -0500 Subject: perf probe: Add function to post process kernel trace events Instead of inline code, introduce function to post process kernel probe trace events. Signed-off-by: Ravi Bangoria Acked-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Ananth N Mavinakayanahalli Cc: Balbir Singh Cc: Namhyung Kim Cc: Naveen N. 
Rao Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1470723805-5081-1-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 1201f73..234fbfb 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -666,22 +666,14 @@ static int add_module_to_probe_trace_events(struct probe_trace_event *tevs, return ret; } -/* Post processing the probe events */ -static int post_process_probe_trace_events(struct probe_trace_event *tevs, - int ntevs, const char *module, - bool uprobe) +static int +post_process_kernel_probe_trace_events(struct probe_trace_event *tevs, + int ntevs) { struct ref_reloc_sym *reloc_sym; char *tmp; int i, skipped = 0; - if (uprobe) - return add_exec_to_probe_trace_events(tevs, ntevs, module); - - /* Note that currently ref_reloc_sym based probe is not for drivers */ - if (module) - return add_module_to_probe_trace_events(tevs, ntevs, module); - reloc_sym = kernel_get_ref_reloc_sym(); if (!reloc_sym) { pr_warning("Relocated base symbol is not found!\n"); @@ -713,6 +705,21 @@ static int post_process_probe_trace_events(struct probe_trace_event *tevs, return skipped; } +/* Post processing the probe events */ +static int post_process_probe_trace_events(struct probe_trace_event *tevs, + int ntevs, const char *module, + bool uprobe) +{ + if (uprobe) + return add_exec_to_probe_trace_events(tevs, ntevs, module); + + if (module) + /* Currently ref_reloc_sym based probe is not for drivers */ + return add_module_to_probe_trace_events(tevs, ntevs, module); + + return post_process_kernel_probe_trace_events(tevs, ntevs); +} + /* Try to find perf_probe_event with debuginfo */ static int try_to_find_probe_trace_events(struct perf_probe_event *pev, struct probe_trace_event **tevs) -- cgit v0.10.2 From 99e608b5954c9e1ebadbf9660b74697d9dfd9f20 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 9 Aug 2016 01:23:25 -0500 Subject: perf probe ppc64le: Fix probe location when using DWARF Powerpc has Global Entry Point and Local Entry Point for functions. The LEP catches calls from both the GEP and the LEP. The ELF symbol table contains the GEP and an offset from which we can calculate the LEP, but debuginfo does not have LEP info. Currently, perf prioritizes the symbol table over dwarf to probe on the LEP for ppc64le. But when the user tries to probe with a function parameter, we fall back to using dwarf (i.e. GEP), and when the function is called via the LEP, the probe will never hit. For example: $ objdump -d vmlinux ... do_sys_open(): c0000000002eb4a0: e8 00 4c 3c addis r2,r12,232 c0000000002eb4a4: 60 00 42 38 addi r2,r2,96 c0000000002eb4a8: a6 02 08 7c mflr r0 c0000000002eb4ac: d0 ff 41 fb std r26,-48(r1) $ sudo ./perf probe do_sys_open $ sudo cat /sys/kernel/debug/tracing/kprobe_events p:probe/do_sys_open _text+3060904 $ sudo ./perf probe 'do_sys_open filename:string' $ sudo cat /sys/kernel/debug/tracing/kprobe_events p:probe/do_sys_open _text+3060896 filename_string=+0(%gpr4):string For the second case, perf probed on the GEP. So when the function is called via the LEP, the probe won't hit. $ sudo ./perf record -a -e probe:do_sys_open ls [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.195 MB perf.data ] To resolve this issue, let's not prioritize the symbol table; let perf decide what it wants to use. Perf is already converting GEP to LEP when it uses the symbol table. When perf uses debuginfo, let it find the LEP offset from the symbol table.
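As a rough sketch of where the LEP offset lives (assuming the standard ppc64le ELFv2 st_other encoding; glibc's <elf.h> provides the decoder macro):

    /* ELFv2 encodes the GEP-to-LEP distance in three bits of the
     * symbol's st_other field; given a GElf_Sym gsym (hypothetical
     * local) for the probed function: */
    unsigned long lep_offset = PPC64_LOCAL_ENTRY_OFFSET(gsym.st_other);
    tev->point.offset += lep_offset;    /* move the probe from GEP to LEP */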
This way we fall back to probe on LEP for all cases. After patch: $ sudo ./perf probe 'do_sys_open filename:string' $ sudo cat /sys/kernel/debug/tracing/kprobe_events p:probe/do_sys_open _text+3060904 filename_string=+0(%gpr4):string $ sudo ./perf record -a -e probe:do_sys_open ls [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.197 MB perf.data (11 samples) ] Signed-off-by: Ravi Bangoria Acked-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Ananth N Mavinakayanahalli Cc: Balbir Singh Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1470723805-5081-2-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c index c6d0f91..8d4dc97 100644 --- a/tools/perf/arch/powerpc/util/sym-handling.c +++ b/tools/perf/arch/powerpc/util/sym-handling.c @@ -54,10 +54,6 @@ int arch__compare_symbol_names(const char *namea, const char *nameb) #endif #if defined(_CALL_ELF) && _CALL_ELF == 2 -bool arch__prefers_symtab(void) -{ - return true; -} #ifdef HAVE_LIBELF_SUPPORT void arch__sym_update(struct symbol *s, GElf_Sym *sym) @@ -100,4 +96,27 @@ void arch__fix_tev_from_maps(struct perf_probe_event *pev, tev->point.offset += lep_offset; } } + +void arch__post_process_probe_trace_events(struct perf_probe_event *pev, + int ntevs) +{ + struct probe_trace_event *tev; + struct map *map; + struct symbol *sym = NULL; + struct rb_node *tmp; + int i = 0; + + map = get_target_map(pev->target, pev->uprobes); + if (!map || map__load(map, NULL) < 0) + return; + + for (i = 0; i < ntevs; i++) { + tev = &pev->tevs[i]; + map__for_each_symbol(map, sym, tmp) { + if (map->unmap_ip(map, sym->start) == tev->point.address) + arch__fix_tev_from_maps(pev, tev, map, sym); + } + } +} + #endif diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 234fbfb..2873396 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -180,7 +180,7 @@ static struct map *kernel_get_module_map(const char *module) return NULL; } -static struct map *get_target_map(const char *target, bool user) +struct map *get_target_map(const char *target, bool user) { /* Init maps of given executable or kernel */ if (user) @@ -705,19 +705,32 @@ post_process_kernel_probe_trace_events(struct probe_trace_event *tevs, return skipped; } +void __weak +arch__post_process_probe_trace_events(struct perf_probe_event *pev __maybe_unused, + int ntevs __maybe_unused) +{ +} + /* Post processing the probe events */ -static int post_process_probe_trace_events(struct probe_trace_event *tevs, +static int post_process_probe_trace_events(struct perf_probe_event *pev, + struct probe_trace_event *tevs, int ntevs, const char *module, bool uprobe) { - if (uprobe) - return add_exec_to_probe_trace_events(tevs, ntevs, module); + int ret; - if (module) + if (uprobe) + ret = add_exec_to_probe_trace_events(tevs, ntevs, module); + else if (module) /* Currently ref_reloc_sym based probe is not for drivers */ - return add_module_to_probe_trace_events(tevs, ntevs, module); + ret = add_module_to_probe_trace_events(tevs, ntevs, module); + else + ret = post_process_kernel_probe_trace_events(tevs, ntevs); - return post_process_kernel_probe_trace_events(tevs, ntevs); + if (ret >= 0) + arch__post_process_probe_trace_events(pev, ntevs); + + return ret; } /* Try to find perf_probe_event with debuginfo */ @@ -758,7 +771,7 @@ static int 
try_to_find_probe_trace_events(struct perf_probe_event *pev, if (ntevs > 0) { /* Succeeded to find trace events */ pr_debug("Found %d probe_trace_events.\n", ntevs); - ret = post_process_probe_trace_events(*tevs, ntevs, + ret = post_process_probe_trace_events(pev, *tevs, ntevs, pev->target, pev->uprobes); if (ret < 0 || ret == ntevs) { clear_probe_trace_events(*tevs, ntevs); @@ -2945,8 +2958,6 @@ errout: return err; } -bool __weak arch__prefers_symtab(void) { return false; } - /* Concatinate two arrays */ static void *memcat(void *a, size_t sz_a, void *b, size_t sz_b) { @@ -3167,12 +3178,6 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev, if (ret > 0 || pev->sdt) /* SDT can be found only in the cache */ return ret == 0 ? -ENOENT : ret; /* Found in probe cache */ - if (arch__prefers_symtab() && !perf_probe_event_need_dwarf(pev)) { - ret = find_probe_trace_events_from_map(pev, tevs); - if (ret > 0) - return ret; /* Found in symbol table */ - } - /* Convert perf_probe_event with debuginfo */ ret = try_to_find_probe_trace_events(pev, tevs); if (ret != 0) diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index e18ea9f..f4f45db 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -158,7 +158,6 @@ int show_line_range(struct line_range *lr, const char *module, bool user); int show_available_vars(struct perf_probe_event *pevs, int npevs, struct strfilter *filter); int show_available_funcs(const char *module, struct strfilter *filter, bool user); -bool arch__prefers_symtab(void); void arch__fix_tev_from_maps(struct perf_probe_event *pev, struct probe_trace_event *tev, struct map *map, struct symbol *sym); @@ -173,4 +172,9 @@ int e_snprintf(char *str, size_t size, const char *format, ...) int copy_to_probe_trace_arg(struct probe_trace_arg *tvar, struct perf_probe_arg *pvar); +struct map *get_target_map(const char *target, bool user); + +void arch__post_process_probe_trace_events(struct perf_probe_event *pev, + int ntevs); + #endif /*_PROBE_EVENT_H */ -- cgit v0.10.2 From 0b8f1e2e26bfc6b9abe3f0f3faba2cb0eecb9fb9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Aug 2016 14:37:24 +0200 Subject: perf/core: Fix sideband list-iteration vs. event ordering NULL pointer dereference crash Vegard Nossum reported that perf fuzzing generates a NULL pointer dereference crash: > Digging a bit deeper into this, it seems the event itself is getting > created by perf_event_open() and it gets added to the pmu_event_list > through: > > perf_event_open() > - perf_event_alloc() > - account_event() > - account_pmu_sb_event() > - attach_sb_event() > > so at this point the event is being attached but its ->ctx is still > NULL. It seems like ->ctx is set just a bit later in > perf_event_open(), though. > > But before that, __schedule() comes along and creates a stack trace > similar to the one above: > > __schedule() > - __perf_event_task_sched_out() > - perf_iterate_sb() > - perf_iterate_sb_cpu() > - event_filter_match() > - perf_cgroup_match() > - __get_cpu_context() > - (dereference ctx which is NULL) > > So I guess the question is... should the event be attached (= put on > the list) before ->ctx gets set? Or should the cgroup code check for a > NULL ->ctx? The latter seems like the simplest solution. Moving the list-add later creates a bit of a mess.
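The fix below relies on a store-release/load-acquire pairing; a minimal sketch of that publication pattern (the perf-specific details follow in the diff):

    /* publisher: every prior store to *event is visible before
     * event->ctx can be observed non-NULL ... */
    smp_store_release(&event->ctx, ctx);

    /* ... consumer: an acquire load that sees a non-NULL ctx is also
     * guaranteed to see a fully formed event; otherwise skip it */
    if (!smp_load_acquire(&event->ctx))
        continue;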
Reported-by: Vegard Nossum Tested-by: Vegard Nossum Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Fixes: f2fb6bef9251 ("perf/core: Optimize side-band event delivery") Link: http://lkml.kernel.org/r/20160804123724.GN6862@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index a19550d..87d02b8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1716,8 +1716,8 @@ static inline int pmu_filter_match(struct perf_event *event) static inline int event_filter_match(struct perf_event *event) { - return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event) && pmu_filter_match(event); + return (event->cpu == -1 || event->cpu == smp_processor_id()) && + perf_cgroup_match(event) && pmu_filter_match(event); } static void @@ -1737,8 +1737,8 @@ event_sched_out(struct perf_event *event, * maintained, otherwise bogus information is return * via read() for time_enabled, time_running: */ - if (event->state == PERF_EVENT_STATE_INACTIVE - && !event_filter_match(event)) { + if (event->state == PERF_EVENT_STATE_INACTIVE && + !event_filter_match(event)) { delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; @@ -2236,10 +2236,15 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); - event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; + /* + * Ensures that if we can observe event->ctx, both the event and ctx + * will be 'complete'. See perf_iterate_sb_cpu(). + */ + smp_store_release(&event->ctx, ctx); + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; @@ -5969,6 +5974,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) struct perf_event *event; list_for_each_entry_rcu(event, &pel->list, sb_list) { + /* + * Skip events that are not fully formed yet; ensure that + * if we observe event->ctx, both event and ctx will be + * complete enough. See perf_install_in_context(). + */ + if (!smp_load_acquire(&event->ctx)) + continue; + if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) -- cgit v0.10.2 From db4a835601b73cf8d6cd8986381d966b8e13d2d9 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Tue, 2 Aug 2016 00:48:12 -0700 Subject: perf/core: Set cgroup in CPU contexts for new cgroup events There's a perf stat bug easy to observe on a machine with only one cgroup: $ perf stat -e cycles -I 1000 -C 0 -G / # time counts unit events 1.000161699 cycles / 2.000355591 cycles / 3.000565154 cycles / 4.000951350 cycles / We'd expect some output there. The underlying problem is that there is an optimization in perf_cgroup_sched_{in,out}() that skips the switch of cgroup events if the old and new cgroups in a task switch are the same. This optimization interacts with the current code in two ways that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a cgroup event matches the current task. These are: 1. On creation of the first cgroup event in a CPU: In current code, cpuctx->cgrp is only set in perf_cgroup_sched_in, but due to the aforesaid optimization, perf_cgroup_sched_in will not run until the next cgroup switches in that CPU. This may happen late or never happen, depending on system's number of cgroups, CPU load, etc. 2.
On deletion of the last cgroup event in a cpuctx: In list_del_event, cpuctx->cgrp is set to NULL. Any new cgroup event will not be sched in because cpuctx->cgrp == NULL until a cgroup switch occurs and perf_cgroup_sched_in is executed (updating cpuctx->cgrp). This patch fixes both problems by setting cpuctx->cgrp in list_add_event, mirroring what list_del_event does when removing a cgroup event from CPU context, as introduced in: commit 68cacd29167b ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()") With this patch, cpuctx->cgrp is always set/cleared when installing/removing the first/last cgroup event in/from the CPU context. With cpuctx->cgrp correctly set, event_filter_match works as intended when events are sched in/out. After the fix, the output is as expected: $ perf stat -e cycles -I 1000 -a -G / # time counts unit events 1.004699159 627342882 cycles / 2.007397156 615272690 cycles / 3.010019057 616726074 cycles / Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1470124092-113192-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8ed43261..2b6b43c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -743,7 +743,9 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; +#ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ +#endif void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; }; @@ -769,7 +771,9 @@ struct perf_cpu_context { unsigned int hrtimer_active; struct pmu *unique_pmu; +#ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; +#endif }; struct perf_output_handle { diff --git a/kernel/events/core.c b/kernel/events/core.c index 87d02b8..1903b8f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event, } } } + +/* + * Update cpuctx->cgrp so that it is set when first cgroup event is added and + * cleared when last cgroup event is removed. + */ +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) + return; + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + /* + * Because cgroup events are always per-cpu events, + * this will always be called from the right CPU. + */ + cpuctx = __get_cpu_context(ctx); + cpuctx->cgrp = add ?
event->cgrp : NULL; +} + #else /* !CONFIG_CGROUP_PERF */ static inline bool @@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { } + +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ +} + #endif /* @@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); @@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) - ctx->nr_cgroups++; + list_update_cgroup_event(event, ctx, true); list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; @@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx; - WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - /* - * Because cgroup events are always per-cpu events, this will - * always be called from the right CPU. - */ - cpuctx = __get_cpu_context(ctx); - /* - * If there are no more cgroup events then clear cgrp to avoid - * stale pointer in update_cgrp_time_from_cpuctx(). - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } + list_update_cgroup_event(event, ctx, false); ctx->nr_events--; if (event->attr.inherit_stat) -- cgit v0.10.2 From 68187872c76a96ed4db7bfb064272591f02e208b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 11 Aug 2016 17:45:21 +0200 Subject: uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions Since instruction decoder now supports EVEX-encoded instructions, two fixes are needed to correctly handle them in uprobes. Extended bits for MODRM.rm field need to be sanitized just like we do it for VEX3, to avoid encoding wrong register for register-relative access. EVEX has _two_ extended bits: b and x. Theoretically, EVEX.x should be ignored by the CPU (since GPRs go only up to 15, not 31), but let's be paranoid here: proper encoding for register-relative access should have EVEX.x = 1. Secondly, we should fetch vex.vvvv for EVEX too. This is now super easy because instruction decoder populates vex_prefix.bytes[2] for all flavors of (e)vex encodings, even for VEX2. Signed-off-by: Denys Vlasenko Acked-by: Masami Hiramatsu Acked-by: Srikar Dronamraju Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Jim Keniston Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Cc: # v4.1+ Fixes: 8a764a875fe3 ("x86/asm/decoder: Create artificial 3rd byte for 2-byte VEX") Link: http://lkml.kernel.org/r/20160811154521.20469-1-dvlasenk@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c1ff31..495c776 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) *cursor &= 0xfe; } /* - * Similar treatment for VEX3 prefix. - * TODO: add XOP/EVEX treatment when insn decoder supports them + * Similar treatment for VEX3/EVEX prefix. + * TODO: add XOP treatment when insn decoder supports them */ - if (insn->vex_prefix.nbytes == 3) { + if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5 rvvvvLpp (has no b bit) * vex3/xop: c4/8f rxbmmmmm wvvvvLpp * evex: 62 rxbR00mm wvvvv1pp zllBVaaa - * (evex will need setting of both b and x since - * in non-sib encoding evex.x is 4th bit of MODRM.rm) - * Setting VEX3.b (setting because it has inverted meaning): + * Setting VEX3.b (setting because it has inverted meaning). + * Setting EVEX.x since (in non-SIB encoding) EVEX.x + * is the 4th bit of MODRM.rm, and needs the same treatment. + * For VEX3-encoded insns, VEX3.x value has no effect in + * non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; - *cursor |= 0x20; + *cursor |= 0x60; } /* @@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff; /* Fetch vex.vvvv */ - if (insn->vex_prefix.nbytes == 2) - reg2 = insn->vex_prefix.bytes[1]; - else if (insn->vex_prefix.nbytes == 3) + if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* - * TODO: add XOP, EXEV vvvv reading. + * TODO: add XOP vvvv reading. * * vex.vvvv field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. -- cgit v0.10.2 From 10e9e7bd598f9a66a11a22514c68c13c41fc821b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 11 Aug 2016 07:30:20 -0700 Subject: perf/x86/intel/uncore: Fix uncore num_counters Some uncore boxes' num_counters values for Haswell and Broadwell servers are not correct (too large, off by one). This issue was found by comparing the code with the document. Although there is no bug report from users yet, accessing non-existent counters is dangerous and the behavior is undefined: it may cause miscounting or even crashes. This patch makes them consistent with the uncore document.
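To see why the off-by-one is dangerous, a hedged sketch of how generic uncore code walks a box's counters (illustrative only; the real MSR/PCI accessors also fold in per-box offsets):

    /* counter registers are laid out contiguously from perf_ctr, and
     * the generic code trusts indices 0 .. num_counters-1 to exist: */
    for (idx = 0; idx < box->pmu->type->num_counters; idx++)
        wrmsrl(box->pmu->type->perf_ctr + idx, 0);
    /* with num_counters one too large, the last iteration touches a
     * register the box does not implement */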
Reported-by: Lukasz Odzioba Signed-off-by: Kan Liang Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Link: http://lkml.kernel.org/r/1470925820-59847-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 824e540..8aee83b 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2626,7 +2626,7 @@ void hswep_uncore_cpu_init(void) static struct intel_uncore_type hswep_uncore_ha = { .name = "ha", - .num_counters = 5, + .num_counters = 4, .num_boxes = 2, .perf_ctr_bits = 48, SNBEP_UNCORE_PCI_COMMON_INIT(), @@ -2645,7 +2645,7 @@ static struct uncore_event_desc hswep_uncore_imc_events[] = { static struct intel_uncore_type hswep_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, @@ -2691,7 +2691,7 @@ static struct intel_uncore_type hswep_uncore_irp = { static struct intel_uncore_type hswep_uncore_qpi = { .name = "qpi", - .num_counters = 5, + .num_counters = 4, .num_boxes = 3, .perf_ctr_bits = 48, .perf_ctr = SNBEP_PCI_PMON_CTR0, @@ -2773,7 +2773,7 @@ static struct event_constraint hswep_uncore_r3qpi_constraints[] = { static struct intel_uncore_type hswep_uncore_r3qpi = { .name = "r3qpi", - .num_counters = 4, + .num_counters = 3, .num_boxes = 3, .perf_ctr_bits = 44, .constraints = hswep_uncore_r3qpi_constraints, @@ -2972,7 +2972,7 @@ static struct intel_uncore_type bdx_uncore_ha = { static struct intel_uncore_type bdx_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, -- cgit v0.10.2 From 95f3be798472f63b495ca4712af005ea5ac7aa47 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 11 Aug 2016 07:31:14 -0700 Subject: perf/x86/intel/uncore: Add enable_box for client MSR uncore There are bug reports about miscounting uncore counters on some client machines like Sandybridge, Broadwell and Skylake. It is very likely to be observed on idle systems. This issue is caused by a hardware issue. PERF_GLOBAL_CTL could be cleared after Package C7, and nothing will be counted. The related errata (HSD 158) can be found in: www.intel.com/content/dam/www/public/us/en/documents/specification-updates/4th-gen-core-family-desktop-specification-update.pdf This patch tries to work around this issue by re-enabling PERF_GLOBAL_CTL in ->enable_box(). The workaround does not cover all cases. It helps for new events after returning from C7. But it cannot prevent C7; it will still miscount if a counter is already active. There is no drawback in leaving it enabled, so it does not need disable_box() here.
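A sketch of where the new callback sits (the generic uncore core invokes it roughly when a box transitions from idle to having active events):

    static void uncore_enable_box(struct intel_uncore_box *box)
    {
        if (box->pmu->type->ops->enable_box)
            box->pmu->type->ops->enable_box(box);
    }

so an unconditional, idempotent write of PERF_GLOBAL_CTL in ->enable_box() cheaply undoes any clear that happened while the box was idle.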
Signed-off-by: Kan Liang Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1470925874-59943-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 97a69db..9d35ec0 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -100,6 +100,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void snb_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); +} + static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -127,6 +133,7 @@ static struct attribute_group snb_uncore_format_group = { static struct intel_uncore_ops snb_uncore_msr_ops = { .init_box = snb_uncore_msr_init_box, + .enable_box = snb_uncore_msr_enable_box, .exit_box = snb_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, @@ -192,6 +199,12 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void skl_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); +} + static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -200,6 +213,7 @@ static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) static struct intel_uncore_ops skl_uncore_msr_ops = { .init_box = skl_uncore_msr_init_box, + .enable_box = skl_uncore_msr_enable_box, .exit_box = skl_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, -- cgit v0.10.2