From 93ec4ce789995c5c58dff82193b3ec77caa8aecb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Jul 2015 17:48:23 -0300 Subject: perf tools: Asprintf-like functions to format integer filter expressions char *asprintf_expr_in_ints(const char *var, size_t nints, int *ints); char *asprintf_expr_not_in_ints(const char *var, size_t nints, int *ints); Example of output formatted with those functions: # ./tp_filter 6 12 2015 asprintf_expr_in_ints: id == 6 || id == 12 || id == 2015 asprintf_expr_not_in_ints: id != 6 && id != 12 && id != 2015 # It'll be used with, for instance, perf_evsel__set_filter_in_ints(), which will in turn be used to ask the kernel to filter out all raw_syscalls:* except for the ones specified by the user via: $ perf trace -e some,list,of,syscalls Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Don Zickus Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-jt07vfp6bd8y50c05j1t7hrn@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c index 6afd610..fc8781d 100644 --- a/tools/perf/util/string.c +++ b/tools/perf/util/string.c @@ -357,3 +357,42 @@ void *memdup(const void *src, size_t len) return p; } + +char *asprintf_expr_inout_ints(const char *var, bool in, size_t nints, int *ints) +{ + /* + * FIXME: replace this with an expression using log10() when we + * find a suitable implementation, maybe the one in the dvb drivers... + * + * "%s == %d || " = log10(MAXINT) * 2 + 8 chars for the operators + */ + size_t size = nints * 28 + 1; /* \0 */ + size_t i, printed = 0; + char *expr = malloc(size); + + if (expr) { + const char *or_and = "||", *eq_neq = "=="; + char *e = expr; + + if (!in) { + or_and = "&&"; + eq_neq = "!="; + } + + for (i = 0; i < nints; ++i) { + if (printed == size) + goto out_err_overflow; + + if (i > 0) + printed += snprintf(e + printed, size - printed, " %s ", or_and); + printed += scnprintf(e + printed, size - printed, + "%s %s %d", var, eq_neq, ints[i]); + } + } + + return expr; + +out_err_overflow: + free(expr); + return NULL; +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 8bce58b..20d625a 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -339,4 +339,16 @@ int gzip_decompress_to_file(const char *input, int output_fd); int lzma_decompress_to_file(const char *input, int output_fd); #endif +char *asprintf_expr_inout_ints(const char *var, bool in, size_t nints, int *ints); + +static inline char *asprintf_expr_in_ints(const char *var, size_t nints, int *ints) +{ + return asprintf_expr_inout_ints(var, true, nints, ints); +} + +static inline char *asprintf_expr_not_in_ints(const char *var, size_t nints, int *ints) +{ + return asprintf_expr_inout_ints(var, false, nints, ints); +} + #endif /* GIT_COMPAT_UTIL_H */ -- cgit v0.10.2 From c27366f0f9dac552251a92453e4dc65164d530d4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Jul 2015 18:24:51 -0300 Subject: perf trace: Remember which are the syscalls tracepoint evsels We will need to set filters on them.
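A hedged, standalone sketch of how the asprintf_expr_*() helpers added above behave, buildable with libc alone. scnprintf() is perf-internal, so a minimal stand-in is defined here (an assumption, not the perf implementation), and the helper body is a compacted version of the one in the patch:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdarg.h>
	#include <stdbool.h>

	/* Stand-in for perf's scnprintf(): returns the number of characters
	 * actually written, never more than size - 1, instead of snprintf()'s
	 * would-be length. */
	static size_t scnprintf(char *buf, size_t size, const char *fmt, ...)
	{
		va_list args;
		int i;

		va_start(args, fmt);
		i = vsnprintf(buf, size, fmt, args);
		va_end(args);

		if (i < 0)
			return 0;
		return (size_t)i < size ? (size_t)i : size - 1;
	}

	/* Compacted version of the helper in the patch above; 28 bytes per
	 * int is enough for "%s != %d && " with a short var name and INT_MIN. */
	static char *asprintf_expr_inout_ints(const char *var, bool in, size_t nints, int *ints)
	{
		size_t size = nints * 28 + 1, i, printed = 0;
		char *expr = malloc(size);

		if (expr) {
			const char *or_and = in ? "||" : "&&", *eq_neq = in ? "==" : "!=";

			for (i = 0; i < nints; ++i) {
				if (i > 0)
					printed += scnprintf(expr + printed, size - printed, " %s ", or_and);
				printed += scnprintf(expr + printed, size - printed,
						     "%s %s %d", var, eq_neq, ints[i]);
			}
		}
		return expr;
	}

	int main(void)
	{
		int ints[] = { 6, 12, 2015 };
		char *in = asprintf_expr_inout_ints("id", true, 3, ints);
		char *not_in = asprintf_expr_inout_ints("id", false, 3, ints);

		if (!in || !not_in)
			return 1;
		printf("asprintf_expr_in_ints: %s\n", in);         /* id == 6 || id == 12 || id == 2015 */
		printf("asprintf_expr_not_in_ints: %s\n", not_in); /* id != 6 && id != 12 && id != 2015 */
		free(in);
		free(not_in);
		return 0;
	}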
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Don Zickus Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-u8hpgjpf3w8o1prnnjnwegwf@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 39ad4d0..431e297 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -247,42 +247,6 @@ out_delete: ({ struct syscall_tp *fields = evsel->priv; \ fields->name.pointer(&fields->name, sample); }) -static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist, - void *sys_enter_handler, - void *sys_exit_handler) -{ - int ret = -1; - struct perf_evsel *sys_enter, *sys_exit; - - sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler); - if (sys_enter == NULL) - goto out; - - if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args)) - goto out_delete_sys_enter; - - sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler); - if (sys_exit == NULL) - goto out_delete_sys_enter; - - if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret)) - goto out_delete_sys_exit; - - perf_evlist__add(evlist, sys_enter); - perf_evlist__add(evlist, sys_exit); - - ret = 0; -out: - return ret; - -out_delete_sys_exit: - perf_evsel__delete_priv(sys_exit); -out_delete_sys_enter: - perf_evsel__delete_priv(sys_enter); - goto out; -} - - struct syscall_arg { unsigned long val; struct thread *thread; @@ -1307,6 +1271,10 @@ struct trace { struct { int max; struct syscall *table; + struct { + struct perf_evsel *enter, + *exit; + } events; } syscalls; struct record_opts opts; struct perf_evlist *evlist; @@ -2283,6 +2251,44 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st } } +static int trace__add_syscall_newtp(struct trace *trace) +{ + int ret = -1; + struct perf_evlist *evlist = trace->evlist; + struct perf_evsel *sys_enter, *sys_exit; + + sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter); + if (sys_enter == NULL) + goto out; + + if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args)) + goto out_delete_sys_enter; + + sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit); + if (sys_exit == NULL) + goto out_delete_sys_enter; + + if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret)) + goto out_delete_sys_exit; + + perf_evlist__add(evlist, sys_enter); + perf_evlist__add(evlist, sys_exit); + + trace->syscalls.events.enter = sys_enter; + trace->syscalls.events.exit = sys_exit; + + ret = 0; +out: + return ret; + +out_delete_sys_exit: + perf_evsel__delete_priv(sys_exit); +out_delete_sys_enter: + perf_evsel__delete_priv(sys_enter); + goto out; +} + + static int trace__run(struct trace *trace, int argc, const char **argv) { struct perf_evlist *evlist = trace->evlist; @@ -2293,9 +2299,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) trace->live = true; - if (trace->trace_syscalls && - perf_evlist__add_syscall_newtp(evlist, trace__sys_enter, - trace__sys_exit)) + if (trace->trace_syscalls && trace__add_syscall_newtp(trace)) goto out_error_raw_syscalls; if (trace->trace_syscalls) -- cgit v0.10.2 From 8b3ce7576598b8e3b746beb9e8d10c33bcc64edd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Jul 2015 18:28:11 -0300 Subject: perf trace: Store the syscall ids for the event qualifiers in a table That we will use to set a filter on raw_syscalls:sys_{enter,exit} events. 
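The table is filled with syscall ids resolved by libaudit, the same call used in trace__validate_ev_qualifier() in the diff below. A minimal standalone sketch of that name-to-id resolution (link with -laudit):

	#include <libaudit.h>
	#include <stdio.h>

	int main(int argc, char *argv[])
	{
		int machine = audit_detect_machine();
		int i;

		for (i = 1; i < argc; i++) {
			int id = audit_name_to_syscall(argv[i], machine);

			if (id < 0)
				fprintf(stderr, "invalid syscall: %s\n", argv[i]);
			else
				printf("%s -> id %d\n", argv[i], id); /* e.g. open -> id 2 on x86_64 */
		}
		return 0;
	}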
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Don Zickus Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-2acxrcxyu7tlolrfilpty38y@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 431e297..b106086 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1272,8 +1272,8 @@ struct trace { int max; struct syscall *table; struct { - struct perf_evsel *enter, - *exit; + struct perf_evsel *sys_enter, + *sys_exit; } events; } syscalls; struct record_opts opts; @@ -1284,6 +1284,10 @@ struct trace { FILE *output; unsigned long nr_events; struct strlist *ev_qualifier; + struct { + size_t nr; + int *entries; + } ev_qualifier_ids; const char *last_vfs_getname; struct intlist *tid_list; struct intlist *pid_list; @@ -1587,13 +1591,27 @@ static int trace__read_syscall_info(struct trace *trace, int id) static int trace__validate_ev_qualifier(struct trace *trace) { - int err = 0; + int err = 0, i; struct str_node *pos; + trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier); + trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr * + sizeof(trace->ev_qualifier_ids.entries[0])); + + if (trace->ev_qualifier_ids.entries == NULL) { + fputs("Error:\tNot enough memory for allocating events qualifier ids\n", + trace->output); + err = -EINVAL; + goto out; + } + + i = 0; + strlist__for_each(pos, trace->ev_qualifier) { const char *sc = pos->s; + int id = audit_name_to_syscall(sc, trace->audit.machine); - if (audit_name_to_syscall(sc, trace->audit.machine) < 0) { + if (id < 0) { if (err == 0) { fputs("Error:\tInvalid syscall ", trace->output); err = -EINVAL; @@ -1603,13 +1621,17 @@ static int trace__validate_ev_qualifier(struct trace *trace) fputs(sc, trace->output); } + + trace->ev_qualifier_ids.entries[i++] = id; } if (err < 0) { fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'" "\nHint:\tand: 'man syscalls'\n", trace->output); + zfree(&trace->ev_qualifier_ids.entries); + trace->ev_qualifier_ids.nr = 0; } - +out: return err; } @@ -2274,8 +2296,8 @@ static int trace__add_syscall_newtp(struct trace *trace) perf_evlist__add(evlist, sys_enter); perf_evlist__add(evlist, sys_exit); - trace->syscalls.events.enter = sys_enter; - trace->syscalls.events.exit = sys_exit; + trace->syscalls.events.sys_enter = sys_enter; + trace->syscalls.events.sys_exit = sys_exit; ret = 0; out: -- cgit v0.10.2 From f47805a2af3ba83881ca52434bbbc6e9886b72fd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 3 Jul 2015 15:53:49 -0300 Subject: perf evsel: Rename set_filter to apply_filter We need to be able to go on constructing a complex filter in multiple stages, since we can only set one filter per event. For instance, we need to be able, in 'perf trace' to filter by the 'common_pid' field all the time, if only for the tracer itself, to avoid a feedback loop, and, in addition, we may want to filter the raw_syscalls:sys_{enter,exit} events by its 'id' filter, when using 'perf trace -e open,close' or 'perf trace -e !open,close', i.e. when we are interested in just a subset of syscalls or when we are not interested in it. So we will have: perf_evsel__set_filter(evsel, char *filter) Replaces whatever is in evsel->filter. perf_evsel__append_filter(evsel, const char *op, char *filter) Appends, using op ("&&" or "||") with what is in evsel->filter. 
perf_evsel__apply_filter(evsel, filter): That actually applies a filter, be it the one being constructed in evsel->filter, or any other, for tools with more specific ways to build the filter, issuing the appropriate ioctl for all the evsel fds. The same changes will be made to the evlist__{set,apply} variants to keep everything consistent. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Don Zickus Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-2s5z9xtpnc2lwio3cv5x0jek@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6cfdee6..5191328 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1161,7 +1161,7 @@ int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **e if (evsel->filter == NULL) continue; - err = perf_evsel__set_filter(evsel, ncpus, nthreads, evsel->filter); + err = perf_evsel__apply_filter(evsel, ncpus, nthreads, evsel->filter); if (err) { *err_evsel = evsel; break; @@ -1179,7 +1179,7 @@ int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter) nthreads = thread_map__nr(evlist->threads); evlist__for_each(evlist, evsel) { - err = perf_evsel__set_filter(evsel, ncpus, nthreads, filter); + err = perf_evsel__apply_filter(evsel, ncpus, nthreads, filter); if (err) break; } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 2936b30..6676774 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -815,8 +815,8 @@ static int perf_evsel__run_ioctl(struct perf_evsel *evsel, int ncpus, int nthrea return 0; } -int perf_evsel__set_filter(struct perf_evsel *evsel, int ncpus, int nthreads, - const char *filter) +int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads, + const char *filter) { return perf_evsel__run_ioctl(evsel, ncpus, nthreads, PERF_EVENT_IOC_SET_FILTER, diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4a7ed56..a3ebb83 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -182,8 +182,8 @@ void __perf_evsel__reset_sample_bit(struct perf_evsel *evsel, void perf_evsel__set_sample_id(struct perf_evsel *evsel, bool use_sample_identifier); -int perf_evsel__set_filter(struct perf_evsel *evsel, int ncpus, int nthreads, - const char *filter); +int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads, + const char *filter); int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__open_per_cpu(struct perf_evsel *evsel, -- cgit v0.10.2 From 12467ae4e37aab5e263cc9ab6b6909ae0588844c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 3 Jul 2015 17:05:50 -0300 Subject: perf evsel: Introduce set_filter method Replaces existing filter string with the one provided. 
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Don Zickus Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-jst49z83li0yx3g18o54u51a@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 6676774..ae925b9 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -823,6 +823,19 @@ int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads, (void *)filter); } +int perf_evsel__set_filter(struct perf_evsel *evsel, const char *filter) +{ + char *new_filter = strdup(filter); + + if (new_filter != NULL) { + free(evsel->filter); + evsel->filter = new_filter; + return 0; + } + + return -1; +} + int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads) { return perf_evsel__run_ioctl(evsel, ncpus, nthreads, diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a3ebb83..8aea4ba 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -182,6 +182,7 @@ void __perf_evsel__reset_sample_bit(struct perf_evsel *evsel, void perf_evsel__set_sample_id(struct perf_evsel *evsel, bool use_sample_identifier); +int perf_evsel__set_filter(struct perf_evsel *evsel, const char *filter); int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads, const char *filter); int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 09f8d23..a71eeb2 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1177,8 +1177,7 @@ int parse_filter(const struct option *opt, const char *str, return -1; } - last->filter = strdup(str); - if (last->filter == NULL) { + if (perf_evsel__set_filter(last, str) < 0) { fprintf(stderr, "not enough memory to hold filter string\n"); return -1; } -- cgit v0.10.2 From 94ad89bc8a61b79c416c180c51b3e61ac16f6124 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 3 Jul 2015 17:42:03 -0300 Subject: perf evlist: Make perf_evlist__set_filter use perf_evsel__set_filter Instead of calling perf_evsel__apply_filter straight away, so that we can, in the next patches, expand the filter with more conditions before actually calling the ioctl to pass the end result filter to the kernel. Now we need to call perf_evlist__apply_filters() after the filter is completely set up, i.e. do the ioctl calls. The perf_evlist__apply_filters() method was already in place, because that is the model for the other tools that receive filters on the command line: go on setting them in evsel->filter and only at the end, after parsing the whole command line, apply them. We get, as a bonus, a more expressive message that states which event, if any, failed to have its filter applied, with an error message stating what happened.
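Applying, in turn, boils down to one PERF_EVENT_IOC_SET_FILTER ioctl per event fd. A hedged standalone sketch of just that step (the tracepoint id 16 is an assumption for raw_syscalls:sys_enter on one machine; read the real one from .../tracing/events/raw_syscalls/sys_enter/id, and note that opening tracepoints usually requires root):

	#include <linux/perf_event.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <string.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.type = PERF_TYPE_TRACEPOINT;
		attr.size = sizeof(attr);
		attr.config = 16;	/* assumed id of raw_syscalls:sys_enter */
		attr.sample_period = 1;

		/* pid = 0 (self), cpu = -1 (any), no group, no flags */
		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}

		/* What perf_evsel__apply_filter() does for each fd: */
		if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "id != 1") < 0)
			perror("PERF_EVENT_IOC_SET_FILTER");

		close(fd);
		return 0;
	}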
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Don Zickus Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-f429pgz75ryz7tpe6v74etre@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index b106086..c02b65a 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2314,6 +2314,7 @@ out_delete_sys_enter: static int trace__run(struct trace *trace, int argc, const char **argv) { struct perf_evlist *evlist = trace->evlist; + struct perf_evsel *evsel; int err = -1, i; unsigned long before; const bool forks = argc > 0; @@ -2382,10 +2383,12 @@ static int trace__run(struct trace *trace, int argc, const char **argv) else if (thread_map__pid(evlist->threads, 0) == -1) err = perf_evlist__set_filter_pid(evlist, getpid()); - if (err < 0) { - printf("err=%d,%s\n", -err, strerror(-err)); - exit(1); - } + if (err < 0) + goto out_error_mem; + + err = perf_evlist__apply_filters(evlist, &evsel); + if (err < 0) + goto out_error_apply_filters; err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false); if (err < 0) @@ -2488,6 +2491,13 @@ out_error_open: out_error: fprintf(trace->output, "%s\n", errbuf); goto out_delete_evlist; + +out_error_apply_filters: + fprintf(trace->output, + "Failed to set filter \"%s\" on event %s with %d (%s)\n", + evsel->filter, perf_evsel__name(evsel), errno, + strerror_r(errno, errbuf, sizeof(errbuf))); + goto out_delete_evlist; } out_error_mem: fprintf(trace->output, "Not enough memory to run!\n"); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 5191328..f7d9c77 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1175,11 +1175,9 @@ int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter) { struct perf_evsel *evsel; int err = 0; - const int ncpus = cpu_map__nr(evlist->cpus), - nthreads = thread_map__nr(evlist->threads); evlist__for_each(evlist, evsel) { - err = perf_evsel__apply_filter(evsel, ncpus, nthreads, filter); + err = perf_evsel__set_filter(evsel, filter); if (err) break; } -- cgit v0.10.2 From 64ec84f5f96ea1d043c01e32e49e45286c6f4160 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 4 Jul 2015 12:19:13 -0300 Subject: perf evsel: Introduce append_filter() method To allow building filters in evsel->filter, that will eventually be applied via perf_evsel__apply_filter(). 
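A libc-only sketch of how the two string-building methods compose, with the evsel struct reduced to just its filter member (asprintf() needs _GNU_SOURCE):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct sketch_evsel { char *filter; };

	/* Replace whatever filter is set, like perf_evsel__set_filter() */
	static int set_filter(struct sketch_evsel *evsel, const char *filter)
	{
		char *new_filter = strdup(filter);

		if (new_filter == NULL)
			return -1;
		free(evsel->filter);
		evsel->filter = new_filter;
		return 0;
	}

	/* Combine with the existing filter, like perf_evsel__append_filter() */
	static int append_filter(struct sketch_evsel *evsel, const char *op, const char *filter)
	{
		char *new_filter;

		if (evsel->filter == NULL)
			return set_filter(evsel, filter);
		if (asprintf(&new_filter, "(%s) %s (%s)", evsel->filter, op, filter) < 0)
			return -1;
		free(evsel->filter);
		evsel->filter = new_filter;
		return 0;
	}

	int main(void)
	{
		struct sketch_evsel evsel = { .filter = NULL };

		set_filter(&evsel, "common_pid != 42");
		append_filter(&evsel, "&&", "id != 3 && id != 1");
		puts(evsel.filter);	/* (common_pid != 42) && (id != 3 && id != 1) */
		free(evsel.filter);
		return 0;
	}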
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Don Zickus Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-sjfoes3pycx7nlpmgedca13v@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ae925b9..6cc97f3 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -836,6 +836,23 @@ int perf_evsel__set_filter(struct perf_evsel *evsel, const char *filter) return -1; } +int perf_evsel__append_filter(struct perf_evsel *evsel, + const char *op, const char *filter) +{ + char *new_filter; + + if (evsel->filter == NULL) + return perf_evsel__set_filter(evsel, filter); + + if (asprintf(&new_filter, "(%s) %s (%s)", evsel->filter, op, filter) > 0) { + free(evsel->filter); + evsel->filter = new_filter; + return 0; + } + + return -1; +} + int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads) { return perf_evsel__run_ioctl(evsel, ncpus, nthreads, diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 8aea4ba..fe9f327 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -183,6 +183,8 @@ void perf_evsel__set_sample_id(struct perf_evsel *evsel, bool use_sample_identifier); int perf_evsel__set_filter(struct perf_evsel *evsel, const char *filter); +int perf_evsel__append_filter(struct perf_evsel *evsel, + const char *op, const char *filter); int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads, const char *filter); int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads); -- cgit v0.10.2 From 19867b6186f3fd38be65d86e20ef6f49d0caaa0b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 4 Jul 2015 12:44:59 -0300 Subject: perf trace: Use event filters for the event qualifier list We use raw_syscalls:sys_{enter,exit} events to show the syscalls, but were using a rather lazy/inefficient way to implement our 'strace -e' equivalent: filtering out after reading the events in the ring buffer. Deflect more work to the kernel by appending a filter expression that, together with the pid list (which is always present, if only to filter out the tracer itself), reduces pressure on the ring buffer and otherwise uses infrastructure already in place in the kernel to do early filtering.
If we use it with -v we can see the filter passed to the kernel, for instance, for this contrived case: # trace -v -e \!open,close,write,poll,recvfrom,select,recvmsg,writev,sendmsg,read,futex,epoll_wait,ioctl,eventfd --filter-pids 2189,2566,1398,2692,4475,4532 (common_pid != 2514 && common_pid != 1398 && common_pid != 2189 && common_pid != 2566 && common_pid != 2692 && common_pid != 4475 && common_pid != 4532) && (id != 3 && id != 232 && id != 284 && id != 202 && id != 16 && id != 2 && id != 7 && id != 0 && id != 45 && id != 47 && id != 23 && id != 46 && id != 1 && id != 20) 0.011 (0.011 ms): caribou/2295 eventfd2(flags: CLOEXEC|NONBLOCK) = 18 16.946 (0.019 ms): caribou/2295 eventfd2(flags: CLOEXEC|NONBLOCK) = 18 38.598 (0.167 ms): chronyd/794 socket(family: INET, type: DGRAM ) = 4 38.603 (0.002 ms): chronyd/794 fcntl(fd: 4, cmd: GETFD) = 0 38.605 (0.001 ms): chronyd/794 fcntl(fd: 4, cmd: SETFD, arg: 1) = 0 ^C # Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Don Zickus Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-ti2tg18atproqpguc2moinp6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index c02b65a..0ebf55b 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1187,7 +1187,6 @@ struct syscall { int nr_args; struct format_field *args; const char *name; - bool filtered; bool is_exit; struct syscall_fmt *fmt; size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); @@ -1550,19 +1549,6 @@ static int trace__read_syscall_info(struct trace *trace, int id) sc = trace->syscalls.table + id; sc->name = name; - if (trace->ev_qualifier) { - bool in = strlist__find(trace->ev_qualifier, name) != NULL; - - if (!(in ^ trace->not_ev_qualifier)) { - sc->filtered = true; - /* - * No need to do read tracepoint information since this will be - * filtered out.
- */ - return 0; - } - } - sc->fmt = syscall_fmt__find(sc->name); snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name); @@ -1823,9 +1809,6 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, if (sc == NULL) return -1; - if (sc->filtered) - return 0; - thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); ttrace = thread__trace(thread, trace->output); if (ttrace == NULL) @@ -1881,9 +1864,6 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, if (sc == NULL) return -1; - if (sc->filtered) - return 0; - thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); ttrace = thread__trace(thread, trace->output); if (ttrace == NULL) @@ -2310,6 +2290,26 @@ out_delete_sys_enter: goto out; } +static int trace__set_ev_qualifier_filter(struct trace *trace) +{ + int err = -1; + char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier, + trace->ev_qualifier_ids.nr, + trace->ev_qualifier_ids.entries); + + if (filter == NULL) + goto out_enomem; + + if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter)) + err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter); + + free(filter); +out: + return err; +out_enomem: + errno = ENOMEM; + goto out; +} static int trace__run(struct trace *trace, int argc, const char **argv) { @@ -2386,6 +2386,14 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (err < 0) goto out_error_mem; + if (trace->ev_qualifier_ids.nr > 0) { + err = trace__set_ev_qualifier_filter(trace); + if (err < 0) + goto out_errno; + } + + pr_debug("%s\n", trace->syscalls.events.sys_exit->filter); + err = perf_evlist__apply_filters(evlist, &evsel); if (err < 0) goto out_error_apply_filters; @@ -2502,6 +2510,10 @@ out_error_apply_filters: out_error_mem: fprintf(trace->output, "Not enough memory to run!\n"); goto out_delete_evlist; + +out_errno: + fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno)); + goto out_delete_evlist; } static int trace__replay(struct trace *trace) -- cgit v0.10.2 From 200802a4a79b3a7a1cf924ad870d3265023d0787 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 4 Jul 2015 07:44:22 +0200 Subject: perf probe: Delete an unnecessary check before the function call "strfilter__delete" The strfilter__delete() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Acked-by: Masami Hiramatsu Cc: Julia Lawall Cc: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/5597751A.5000506@users.sourceforge.net Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 1272559..b81cec3 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -297,8 +297,7 @@ static void cleanup_params(void) clear_perf_probe_event(params.events + i); line_range__clear(¶ms.line_range); free(params.target); - if (params.filter) - strfilter__delete(params.filter); + strfilter__delete(params.filter); memset(¶ms, 0, sizeof(params)); } -- cgit v0.10.2 From 3abebc55d70b6e3247d1f0e34c0bb906e40d2a18 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 6 Jul 2015 14:51:01 +0300 Subject: perf record: Let user have timestamps with per-thread recording If the option -T is used with option --per-thread, then time is still not sampled. 
Fix that by using OPT_BOOLEAN_SET to distinguish when the user used the -T option as opposed to the default case when timestamps are enabled but only for per-cpu recording. Signed-off-by: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1436183461-1918-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index de165a1..283fe96 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1030,7 +1030,9 @@ struct option __record_options[] = { OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, "per thread counts"), OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"), - OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Record the sample timestamps"), + OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, + &record.opts.sample_time_set, + "Record the sample timestamps"), OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"), OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples, "don't sample"), diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 4a5827ff..937b16a 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -51,6 +51,7 @@ struct record_opts { bool sample_address; bool sample_weight; bool sample_time; + bool sample_time_set; bool period; bool sample_intr_regs; bool running_time; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 6cc97f3..83c0803 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -707,7 +707,8 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) */ if (opts->sample_time && (!perf_missing_features.sample_id_all && - (!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu))) + (!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu || + opts->sample_time_set))) perf_evsel__set_sample_bit(evsel, TIME); if (opts->raw_samples && !evsel->no_aux_samples) { -- cgit v0.10.2 From ab85785aa13c36440a91a8e9f7616357de411a1f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 6 Jul 2015 08:51:41 -0300 Subject: tools lib api debugfs: Check for tracefs when reporting errors Now that we have two mountpoints, one for debugfs and another, for tracefs, we end up needing to check permissions for both, so, on a system with default config we were always asking the user to check the permission of the debugfs mountpoint, even when it was already sufficient. Fix it. 
E.g.: $ trace -e nanosleep usleep 1 Error: No permissions to read /sys/kernel/debug/tracing/events/raw_syscalls/sys_(enter|exit) Hint: Try 'sudo mount -o remount,mode=755 /sys/kernel/debug' $ sudo mount -o remount,mode=755 /sys/kernel/debug $ trace -e nanosleep usleep 1 Error: No permissions to read /sys/kernel/debug/tracing/events/raw_syscalls/sys_(enter|exit) Hint: Try 'sudo mount -o remount,mode=755 /sys/kernel/debug/tracing' $ sudo mount -o remount,mode=755 /sys/kernel/debug/tracing $ trace -e nanosleep usleep 1 0.326 ( 0.061 ms): usleep/11961 nanosleep(rqtp: 0x7ffef1081c50) = 0 $ Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-0viljeuhc7q84ic8kobsna43@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/api/fs/debugfs.c b/tools/lib/api/fs/debugfs.c index 8305b3e..eb7cf4d 100644 --- a/tools/lib/api/fs/debugfs.c +++ b/tools/lib/api/fs/debugfs.c @@ -12,6 +12,7 @@ #include #include "debugfs.h" +#include "tracefs.h" #ifndef DEBUGFS_DEFAULT_PATH #define DEBUGFS_DEFAULT_PATH "/sys/kernel/debug" @@ -94,11 +95,21 @@ int debugfs__strerror_open(int err, char *buf, size_t size, const char *filename "Hint:\tIs the debugfs filesystem mounted?\n" "Hint:\tTry 'sudo mount -t debugfs nodev /sys/kernel/debug'"); break; - case EACCES: + case EACCES: { + const char *mountpoint = debugfs_mountpoint; + + if (!access(debugfs_mountpoint, R_OK) && strncmp(filename, "tracing/", 8) == 0) { + const char *tracefs_mntpoint = tracefs_find_mountpoint(); + + if (tracefs_mntpoint) + mountpoint = tracefs_mntpoint; + } + snprintf(buf, size, "Error:\tNo permissions to read %s/%s\n" "Hint:\tTry 'sudo mount -o remount,mode=755 %s'\n", - debugfs_mountpoint, filename, debugfs_mountpoint); + debugfs_mountpoint, filename, mountpoint); + } break; default: snprintf(buf, size, "%s", strerror_r(err, sbuf, sizeof(sbuf))); -- cgit v0.10.2 From 3d5d68aabf4ff47cfb447ad90448ce50531be7f5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 8 Jul 2015 04:44:54 -0400 Subject: perf record: Document setting '-e pmu/period=N/' in man page The 'period' param is not defined in /sys/bus/event_sources/devices//format/*, but can be used, document it. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1436345097-11113-3-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 9b9d9d0..5b47b2c 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -45,6 +45,14 @@ OPTIONS param1 and param2 are defined as formats for the PMU in: /sys/bus/event_sources/devices//format/* + There are also some params which are not defined in ...//format/*. + These params can be used to set event defaults. + Here is a list of the params. + - 'period': Set event sampling period + + Note: If user explicitly sets options which conflict with the params, + the value set by the params will be overridden. + - a hardware breakpoint event in the form of '\mem:addr[/len][:access]' where addr is the address in memory you want to break in. 
Access is the memory access type (read, write, execute) it can -- cgit v0.10.2 From ab7322af8c620987ed058e39506c97e5f2d3c65c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 16 Jul 2015 11:08:34 -0300 Subject: perf strlist: load() should return a negative errno To match what its users return. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-jntpe2lwg1fxn1bku7uccan0@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c index 71f9d10..68ae673 100644 --- a/tools/perf/util/strlist.c +++ b/tools/perf/util/strlist.c @@ -72,7 +72,7 @@ int strlist__load(struct strlist *slist, const char *filename) FILE *fp = fopen(filename, "r"); if (fp == NULL) - return errno; + return -errno; while (fgets(entry, sizeof(entry), fp) != NULL) { const size_t len = strlen(entry); -- cgit v0.10.2 From 4a77e2183fc0260c0efc7adeccf933fef893ad5f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 20 Jul 2015 12:13:34 -0300 Subject: perf strlist: Make dupstr be the default and part of an extensible config parm So that we can pass more info to strlist__new() without having to change its function signature, just adding entries to the strlist_config struct with sensible defaults for when those fields are not specified. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-5uaaler4931i0s9sedxjquhq@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index d47a0cd..ddca990 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -363,7 +363,7 @@ int cmd_buildid_cache(int argc, const char **argv, setup_pager(); if (add_name_list_str) { - list = strlist__new(true, add_name_list_str); + list = strlist__new(add_name_list_str, NULL); if (list) { strlist__for_each(pos, list) if (build_id_cache__add_file(pos->s)) { @@ -381,7 +381,7 @@ int cmd_buildid_cache(int argc, const char **argv, } if (remove_name_list_str) { - list = strlist__new(true, remove_name_list_str); + list = strlist__new(remove_name_list_str, NULL); if (list) { strlist__for_each(pos, list) if (build_id_cache__remove_file(pos->s)) { @@ -399,7 +399,7 @@ int cmd_buildid_cache(int argc, const char **argv, } if (purge_name_list_str) { - list = strlist__new(true, purge_name_list_str); + list = strlist__new(purge_name_list_str, NULL); if (list) { strlist__for_each(pos, list) if (build_id_cache__purge_path(pos->s)) { @@ -420,7 +420,7 @@ int cmd_buildid_cache(int argc, const char **argv, ret = build_id_cache__fprintf_missing(session, stdout); if (update_name_list_str) { - list = strlist__new(true, update_name_list_str); + list = strlist__new(update_name_list_str, NULL); if (list) { strlist__for_each(pos, list) if (build_id_cache__update_file(pos->s)) { diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 0ebf55b..3a3173c 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2931,7 +2931,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) trace.not_ev_qualifier = *s == '!'; if (trace.not_ev_qualifier) ++s; - trace.ev_qualifier = strlist__new(true, s); + trace.ev_qualifier = strlist__new(s, NULL); if (trace.ev_qualifier == NULL) { fputs("Not enough memory to parse 
event qualifier", trace.output); diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 1f6fc23..f98c2ff 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -291,7 +291,7 @@ int build_id_cache__list_build_ids(const char *pathname, struct dirent *d; int ret = 0; - list = strlist__new(true, NULL); + list = strlist__new(NULL, NULL); dir_name = build_id_cache__dirname_from_path(pathname, false, false); if (!list || !dir_name) { ret = -ENOMEM; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 7ff6827..d0bf1e5 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -250,7 +250,7 @@ struct machine *machines__findnew(struct machines *machines, pid_t pid) static struct strlist *seen; if (!seen) - seen = strlist__new(true, NULL); + seen = strlist__new(NULL, NULL); if (!strlist__has_entry(seen, path)) { pr_err("Can't access file %s\n", path); diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 381f23a..7abaac4 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2066,7 +2066,7 @@ static struct strlist *get_probe_trace_command_rawlist(int fd) char *p; struct strlist *sl; - sl = strlist__new(true, NULL); + sl = strlist__new(NULL, NULL); fp = fdopen(dup(fd), "r"); while (!feof(fp)) { @@ -2362,7 +2362,7 @@ static struct strlist *get_probe_trace_event_names(int fd, bool include_group) rawlist = get_probe_trace_command_rawlist(fd); if (!rawlist) return NULL; - sl = strlist__new(true, NULL); + sl = strlist__new(NULL, NULL); strlist__for_each(ent, rawlist) { ret = parse_probe_trace_command(ent->s, &tev); if (ret < 0) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 2da65a7..7b80f8c 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1355,7 +1355,7 @@ static int add_available_vars(Dwarf_Die *sc_die, struct probe_finder *pf) vl->point.offset); /* Find local variables */ - vl->vars = strlist__new(true, NULL); + vl->vars = strlist__new(NULL, NULL); if (vl->vars == NULL) return -ENOMEM; af->child = true; diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c index 68ae673..dd40385 100644 --- a/tools/perf/util/strlist.c +++ b/tools/perf/util/strlist.c @@ -133,7 +133,7 @@ int strlist__parse_list(struct strlist *slist, const char *s) return *s ? strlist__parse_list_entry(slist, s) : 0; } -struct strlist *strlist__new(bool dupstr, const char *list) +struct strlist *strlist__new(const char *list, const struct strlist_config *config) { struct strlist *slist = malloc(sizeof(*slist)); @@ -143,7 +143,7 @@ struct strlist *strlist__new(bool dupstr, const char *list) slist->rblist.node_new = strlist__node_new; slist->rblist.node_delete = strlist__node_delete; - slist->dupstr = dupstr; + slist->dupstr = config ? 
!config->dont_dupstr : true; if (list && strlist__parse_list(slist, list) != 0) goto out_error; } diff --git a/tools/perf/util/strlist.h b/tools/perf/util/strlist.h index 5c7f870..a182785 100644 --- a/tools/perf/util/strlist.h +++ b/tools/perf/util/strlist.h @@ -16,7 +16,11 @@ struct strlist { bool dupstr; }; -struct strlist *strlist__new(bool dupstr, const char *slist); +struct strlist_config { + bool dont_dupstr; +}; + +struct strlist *strlist__new(const char *slist, const struct strlist_config *config); void strlist__delete(struct strlist *slist); void strlist__remove(struct strlist *slist, struct str_node *sn); diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 48b588c..db39388 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1906,7 +1906,7 @@ int setup_list(struct strlist **list, const char *list_str, if (list_str == NULL) return 0; - *list = strlist__new(true, list_str); + *list = strlist__new(list_str, NULL); if (!*list) { pr_err("problems parsing %s list\n", list_name); return -1; diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index da7646d..d526dea 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -196,7 +196,8 @@ static struct thread_map *thread_map__new_by_pid_str(const char *pid_str) pid_t pid, prev_pid = INT_MAX; char *end_ptr; struct str_node *pos; - struct strlist *slist = strlist__new(false, pid_str); + struct strlist_config slist_config = { .dont_dupstr = true, }; + struct strlist *slist = strlist__new(pid_str, &slist_config); if (!slist) return NULL; @@ -266,13 +267,14 @@ static struct thread_map *thread_map__new_by_tid_str(const char *tid_str) pid_t tid, prev_tid = INT_MAX; char *end_ptr; struct str_node *pos; + struct strlist_config slist_config = { .dont_dupstr = true, }; struct strlist *slist; /* perf-stat expects threads to be generated even if tid not given */ if (!tid_str) return thread_map__new_dummy(); - slist = strlist__new(false, tid_str); + slist = strlist__new(tid_str, &slist_config); if (!slist) return NULL; -- cgit v0.10.2 From 8ff9daf3c16769817d0eaf16154d9e9198ec1bda Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 17 Jul 2015 12:07:25 -0300 Subject: perf strlist: Allow substitutions from file contents in a given directory So, if we have an strlist equal to: "file,close" And we call it as: struct strlist_config *config = { .dirname = "~/strace/groups", }; struct strlist *slist = strlist__new("file, close", &config); And we have: $ cat ~/strace/groups/file access open openat statfs Then the resulting strlist will have these contents: [ "access", "open", "openat", "statfs", "close" ] This will be used to implement strace syscall groups in 'perf trace', but can be used in some other tool, thus being implemented in 'strlist'. 
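In compilable form, inside the perf source tree, the usage sketched above would look like the snippet below; the directory is an assumption, and it must be a real, already-expanded path, since libc's access() does no '~' expansion:

	#include "util/strlist.h"

	static struct strlist *file_group_syscalls(void)
	{
		const struct strlist_config config = {
			.dirname = "/home/acme/strace/groups", /* assumed path */
		};

		/* "file" is replaced by the contents of dirname/file, "close" stays */
		return strlist__new("file,close", &config);
	}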
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-wi6l6qtomqlywwr6005jvs05@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c index dd40385..7abc75a 100644 --- a/tools/perf/util/strlist.c +++ b/tools/perf/util/strlist.c @@ -108,29 +108,47 @@ struct str_node *strlist__find(struct strlist *slist, const char *entry) return snode; } -static int strlist__parse_list_entry(struct strlist *slist, const char *s) +static int strlist__parse_list_entry(struct strlist *slist, const char *s, + const char *subst_dir) { + int err; + char *subst = NULL; + if (strncmp(s, "file://", 7) == 0) return strlist__load(slist, s + 7); - return strlist__add(slist, s); + if (subst_dir) { + err = -ENOMEM; + if (asprintf(&subst, "%s/%s", subst_dir, s) < 0) + goto out; + + if (access(subst, F_OK) == 0) { + err = strlist__load(slist, subst); + goto out; + } + } + + err = strlist__add(slist, s); +out: + free(subst); + return err; } -int strlist__parse_list(struct strlist *slist, const char *s) +int strlist__parse_list(struct strlist *slist, const char *s, const char *subst_dir) { char *sep; int err; while ((sep = strchr(s, ',')) != NULL) { *sep = '\0'; - err = strlist__parse_list_entry(slist, s); + err = strlist__parse_list_entry(slist, s, subst_dir); *sep = ','; if (err != 0) return err; s = sep + 1; } - return *s ? strlist__parse_list_entry(slist, s) : 0; + return *s ? strlist__parse_list_entry(slist, s, subst_dir) : 0; } struct strlist *strlist__new(const char *list, const struct strlist_config *config) @@ -138,13 +156,22 @@ struct strlist *strlist__new(const char *list, const struct strlist_config *conf struct strlist *slist = malloc(sizeof(*slist)); if (slist != NULL) { + bool dupstr = true; + const char *dirname = NULL; + + if (config) { + dupstr = !config->dont_dupstr; + dirname = config->dirname; + } + rblist__init(&slist->rblist); slist->rblist.node_cmp = strlist__node_cmp; slist->rblist.node_new = strlist__node_new; slist->rblist.node_delete = strlist__node_delete; - slist->dupstr = config ? !config->dont_dupstr : true; - if (list && strlist__parse_list(slist, list) != 0) + slist->dupstr = dupstr; + + if (list && strlist__parse_list(slist, list, dirname) != 0) goto out_error; } diff --git a/tools/perf/util/strlist.h b/tools/perf/util/strlist.h index a182785..9bb9823 100644 --- a/tools/perf/util/strlist.h +++ b/tools/perf/util/strlist.h @@ -18,6 +18,7 @@ struct strlist { struct strlist_config { bool dont_dupstr; + const char *dirname; }; struct strlist *strlist__new(const char *slist, const struct strlist_config *config); @@ -79,5 +80,5 @@ static inline struct str_node *strlist__next(struct str_node *sn) for (pos = strlist__first(slist), n = strlist__next(pos); pos;\ pos = n, n = strlist__next(n)) -int strlist__parse_list(struct strlist *slist, const char *s); +int strlist__parse_list(struct strlist *slist, const char *s, const char *subst_dir); #endif /* __PERF_STRLIST_H */ -- cgit v0.10.2 From 8816d38d49d37e255d98d0204af2acba7094385b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 17 Jul 2015 15:10:33 -0300 Subject: perf strlist: Make parse_list() private It is not used anywhere, expose it when/if needed. 
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-f6in51stj17avhk4rv11gjgg@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c index 7abc75a..bdf98f6 100644 --- a/tools/perf/util/strlist.c +++ b/tools/perf/util/strlist.c @@ -134,7 +134,7 @@ out: return err; } -int strlist__parse_list(struct strlist *slist, const char *s, const char *subst_dir) +static int strlist__parse_list(struct strlist *slist, const char *s, const char *subst_dir) { char *sep; int err; diff --git a/tools/perf/util/strlist.h b/tools/perf/util/strlist.h index 9bb9823..297565a 100644 --- a/tools/perf/util/strlist.h +++ b/tools/perf/util/strlist.h @@ -79,6 +79,4 @@ static inline struct str_node *strlist__next(struct str_node *sn) #define strlist__for_each_safe(pos, n, slist) \ for (pos = strlist__first(slist), n = strlist__next(pos); pos;\ pos = n, n = strlist__next(n)) - -int strlist__parse_list(struct strlist *slist, const char *s, const char *subst_dir); #endif /* __PERF_STRLIST_H */ -- cgit v0.10.2 From 005438a8eef063495ac059d128eea71b58de50e5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 20 Jul 2015 12:02:09 -0300 Subject: perf trace: Support 'strace' syscall event groups I.e.: $ cat ~/share/perf-core/strace/groups/file access chmod creat execve faccessat getcwd lstat mkdir open openat quotactl readlink rename rmdir stat statfs symlink unlink $ Then, on a quiet desktop, try running this and then moving your mouse to see the deluge of mouse related activity: # perf probe 'vfs_getname=getname_flags:72 pathname=filename:string' Added new event: probe:vfs_getname (on getname_flags:72 with pathname=filename:string) You can now use it in all perf tools, such as: perf record -e probe:vfs_getname -aR sleep 1 # # trace --ev probe:vfs_getname --filter-pids 2232 -e file 0.042 (0.042 ms): mousetweaks/2235 open(filename: 0x14e3910, mode: 438 ) ... 0.042 ( ): probe:vfs_getname:(ffffffff812230bc) pathname="/home/acme/.icons/Adwaita/cursors/xterm") 0.100 (0.100 ms): mousetweaks/2235 ... [continued]: open()) = -1 ENOENT No such file or directory 0.142 (0.018 ms): mousetweaks/2235 open(filename: 0x14c3c10, mode: 438 ) ... 0.142 ( ): probe:vfs_getname:(ffffffff812230bc) pathname="/home/acme/.icons/Adwaita/index.theme") 0.192 (0.069 ms): mousetweaks/2235 ... [continued]: open()) = -1 ENOENT No such file or directory 0.230 (0.017 ms): mousetweaks/2235 open(filename: 0x14c3c10, mode: 438 ) ... 0.230 ( ): probe:vfs_getname:(ffffffff812230bc) pathname="/usr/share/icons/Adwaita/cursors/xterm") 0.253 (0.041 ms): mousetweaks/2235 ... [continued]: open()) = 14 0.459 (0.008 ms): mousetweaks/2235 open(filename: 0x14e3910, mode: 438 ) ... 0.459 ( ): probe:vfs_getname:(ffffffff812230bc) pathname="/home/acme/.icons/Adwaita/cursors/left_side") 0.468 (0.017 ms): mousetweaks/2235 ... [continued]: open()) = -1 ENOENT No such file or directory Need to combine that raw_syscalls:sys_enter(open) + probe:vfs_getname + raw_syscalls:sys_exit(open) sequence... 
Now, if you're bored, please write some more syscall groups, like the ones in 'strace' and send it our way :-) Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-a42xklu59lcbxp7bbnic74a8@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Build b/tools/perf/Build index b77370e..7223745 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -35,6 +35,7 @@ paths += -DPERF_MAN_PATH="BUILD_STR($(mandir_SQ))" CFLAGS_builtin-help.o += $(paths) CFLAGS_builtin-timechart.o += $(paths) CFLAGS_perf.o += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))" -include $(OUTPUT)PERF-VERSION-FILE +CFLAGS_builtin-trace.o += -DSTRACE_GROUPS_DIR="BUILD_STR($(STRACE_GROUPS_DIR_SQ))" libperf-y += util/ libperf-y += arch/ diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 7a4b549..04170fc 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -494,6 +494,11 @@ endif $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' $(call QUIET_INSTALL, perf-with-kcore) \ $(INSTALL) $(OUTPUT)perf-with-kcore -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' +ifndef NO_LIBAUDIT + $(call QUIET_INSTALL, strace/groups) \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(STRACE_GROUPS_INSTDIR_SQ)'; \ + $(INSTALL) trace/strace/groups/* -t '$(DESTDIR_SQ)$(STRACE_GROUPS_INSTDIR_SQ)' +endif ifndef NO_LIBPERL $(call QUIET_INSTALL, perl-scripts) \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'; \ diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 3a3173c..32b4d28 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3,6 +3,7 @@ #include "util/color.h" #include "util/debug.h" #include "util/evlist.h" +#include "util/exec_cmd.h" #include "util/machine.h" #include "util/session.h" #include "util/thread.h" @@ -2927,11 +2928,14 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) if (ev_qualifier_str != NULL) { const char *s = ev_qualifier_str; + struct strlist_config slist_config = { + .dirname = system_path(STRACE_GROUPS_DIR), + }; trace.not_ev_qualifier = *s == '!'; if (trace.not_ev_qualifier) ++s; - trace.ev_qualifier = strlist__new(s, NULL); + trace.ev_qualifier = strlist__new(s, &slist_config); if (trace.ev_qualifier == NULL) { fputs("Not enough memory to parse event qualifier", trace.output); diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 094ddae..a9b93d1 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -644,6 +644,7 @@ infodir = share/info perfexecdir = libexec/perf-core sharedir = $(prefix)/share template_dir = share/perf-core/templates +STRACE_GROUPS_DIR = share/perf-core/strace/groups htmldir = share/doc/perf-doc ifeq ($(prefix),/usr) sysconfdir = /etc @@ -663,6 +664,7 @@ libdir = $(prefix)/$(lib) # Shell quote (do not use $(call) to accommodate ancient setups); ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG)) +STRACE_GROUPS_DIR_SQ = $(subst ','\'',$(STRACE_GROUPS_DIR)) DESTDIR_SQ = $(subst ','\'',$(DESTDIR)) bindir_SQ = $(subst ','\'',$(bindir)) mandir_SQ = $(subst ','\'',$(mandir)) @@ -676,10 +678,13 @@ libdir_SQ = $(subst ','\'',$(libdir)) ifneq ($(filter /%,$(firstword $(perfexecdir))),) perfexec_instdir = $(perfexecdir) +STRACE_GROUPS_INSTDIR = $(STRACE_GROUPS_DIR) else perfexec_instdir = $(prefix)/$(perfexecdir) +STRACE_GROUPS_INSTDIR = 
$(prefix)/$(STRACE_GROUPS_DIR) endif perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir)) +STRACE_GROUPS_INSTDIR_SQ = $(subst ','\'',$(STRACE_GROUPS_INSTDIR)) # If we install to $(HOME) we keep the traceevent default: # $(HOME)/.traceevent/plugins @@ -713,6 +718,7 @@ $(call detected_var,htmldir_SQ) $(call detected_var,infodir_SQ) $(call detected_var,mandir_SQ) $(call detected_var,ETC_PERFCONFIG_SQ) +$(call detected_var,STRACE_GROUPS_DIR_SQ) $(call detected_var,prefix_SQ) $(call detected_var,perfexecdir_SQ) $(call detected_var,LIBDIR) diff --git a/tools/perf/trace/strace/groups/file b/tools/perf/trace/strace/groups/file new file mode 100644 index 0000000..62378a8 --- /dev/null +++ b/tools/perf/trace/strace/groups/file @@ -0,0 +1,18 @@ +access +chmod +creat +execve +faccessat +getcwd +lstat +mkdir +open +openat +quotactl +readlink +rename +rmdir +stat +statfs +symlink +unlink -- cgit v0.10.2 From 15bfd2cc107a9971ac8aeb4b7724ced581a2ed30 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 10 Jul 2015 07:36:09 +0000 Subject: perf record: Apply filter to all events in a glob matching There is an old problem in perf's filter handling, first reported in Sep. 2014 at https://lkml.org/lkml/2014/9/9/944: if multiple events are passed in a glob matching expression on the cmdline and '--filter' is added after them, the filter will be applied to only the last one. For example: # dd if=/dev/zero of=/dev/null & [1] 464 # perf record -a -e 'syscalls:sys_*_read' --filter 'common_pid != 464' sleep 0.1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.239 MB perf.data (2094 samples) ] # perf report --stdio | tee ... # Samples: 2K of event 'syscalls:sys_enter_read' # Event count (approx.): 2092 ... # Samples: 2 of event 'syscalls:sys_exit_read' # Event count (approx.): 2 ... In this example, the filter is only applied to 'syscalls:sys_exit_read', and there's no way to set a filter for 'syscalls:sys_enter_read'. This patch adds a 'cmdline_group_boundary' flag to 'struct perf_evsel' and applies the filter to all events between two boundary marks. After applying this patch: # perf record -a -e 'syscalls:sys_*_read' --filter 'common_pid != 464' sleep 0.1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.031 MB perf.data (3 samples) ] # perf report --stdio | tee ... # Samples: 1 of event 'syscalls:sys_enter_read' # Event count (approx.): 1 ... # Samples: 2 of event 'syscalls:sys_exit_read' # Event count (approx.): 2 ...
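Reduced to a libc-only sketch, the mechanism is: the last evsel added by each parse_events() call is flagged as a boundary, and '--filter' walks backwards from the newest evsel until it crosses the previous boundary (arrays stand in for the real linked list):

	#include <stdbool.h>
	#include <stdio.h>

	struct evsel { const char *name; const char *filter; bool cmdline_group_boundary; };

	static void apply_to_last_glob(struct evsel *evlist, int nr, const char *filter)
	{
		int i = nr - 1;		/* the most recently added evsel */

		do {
			evlist[i].filter = filter;
			if (i == 0)
				return;
			i--;
		} while (!evlist[i].cmdline_group_boundary); /* stop at the previous -e group */
	}

	int main(void)
	{
		/* perf record -e sched:sched_switch -e 'syscalls:sys_*_read' --filter 'common_pid != 464' */
		struct evsel evlist[] = {
			{ "sched:sched_switch",      NULL, true  },	/* boundary of the first -e */
			{ "syscalls:sys_enter_read", NULL, false },
			{ "syscalls:sys_exit_read",  NULL, true  },	/* boundary of the second -e */
		};
		int i;

		apply_to_last_glob(evlist, 3, "common_pid != 464");
		for (i = 0; i < 3; i++)
			printf("%-26s %s\n", evlist[i].name,
			       evlist[i].filter ? evlist[i].filter : "(no filter)");
		return 0;
	}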
Signed-off-by: Wang Nan Reported-by: Brendan Gregg Cc: Andi Kleen Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1436513770-8896-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 83c0803..49fb7b5 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -210,6 +210,7 @@ void perf_evsel__init(struct perf_evsel *evsel, perf_evsel__object.init(evsel); evsel->sample_size = __perf_evsel__sample_size(attr->sample_type); perf_evsel__calc_id_pos(evsel); + evsel->cmdline_group_boundary = false; } struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index fe9f327..1fc263a 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -86,6 +86,7 @@ struct perf_evsel { unsigned long *per_pkg_mask; struct perf_evsel *leader; char *group_name; + bool cmdline_group_boundary; }; union u64_swap { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index a71eeb2..bbb7fbc 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1065,8 +1065,13 @@ int parse_events(struct perf_evlist *evlist, const char *str, perf_pmu__parse_cleanup(); if (!ret) { int entries = data.idx - evlist->nr_entries; + struct perf_evsel *last; + perf_evlist__splice_list_tail(evlist, &data.list, entries); evlist->nr_groups += data.nr_groups; + last = perf_evlist__last(evlist); + last->cmdline_group_boundary = true; + return 0; } @@ -1171,16 +1176,23 @@ int parse_filter(const struct option *opt, const char *str, if (evlist->nr_entries > 0) last = perf_evlist__last(evlist); - if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) { - fprintf(stderr, - "--filter option should follow a -e tracepoint option\n"); - return -1; - } + do { + if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) { + fprintf(stderr, + "--filter option should follow a -e tracepoint option\n"); + return -1; + } - if (perf_evsel__set_filter(last, str) < 0) { - fprintf(stderr, "not enough memory to hold filter string\n"); - return -1; - } + if (perf_evsel__set_filter(last, str) < 0) { + fprintf(stderr, + "not enough memory to hold filter string\n"); + return -1; + } + + if (last->node.prev == &evlist->entries) + return 0; + last = list_entry(last->node.prev, struct perf_evsel, node); + } while (!last->cmdline_group_boundary); return 0; } -- cgit v0.10.2 From 4ba1faa19fa5f415bd69b1d7c366028332468bca Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 10 Jul 2015 07:36:10 +0000 Subject: perf record: Allow filtering perf's pid via --exclude-perf This patch allows 'perf record' to exclude events issued by perf itself by '--exclude-perf' option. Before this patch, when doing something like: # perf record -a -e syscalls:sys_enter_write One could easily get result like this: # /tmp/perf report --stdio ... # Overhead Command Shared Object Symbol # ........ ....... .................. .................... # 99.99% perf libpthread-2.18.so [.] __write_nocancel 0.01% ls libc-2.18.so [.] write 0.01% sshd libc-2.18.so [.] write ... Where most events are generated by perf itself. A shell trick can be done to filter perf itself out: # cat << EOF > ./tmp > #!/bin/sh > exec perf record -e ... --filter="common_pid != \$\$" -a sleep 10 > EOF # chmod a+x ./tmp # ./tmp However, doing so is user unfriendly. 
This patch extracts the evsel iteration framework introduced by the patch 'perf record: Apply filter to all events in a glob matching' into foreach_evsel_in_last_glob(), and makes the exclude_perf() function append a new filter expression to each evsel selected by a '-e' selector. To avoid losing filters if the user passes '--filter' after '--exclude-perf', this patch uses perf_evsel__append_filter() in both cases, instead of perf_evsel__set_filter(), which removes the old filter. As a side effect, it is now possible to use multiple '--filter' options for one selector. They are combined with '&&'. Signed-off-by: Wang Nan Cc: Andi Kleen Cc: Brendan Gregg Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1436513770-8896-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 5b47b2c..29e5307 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -69,7 +69,16 @@ OPTIONS "perf report" to view group events together. --filter=:: - Event filter. + Event filter. This option should follow a event selector (-e) which + selects tracepoint event(s). Multiple '--filter' options are combined + using '&&'. + +--exclude-perf:: + Don't record events issued by perf itself. This option should follow + a event selector (-e) which selects tracepoint event(s). It adds a + filter expression 'common_pid != $PERFPID' to filters. If other + '--filter' exists, the new filter expression will be combined with + them by '&&'. -a:: --all-cpus:: diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 283fe96..1932e27 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -992,6 +992,9 @@ struct option __record_options[] = { parse_events_option), OPT_CALLBACK(0, "filter", &record.evlist, "filter", "event filter", parse_filter), + OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist, + NULL, "don't record events from perf itself", + exclude_perf), OPT_STRING('p', "pid", &record.opts.target.pid, "pid", "record events on existing process id"), OPT_STRING('t', "tid", &record.opts.target.tid, "tid", diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index bbb7fbc..4f807fc 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1167,27 +1167,24 @@ int parse_events_option(const struct option *opt, const char *str, return ret; } -int parse_filter(const struct option *opt, const char *str, - int unset __maybe_unused) +static int +foreach_evsel_in_last_glob(struct perf_evlist *evlist, + int (*func)(struct perf_evsel *evsel, + const void *arg), + const void *arg) { - struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; struct perf_evsel *last = NULL; + int err; if (evlist->nr_entries > 0) last = perf_evlist__last(evlist); do { - if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) { - fprintf(stderr, - "--filter option should follow a -e tracepoint option\n"); - return -1; - } - - if (perf_evsel__set_filter(last, str) < 0) { - fprintf(stderr, - "not enough memory to hold filter string\n"); + err = (*func)(last, arg); + if (err) return -1; - } + if (!last) + return 0; if (last->node.prev == &evlist->entries) return 0; @@ -1197,6 +1194,66 @@ int parse_filter(const struct option *opt, const char *str, return 0; } +static int set_filter(struct perf_evsel *evsel, const void *arg) +{ + const char *str = arg; + + if (evsel == NULL || 
evsel->attr.type != PERF_TYPE_TRACEPOINT) { + fprintf(stderr, + "--filter option should follow a -e tracepoint option\n"); + return -1; + } + + if (perf_evsel__append_filter(evsel, "&&", str) < 0) { + fprintf(stderr, + "not enough memory to hold filter string\n"); + return -1; + } + + return 0; +} + +int parse_filter(const struct option *opt, const char *str, + int unset __maybe_unused) +{ + struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; + + return foreach_evsel_in_last_glob(evlist, set_filter, + (const void *)str); +} + +static int add_exclude_perf_filter(struct perf_evsel *evsel, + const void *arg __maybe_unused) +{ + char new_filter[64]; + + if (evsel == NULL || evsel->attr.type != PERF_TYPE_TRACEPOINT) { + fprintf(stderr, + "--exclude-perf option should follow a -e tracepoint option\n"); + return -1; + } + + snprintf(new_filter, sizeof(new_filter), "common_pid != %d", getpid()); + + if (perf_evsel__append_filter(evsel, "&&", new_filter) < 0) { + fprintf(stderr, + "not enough memory to hold filter string\n"); + return -1; + } + + return 0; +} + +int exclude_perf(const struct option *opt, + const char *arg __maybe_unused, + int unset __maybe_unused) +{ + struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; + + return foreach_evsel_in_last_glob(evlist, add_exclude_perf_filter, + NULL); +} + static const char * const event_type_descriptors[] = { "Hardware event", "Software event", diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 131f29b..2063048 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -34,6 +34,7 @@ extern int parse_events(struct perf_evlist *evlist, const char *str, struct parse_events_error *error); extern int parse_events_terms(struct list_head *terms, const char *str); extern int parse_filter(const struct option *opt, const char *str, int unset); +extern int exclude_perf(const struct option *opt, const char *arg, int unset); #define EVENTS_HELP_MAX (128*1024) -- cgit v0.10.2 From a3c9de6280b8d196ab89ca7fad143bfa2a949790 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 Jul 2015 18:14:00 +0900 Subject: perf probe: Simplify __add_probe_trace_events code Simplify the __add_probe_trace_events() code by taking out the probe_trace_event__set_name() and updating show_perf_probe_event() Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Borislav Petkov Cc: Hemant Kumar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150715091400.8915.85501.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 7abaac4..54a91d7 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2478,16 +2478,54 @@ out: free(buf); } +/* Set new name from original perf_probe_event and namelist */ +static int probe_trace_event__set_name(struct probe_trace_event *tev, + struct perf_probe_event *pev, + struct strlist *namelist, + bool allow_suffix) +{ + const char *event, *group; + char buf[64]; + int ret; + + if (pev->event) + event = pev->event; + else + if (pev->point.function && !strisglob(pev->point.function)) + event = pev->point.function; + else + event = tev->point.realname; + if (pev->group) + group = pev->group; + else + group = PERFPROBE_GROUP; + + /* Get an unused new event name */ + ret = get_new_event_name(buf, 64, event, + namelist, allow_suffix); + if (ret < 0) + return ret; + + event = buf; + + tev->event = 
strdup(event); + tev->group = strdup(group); + if (tev->event == NULL || tev->group == NULL) + return -ENOMEM; + + /* Add added event name to namelist */ + strlist__add(namelist, event); + return 0; +} + static int __add_probe_trace_events(struct perf_probe_event *pev, struct probe_trace_event *tevs, int ntevs, bool allow_suffix) { int i, fd, ret; struct probe_trace_event *tev = NULL; - char buf[64]; const char *event = NULL, *group = NULL; struct strlist *namelist; - bool safename; if (pev->uprobes) fd = open_uprobe_events(true); @@ -2507,7 +2545,6 @@ static int __add_probe_trace_events(struct perf_probe_event *pev, goto close_out; } - safename = (pev->point.function && !strisglob(pev->point.function)); ret = 0; pr_info("Added new event%s\n", (ntevs > 1) ? "s:" : ":"); for (i = 0; i < ntevs; i++) { @@ -2516,36 +2553,15 @@ static int __add_probe_trace_events(struct perf_probe_event *pev, if (!tev->point.symbol) continue; - if (pev->event) - event = pev->event; - else - if (safename) - event = pev->point.function; - else - event = tev->point.realname; - if (pev->group) - group = pev->group; - else - group = PERFPROBE_GROUP; - - /* Get an unused new event name */ - ret = get_new_event_name(buf, 64, event, - namelist, allow_suffix); + /* Set new name for tev (and update namelist) */ + ret = probe_trace_event__set_name(tev, pev, namelist, + allow_suffix); if (ret < 0) break; - event = buf; - tev->event = strdup(event); - tev->group = strdup(group); - if (tev->event == NULL || tev->group == NULL) { - ret = -ENOMEM; - break; - } ret = write_probe_trace_event(fd, tev); if (ret < 0) break; - /* Add added event name to namelist */ - strlist__add(namelist, event); /* We use tev's name for showing new events */ show_perf_probe_event(tev->group, tev->event, pev, -- cgit v0.10.2 From 92f6c72e7ac40cbf8d12682d1aeeb82c905f2a64 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 Jul 2015 18:14:07 +0900 Subject: perf probe: Move ftrace probe-event operations to probe-file.c Move ftrace probe-event operations to probe-file.c from probe-event.c. Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Borislav Petkov Cc: Hemant Kumar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150715091407.8915.14316.stgit@localhost.localdomain [ Fixed up strlist__new() calls wrt 4a77e2183fc0 ("perf strlist: Make dupstr be the...") ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 601d114..8d1bdf8 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -79,6 +79,7 @@ libperf-$(CONFIG_AUXTRACE) += auxtrace.o libperf-y += parse-branch-options.o libperf-$(CONFIG_LIBELF) += symbol-elf.o +libperf-$(CONFIG_LIBELF) += probe-file.o libperf-$(CONFIG_LIBELF) += probe-event.o ifndef CONFIG_LIBELF diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 54a91d7..fe4941a 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -45,6 +45,7 @@ #include "trace-event.h" /* For __maybe_unused */ #include "probe-event.h" #include "probe-finder.h" +#include "probe-file.h" #include "session.h" #define MAX_CMDLEN 256 @@ -55,11 +56,7 @@ struct probe_conf probe_conf; #define semantic_error(msg ...) pr_err("Semantic error :" msg) -/* If there is no space to write, returns -E2BIG. */ -static int e_snprintf(char *str, size_t size, const char *format, ...) - __attribute__((format(printf, 3, 4))); - -static int e_snprintf(char *str, size_t size, const char *format, ...) 
+int e_snprintf(char *str, size_t size, const char *format, ...) { int ret; va_list ap; @@ -72,7 +69,6 @@ static int e_snprintf(char *str, size_t size, const char *format, ...) } static char *synthesize_perf_probe_point(struct perf_probe_point *pp); -static void clear_probe_trace_event(struct probe_trace_event *tev); static struct machine *host_machine; /* Initialize symbol maps and path of vmlinux/modules */ @@ -1467,8 +1463,7 @@ bool perf_probe_event_need_dwarf(struct perf_probe_event *pev) } /* Parse probe_events event into struct probe_point */ -static int parse_probe_trace_command(const char *cmd, - struct probe_trace_event *tev) +int parse_probe_trace_command(const char *cmd, struct probe_trace_event *tev) { struct probe_trace_point *tp = &tev->point; char pr; @@ -1951,7 +1946,7 @@ void clear_perf_probe_event(struct perf_probe_event *pev) memset(pev, 0, sizeof(*pev)); } -static void clear_probe_trace_event(struct probe_trace_event *tev) +void clear_probe_trace_event(struct probe_trace_event *tev) { struct probe_trace_arg_ref *ref, *next; int i; @@ -1976,119 +1971,6 @@ static void clear_probe_trace_event(struct probe_trace_event *tev) memset(tev, 0, sizeof(*tev)); } -static void print_open_warning(int err, bool is_kprobe) -{ - char sbuf[STRERR_BUFSIZE]; - - if (err == -ENOENT) { - const char *config; - - if (!is_kprobe) - config = "CONFIG_UPROBE_EVENTS"; - else - config = "CONFIG_KPROBE_EVENTS"; - - pr_warning("%cprobe_events file does not exist" - " - please rebuild kernel with %s.\n", - is_kprobe ? 'k' : 'u', config); - } else if (err == -ENOTSUP) - pr_warning("Tracefs or debugfs is not mounted.\n"); - else - pr_warning("Failed to open %cprobe_events: %s\n", - is_kprobe ? 'k' : 'u', - strerror_r(-err, sbuf, sizeof(sbuf))); -} - -static void print_both_open_warning(int kerr, int uerr) -{ - /* Both kprobes and uprobes are disabled, warn it. 
*/ - if (kerr == -ENOTSUP && uerr == -ENOTSUP) - pr_warning("Tracefs or debugfs is not mounted.\n"); - else if (kerr == -ENOENT && uerr == -ENOENT) - pr_warning("Please rebuild kernel with CONFIG_KPROBE_EVENTS " - "or/and CONFIG_UPROBE_EVENTS.\n"); - else { - char sbuf[STRERR_BUFSIZE]; - pr_warning("Failed to open kprobe events: %s.\n", - strerror_r(-kerr, sbuf, sizeof(sbuf))); - pr_warning("Failed to open uprobe events: %s.\n", - strerror_r(-uerr, sbuf, sizeof(sbuf))); - } -} - -static int open_probe_events(const char *trace_file, bool readwrite) -{ - char buf[PATH_MAX]; - const char *__debugfs; - const char *tracing_dir = ""; - int ret; - - __debugfs = tracefs_find_mountpoint(); - if (__debugfs == NULL) { - tracing_dir = "tracing/"; - - __debugfs = debugfs_find_mountpoint(); - if (__debugfs == NULL) - return -ENOTSUP; - } - - ret = e_snprintf(buf, PATH_MAX, "%s/%s%s", - __debugfs, tracing_dir, trace_file); - if (ret >= 0) { - pr_debug("Opening %s write=%d\n", buf, readwrite); - if (readwrite && !probe_event_dry_run) - ret = open(buf, O_RDWR | O_APPEND, 0); - else - ret = open(buf, O_RDONLY, 0); - - if (ret < 0) - ret = -errno; - } - return ret; -} - -static int open_kprobe_events(bool readwrite) -{ - return open_probe_events("kprobe_events", readwrite); -} - -static int open_uprobe_events(bool readwrite) -{ - return open_probe_events("uprobe_events", readwrite); -} - -/* Get raw string list of current kprobe_events or uprobe_events */ -static struct strlist *get_probe_trace_command_rawlist(int fd) -{ - int ret, idx; - FILE *fp; - char buf[MAX_CMDLEN]; - char *p; - struct strlist *sl; - - sl = strlist__new(NULL, NULL); - - fp = fdopen(dup(fd), "r"); - while (!feof(fp)) { - p = fgets(buf, MAX_CMDLEN, fp); - if (!p) - break; - - idx = strlen(p) - 1; - if (p[idx] == '\n') - p[idx] = '\0'; - ret = strlist__add(sl, buf); - if (ret < 0) { - pr_debug("strlist__add failed (%d)\n", ret); - strlist__delete(sl); - return NULL; - } - } - fclose(fp); - - return sl; -} - struct kprobe_blacklist_node { struct list_head list; unsigned long start; @@ -2284,7 +2166,7 @@ static int __show_perf_probe_events(int fd, bool is_kprobe, memset(&tev, 0, sizeof(tev)); memset(&pev, 0, sizeof(pev)); - rawlist = get_probe_trace_command_rawlist(fd); + rawlist = probe_file__get_rawlist(fd); if (!rawlist) return -ENOMEM; @@ -2325,89 +2207,20 @@ int show_perf_probe_events(struct strfilter *filter) if (ret < 0) return ret; - kp_fd = open_kprobe_events(false); - if (kp_fd >= 0) { - ret = __show_perf_probe_events(kp_fd, true, filter); - close(kp_fd); - if (ret < 0) - goto out; - } - - up_fd = open_uprobe_events(false); - if (kp_fd < 0 && up_fd < 0) { - print_both_open_warning(kp_fd, up_fd); - ret = kp_fd; - goto out; - } + ret = probe_file__open_both(&kp_fd, &up_fd, 0); + if (ret < 0) + return ret; - if (up_fd >= 0) { + if (kp_fd >= 0) + ret = __show_perf_probe_events(kp_fd, true, filter); + if (up_fd >= 0 && ret >= 0) ret = __show_perf_probe_events(up_fd, false, filter); + if (kp_fd > 0) + close(kp_fd); + if (up_fd > 0) close(up_fd); - } -out: exit_symbol_maps(); - return ret; -} -/* Get current perf-probe event names */ -static struct strlist *get_probe_trace_event_names(int fd, bool include_group) -{ - char buf[128]; - struct strlist *sl, *rawlist; - struct str_node *ent; - struct probe_trace_event tev; - int ret = 0; - - memset(&tev, 0, sizeof(tev)); - rawlist = get_probe_trace_command_rawlist(fd); - if (!rawlist) - return NULL; - sl = strlist__new(NULL, NULL); - strlist__for_each(ent, rawlist) { - ret = 
parse_probe_trace_command(ent->s, &tev); - if (ret < 0) - break; - if (include_group) { - ret = e_snprintf(buf, 128, "%s:%s", tev.group, - tev.event); - if (ret >= 0) - ret = strlist__add(sl, buf); - } else - ret = strlist__add(sl, tev.event); - clear_probe_trace_event(&tev); - if (ret < 0) - break; - } - strlist__delete(rawlist); - - if (ret < 0) { - strlist__delete(sl); - return NULL; - } - return sl; -} - -static int write_probe_trace_event(int fd, struct probe_trace_event *tev) -{ - int ret = 0; - char *buf = synthesize_probe_trace_command(tev); - char sbuf[STRERR_BUFSIZE]; - - if (!buf) { - pr_debug("Failed to synthesize probe trace event.\n"); - return -EINVAL; - } - - pr_debug("Writing event: %s\n", buf); - if (!probe_event_dry_run) { - ret = write(fd, buf, strlen(buf)); - if (ret <= 0) { - ret = -errno; - pr_warning("Failed to write event: %s\n", - strerror_r(errno, sbuf, sizeof(sbuf))); - } - } - free(buf); return ret; } @@ -2527,18 +2340,12 @@ static int __add_probe_trace_events(struct perf_probe_event *pev, const char *event = NULL, *group = NULL; struct strlist *namelist; - if (pev->uprobes) - fd = open_uprobe_events(true); - else - fd = open_kprobe_events(true); - - if (fd < 0) { - print_open_warning(fd, !pev->uprobes); + fd = probe_file__open(PF_FL_RW | (pev->uprobes ? PF_FL_UPROBE : 0)); + if (fd < 0) return fd; - } /* Get current event names */ - namelist = get_probe_trace_event_names(fd, false); + namelist = probe_file__get_namelist(fd); if (!namelist) { pr_debug("Failed to get current event list.\n"); ret = -ENOMEM; @@ -2559,7 +2366,7 @@ static int __add_probe_trace_events(struct perf_probe_event *pev, if (ret < 0) break; - ret = write_probe_trace_event(fd, tev); + ret = probe_file__add_event(fd, tev); if (ret < 0) break; @@ -2854,68 +2661,9 @@ end: return ret; } -static int __del_trace_probe_event(int fd, struct str_node *ent) -{ - char *p; - char buf[128]; - int ret; - - /* Convert from perf-probe event to trace-probe event */ - ret = e_snprintf(buf, 128, "-:%s", ent->s); - if (ret < 0) - goto error; - - p = strchr(buf + 2, ':'); - if (!p) { - pr_debug("Internal error: %s should have ':' but not.\n", - ent->s); - ret = -ENOTSUP; - goto error; - } - *p = '/'; - - pr_debug("Writing event: %s\n", buf); - ret = write(fd, buf, strlen(buf)); - if (ret < 0) { - ret = -errno; - goto error; - } - - pr_info("Removed event: %s\n", ent->s); - return 0; -error: - pr_warning("Failed to delete event: %s\n", - strerror_r(-ret, buf, sizeof(buf))); - return ret; -} - -static int del_trace_probe_events(int fd, struct strfilter *filter, - struct strlist *namelist) -{ - struct str_node *ent; - const char *p; - int ret = -ENOENT; - - if (!namelist) - return -ENOENT; - - strlist__for_each(ent, namelist) { - p = strchr(ent->s, ':'); - if ((p && strfilter__compare(filter, p + 1)) || - strfilter__compare(filter, ent->s)) { - ret = __del_trace_probe_event(fd, ent); - if (ret < 0) - break; - } - } - - return ret; -} - int del_perf_probe_events(struct strfilter *filter) { int ret, ret2, ufd = -1, kfd = -1; - struct strlist *namelist = NULL, *unamelist = NULL; char *str = strfilter__string(filter); if (!str) @@ -2924,25 +2672,15 @@ int del_perf_probe_events(struct strfilter *filter) pr_debug("Delete filter: \'%s\'\n", str); /* Get current event names */ - kfd = open_kprobe_events(true); - if (kfd >= 0) - namelist = get_probe_trace_event_names(kfd, true); - - ufd = open_uprobe_events(true); - if (ufd >= 0) - unamelist = get_probe_trace_event_names(ufd, true); - - if (kfd < 0 && ufd < 0) { - 
print_both_open_warning(kfd, ufd); - ret = kfd; - goto error; - } + ret = probe_file__open_both(&kfd, &ufd, PF_FL_RW); + if (ret < 0) + goto out; - ret = del_trace_probe_events(kfd, filter, namelist); + ret = probe_file__del_events(kfd, filter); if (ret < 0 && ret != -ENOENT) goto error; - ret2 = del_trace_probe_events(ufd, filter, unamelist); + ret2 = probe_file__del_events(ufd, filter); if (ret2 < 0 && ret2 != -ENOENT) { ret = ret2; goto error; @@ -2953,15 +2691,11 @@ int del_perf_probe_events(struct strfilter *filter) ret = 0; error: - if (kfd >= 0) { - strlist__delete(namelist); + if (kfd >= 0) close(kfd); - } - - if (ufd >= 0) { - strlist__delete(unamelist); + if (ufd >= 0) close(ufd); - } +out: free(str); return ret; diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 31db6ee..20f555d 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -109,6 +109,8 @@ struct variable_list { /* Command string to events */ extern int parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev); +extern int parse_probe_trace_command(const char *cmd, + struct probe_trace_event *tev); /* Events to command string */ extern char *synthesize_perf_probe_command(struct perf_probe_event *pev); @@ -121,6 +123,7 @@ extern bool perf_probe_event_need_dwarf(struct perf_probe_event *pev); /* Release event contents */ extern void clear_perf_probe_event(struct perf_probe_event *pev); +extern void clear_probe_trace_event(struct probe_trace_event *tev); /* Command string to line-range */ extern int parse_line_range_desc(const char *cmd, struct line_range *lr); @@ -144,6 +147,10 @@ bool arch__prefers_symtab(void); void arch__fix_tev_from_maps(struct perf_probe_event *pev, struct probe_trace_event *tev, struct map *map); +/* If there is no space to write, returns -E2BIG. */ +int e_snprintf(char *str, size_t size, const char *format, ...) + __attribute__((format(printf, 3, 4))); + /* Maximum index number of event-name postfix */ #define MAX_EVENT_INDEX 1024 diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c new file mode 100644 index 0000000..bbb2437 --- /dev/null +++ b/tools/perf/util/probe-file.c @@ -0,0 +1,301 @@ +/* + * probe-file.c : operate ftrace k/uprobe events files + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#include "util.h" +#include "event.h" +#include "strlist.h" +#include "debug.h" +#include "cache.h" +#include "color.h" +#include "symbol.h" +#include "thread.h" +#include +#include +#include "probe-event.h" +#include "probe-file.h" +#include "session.h" + +#define MAX_CMDLEN 256 + +static void print_open_warning(int err, bool uprobe) +{ + char sbuf[STRERR_BUFSIZE]; + + if (err == -ENOENT) { + const char *config; + + if (uprobe) + config = "CONFIG_UPROBE_EVENTS"; + else + config = "CONFIG_KPROBE_EVENTS"; + + pr_warning("%cprobe_events file does not exist" + " - please rebuild kernel with %s.\n", + uprobe ? 
'u' : 'k', config); + } else if (err == -ENOTSUP) + pr_warning("Tracefs or debugfs is not mounted.\n"); + else + pr_warning("Failed to open %cprobe_events: %s\n", + uprobe ? 'u' : 'k', + strerror_r(-err, sbuf, sizeof(sbuf))); +} + +static void print_both_open_warning(int kerr, int uerr) +{ + /* Both kprobes and uprobes are disabled, warn it. */ + if (kerr == -ENOTSUP && uerr == -ENOTSUP) + pr_warning("Tracefs or debugfs is not mounted.\n"); + else if (kerr == -ENOENT && uerr == -ENOENT) + pr_warning("Please rebuild kernel with CONFIG_KPROBE_EVENTS " + "or/and CONFIG_UPROBE_EVENTS.\n"); + else { + char sbuf[STRERR_BUFSIZE]; + pr_warning("Failed to open kprobe events: %s.\n", + strerror_r(-kerr, sbuf, sizeof(sbuf))); + pr_warning("Failed to open uprobe events: %s.\n", + strerror_r(-uerr, sbuf, sizeof(sbuf))); + } +} + +static int open_probe_events(const char *trace_file, bool readwrite) +{ + char buf[PATH_MAX]; + const char *__debugfs; + const char *tracing_dir = ""; + int ret; + + __debugfs = tracefs_find_mountpoint(); + if (__debugfs == NULL) { + tracing_dir = "tracing/"; + + __debugfs = debugfs_find_mountpoint(); + if (__debugfs == NULL) + return -ENOTSUP; + } + + ret = e_snprintf(buf, PATH_MAX, "%s/%s%s", + __debugfs, tracing_dir, trace_file); + if (ret >= 0) { + pr_debug("Opening %s write=%d\n", buf, readwrite); + if (readwrite && !probe_event_dry_run) + ret = open(buf, O_RDWR | O_APPEND, 0); + else + ret = open(buf, O_RDONLY, 0); + + if (ret < 0) + ret = -errno; + } + return ret; +} + +static int open_kprobe_events(bool readwrite) +{ + return open_probe_events("kprobe_events", readwrite); +} + +static int open_uprobe_events(bool readwrite) +{ + return open_probe_events("uprobe_events", readwrite); +} + +int probe_file__open(int flag) +{ + int fd; + + if (flag & PF_FL_UPROBE) + fd = open_uprobe_events(flag & PF_FL_RW); + else + fd = open_kprobe_events(flag & PF_FL_RW); + if (fd < 0) + print_open_warning(fd, flag & PF_FL_UPROBE); + + return fd; +} + +int probe_file__open_both(int *kfd, int *ufd, int flag) +{ + if (!kfd || !ufd) + return -EINVAL; + + *kfd = open_kprobe_events(flag & PF_FL_RW); + *ufd = open_uprobe_events(flag & PF_FL_RW); + if (*kfd < 0 && *ufd < 0) { + print_both_open_warning(*kfd, *ufd); + return *kfd; + } + + return 0; +} + +/* Get raw string list of current kprobe_events or uprobe_events */ +struct strlist *probe_file__get_rawlist(int fd) +{ + int ret, idx; + FILE *fp; + char buf[MAX_CMDLEN]; + char *p; + struct strlist *sl; + + sl = strlist__new(NULL, NULL); + + fp = fdopen(dup(fd), "r"); + while (!feof(fp)) { + p = fgets(buf, MAX_CMDLEN, fp); + if (!p) + break; + + idx = strlen(p) - 1; + if (p[idx] == '\n') + p[idx] = '\0'; + ret = strlist__add(sl, buf); + if (ret < 0) { + pr_debug("strlist__add failed (%d)\n", ret); + strlist__delete(sl); + return NULL; + } + } + fclose(fp); + + return sl; +} + +static struct strlist *__probe_file__get_namelist(int fd, bool include_group) +{ + char buf[128]; + struct strlist *sl, *rawlist; + struct str_node *ent; + struct probe_trace_event tev; + int ret = 0; + + memset(&tev, 0, sizeof(tev)); + rawlist = probe_file__get_rawlist(fd); + if (!rawlist) + return NULL; + sl = strlist__new(NULL, NULL); + strlist__for_each(ent, rawlist) { + ret = parse_probe_trace_command(ent->s, &tev); + if (ret < 0) + break; + if (include_group) { + ret = e_snprintf(buf, 128, "%s:%s", tev.group, + tev.event); + if (ret >= 0) + ret = strlist__add(sl, buf); + } else + ret = strlist__add(sl, tev.event); + clear_probe_trace_event(&tev); + if (ret < 0) + 
break; + } + strlist__delete(rawlist); + + if (ret < 0) { + strlist__delete(sl); + return NULL; + } + return sl; +} + +/* Get current perf-probe event names */ +struct strlist *probe_file__get_namelist(int fd) +{ + return __probe_file__get_namelist(fd, false); +} + +int probe_file__add_event(int fd, struct probe_trace_event *tev) +{ + int ret = 0; + char *buf = synthesize_probe_trace_command(tev); + char sbuf[STRERR_BUFSIZE]; + + if (!buf) { + pr_debug("Failed to synthesize probe trace event.\n"); + return -EINVAL; + } + + pr_debug("Writing event: %s\n", buf); + if (!probe_event_dry_run) { + ret = write(fd, buf, strlen(buf)); + if (ret <= 0) { + ret = -errno; + pr_warning("Failed to write event: %s\n", + strerror_r(errno, sbuf, sizeof(sbuf))); + } + } + free(buf); + + return ret; +} + +static int __del_trace_probe_event(int fd, struct str_node *ent) +{ + char *p; + char buf[128]; + int ret; + + /* Convert from perf-probe event to trace-probe event */ + ret = e_snprintf(buf, 128, "-:%s", ent->s); + if (ret < 0) + goto error; + + p = strchr(buf + 2, ':'); + if (!p) { + pr_debug("Internal error: %s should have ':' but not.\n", + ent->s); + ret = -ENOTSUP; + goto error; + } + *p = '/'; + + pr_debug("Writing event: %s\n", buf); + ret = write(fd, buf, strlen(buf)); + if (ret < 0) { + ret = -errno; + goto error; + } + + pr_info("Removed event: %s\n", ent->s); + return 0; +error: + pr_warning("Failed to delete event: %s\n", + strerror_r(-ret, buf, sizeof(buf))); + return ret; +} + +int probe_file__del_events(int fd, struct strfilter *filter) +{ + struct strlist *namelist; + struct str_node *ent; + const char *p; + int ret = -ENOENT; + + namelist = __probe_file__get_namelist(fd, true); + if (!namelist) + return -ENOENT; + + strlist__for_each(ent, namelist) { + p = strchr(ent->s, ':'); + if ((p && strfilter__compare(filter, p + 1)) || + strfilter__compare(filter, ent->s)) { + ret = __del_trace_probe_event(fd, ent); + if (ret < 0) + break; + } + } + strlist__delete(namelist); + + return ret; +} diff --git a/tools/perf/util/probe-file.h b/tools/perf/util/probe-file.h new file mode 100644 index 0000000..ada94a2 --- /dev/null +++ b/tools/perf/util/probe-file.h @@ -0,0 +1,18 @@ +#ifndef __PROBE_FILE_H +#define __PROBE_FILE_H + +#include "strlist.h" +#include "strfilter.h" +#include "probe-event.h" + +#define PF_FL_UPROBE 1 +#define PF_FL_RW 2 + +int probe_file__open(int flag); +int probe_file__open_both(int *kfd, int *ufd, int flag); +struct strlist *probe_file__get_namelist(int fd); +struct strlist *probe_file__get_rawlist(int fd); +int probe_file__add_event(int fd, struct probe_trace_event *tev); +int probe_file__del_events(int fd, struct strfilter *filter); + +#endif -- cgit v0.10.2 From d77fac7f9e687d137b17296d86d9143c2cccab6a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 Jul 2015 18:14:28 +0900 Subject: perf buildid: Use SBUILD_ID_SIZE macro Introduce SBUILD_ID_SIZE macro and use it instead of using BUILD_ID_SIZE * 2 + 1. 
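The '* 2 + 1' encodes the fact that every build-id byte becomes two hex characters, with one byte left for the trailing NUL. A small self-contained sketch of that formatting; the helper name below is illustrative, the real routine in perf is build_id__sprintf():

#include <stdio.h>

#define BUILD_ID_SIZE	20
#define SBUILD_ID_SIZE	(BUILD_ID_SIZE * 2 + 1)

/* Render a raw build-id as lowercase hex: 2 chars per byte + NUL. */
static void sbuild_id__snprintf(const unsigned char *build_id, char *bf)
{
	int i;

	for (i = 0; i < BUILD_ID_SIZE; i++)
		sprintf(bf + i * 2, "%02x", build_id[i]);
}

int main(void)
{
	unsigned char id[BUILD_ID_SIZE] = { 0xde, 0xad, 0xbe, 0xef };
	char sbuild_id[SBUILD_ID_SIZE];

	sbuild_id__snprintf(id, sbuild_id);
	printf("%s\n", sbuild_id);	/* deadbeef followed by 32 zeros */
	return 0;
}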
Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Borislav Petkov Cc: Hemant Kumar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150715091428.8915.75265.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index ddca990..65b4835 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -127,7 +127,7 @@ static int build_id_cache__kcore_existing(const char *from_dir, char *to_dir, static int build_id_cache__add_kcore(const char *filename, bool force) { - char dir[32], sbuildid[BUILD_ID_SIZE * 2 + 1]; + char dir[32], sbuildid[SBUILD_ID_SIZE]; char from_dir[PATH_MAX], to_dir[PATH_MAX]; char *p; @@ -184,7 +184,7 @@ static int build_id_cache__add_kcore(const char *filename, bool force) static int build_id_cache__add_file(const char *filename) { - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; u8 build_id[BUILD_ID_SIZE]; int err; @@ -204,7 +204,7 @@ static int build_id_cache__add_file(const char *filename) static int build_id_cache__remove_file(const char *filename) { u8 build_id[BUILD_ID_SIZE]; - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; int err; @@ -276,7 +276,7 @@ static int build_id_cache__fprintf_missing(struct perf_session *session, FILE *f static int build_id_cache__update_file(const char *filename) { u8 build_id[BUILD_ID_SIZE]; - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; int err = 0; diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 9fe93c8..b5ca988 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -20,7 +20,7 @@ static int sysfs__fprintf_build_id(FILE *fp) { u8 kallsyms_build_id[BUILD_ID_SIZE]; - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; if (sysfs__read_build_id("/sys/kernel/notes", kallsyms_build_id, sizeof(kallsyms_build_id)) != 0) @@ -35,7 +35,7 @@ static int sysfs__fprintf_build_id(FILE *fp) static int filename__fprintf_build_id(const char *name, FILE *fp) { u8 build_id[BUILD_ID_SIZE]; - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; if (filename__read_build_id(name, build_id, sizeof(build_id)) != sizeof(build_id)) diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index f98c2ff..4a2c2f0 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -124,7 +124,7 @@ static char *build_id__filename(const char *sbuild_id, char *bf, size_t size) char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size) { - char build_id_hex[BUILD_ID_SIZE * 2 + 1]; + char build_id_hex[SBUILD_ID_SIZE]; if (!dso->has_build_id) return NULL; @@ -384,7 +384,7 @@ static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size, const char *name, bool is_kallsyms, bool is_vdso) { - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; build_id__sprintf(build_id, build_id_size, sbuild_id); diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index 8501122..ce2f493 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -1,7 +1,8 @@ #ifndef PERF_BUILD_ID_H_ #define PERF_BUILD_ID_H_ 1 -#define BUILD_ID_SIZE 20 +#define BUILD_ID_SIZE 20 +#define SBUILD_ID_SIZE (BUILD_ID_SIZE * 2 + 1) #include "tool.h" #include "strlist.h" -- cgit v0.10.2 From 52c0a18b9010fb19d10889e8a00aa784197d357c Mon Sep 17 
00:00:00 2001 From: Sergei Trofimovich Date: Sun, 19 Jul 2015 10:30:05 +0100 Subject: perf tools: Fix makefile generation under dash Under dash 'echo -n' yields '-n' to stdout. Use printf "" instead. Signed-off-by: Sergei Trofimovich Acked-by: Ingo Molnar Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437298205-29305-1-git-send-email-siarheit@google.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index a9b93d1..8768f84 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -11,7 +11,7 @@ ifneq ($(obj-perf),) obj-perf := $(abspath $(obj-perf))/ endif -$(shell echo -n > $(OUTPUT).config-detected) +$(shell printf "" > $(OUTPUT).config-detected) detected = $(shell echo "$(1)=y" >> $(OUTPUT).config-detected) detected_var = $(shell echo "$(1)=$($(1))" >> $(OUTPUT).config-detected) -- cgit v0.10.2 From d2f3f5d2e9cae6e73f9642a5ddc8c8a07c35e79b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 7 Jul 2015 01:55:53 -0700 Subject: perf bench futex: Add lock_pi stresser Allows measuring the low-level kernel implementation of FUTEX_LOCK_PI and FUTEX_UNLOCK_PI. The program comes in two flavors: (i) single futex (default), all threads contend on the same uaddr. For the sake of the benchmark, we call into kernel space even when the lock is uncontended. The kernel will set it to the TID; any waiters that come in and contend for the pi futex will be handled accordingly by the kernel. (ii) -M option for multiple futexes, each thread deals with its own futex. This is a trivial scenario and only measures kernel handling of the 0->TID transition. Signed-off-by: Davidlohr Bueso Cc: Mel Gorman Link: http://lkml.kernel.org/r/1436259353.12255.78.camel@stgolabs.net Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index bf3d064..ab632d9 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -216,6 +216,10 @@ Suite for evaluating parallel wake calls. *requeue*:: Suite for evaluating requeue calls. +*lock-pi*:: +Suite for evaluating futex lock_pi calls. + + SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index c3ab760..573e288 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -5,6 +5,7 @@ perf-y += futex-hash.o perf-y += futex-wake.o perf-y += futex-wake-parallel.o perf-y += futex-requeue.o +perf-y += futex-lock-pi.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 70b2f71..a50df86 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -36,6 +36,8 @@ extern int bench_futex_wake(int argc, const char **argv, const char *prefix); extern int bench_futex_wake_parallel(int argc, const char **argv, const char *prefix); extern int bench_futex_requeue(int argc, const char **argv, const char *prefix); +/* pi futexes */ +extern int bench_futex_lock_pi(int argc, const char **argv, const char *prefix); #define BENCH_FORMAT_DEFAULT_STR "default" #define BENCH_FORMAT_DEFAULT 0 diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c new file mode 100644 index 0000000..bc6a16a --- /dev/null +++ b/tools/perf/bench/futex-lock-pi.c @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2015 Davidlohr Bueso. 
+ */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/stat.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "bench.h" +#include "futex.h" + +#include +#include +#include +#include + +struct worker { + int tid; + u_int32_t *futex; + pthread_t thread; + unsigned long ops; +}; + +static u_int32_t global_futex = 0; +static struct worker *worker; +static unsigned int nsecs = 10; +static bool silent = false, multi = false; +static bool done = false, fshared = false; +static unsigned int ncpus, nthreads = 0; +static int futex_flag = 0; +struct timeval start, end, runtime; +static pthread_mutex_t thread_lock; +static unsigned int threads_starting; +static struct stats throughput_stats; +static pthread_cond_t thread_parent, thread_worker; + +static const struct option options[] = { + OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), + OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"), + OPT_BOOLEAN( 'M', "multi", &multi, "Use multiple futexes"), + OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), + OPT_END() +}; + +static const char * const bench_futex_lock_pi_usage[] = { + "perf bench futex requeue ", + NULL +}; + +static void print_summary(void) +{ + unsigned long avg = avg_stats(&throughput_stats); + double stddev = stddev_stats(&throughput_stats); + + printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", + !silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), + (int) runtime.tv_sec); +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + /* inform all threads that we're done for the day */ + done = true; + gettimeofday(&end, NULL); + timersub(&end, &start, &runtime); +} + +static void *workerfn(void *arg) +{ + struct worker *w = (struct worker *) arg; + + pthread_mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + pthread_cond_signal(&thread_parent); + pthread_cond_wait(&thread_worker, &thread_lock); + pthread_mutex_unlock(&thread_lock); + + do { + int ret; + again: + ret = futex_lock_pi(w->futex, NULL, 0, futex_flag); + + if (ret) { /* handle lock acquisition */ + if (!silent) + warn("thread %d: Could not lock pi-lock for %p (%d)", + w->tid, w->futex, ret); + if (done) + break; + + goto again; + } + + usleep(1); + ret = futex_unlock_pi(w->futex, futex_flag); + if (ret && !silent) + warn("thread %d: Could not unlock pi-lock for %p (%d)", + w->tid, w->futex, ret); + w->ops++; /* account for thread's share of work */ + } while (!done); + + return NULL; +} + +static void create_threads(struct worker *w, pthread_attr_t thread_attr) +{ + cpu_set_t cpu; + unsigned int i; + + threads_starting = nthreads; + + for (i = 0; i < nthreads; i++) { + worker[i].tid = i; + + if (multi) { + worker[i].futex = calloc(1, sizeof(u_int32_t)); + if (!worker[i].futex) + err(EXIT_FAILURE, "calloc"); + } else + worker[i].futex = &global_futex; + + CPU_ZERO(&cpu); + CPU_SET(i % ncpus, &cpu); + + if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu)) + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + + if (pthread_create(&w[i].thread, &thread_attr, workerfn, &worker[i])) + err(EXIT_FAILURE, "pthread_create"); + } +} + +int bench_futex_lock_pi(int argc, const char **argv, + const char *prefix __maybe_unused) +{ + int ret = 0; + unsigned int i; + struct sigaction act; + pthread_attr_t 
thread_attr; + + argc = parse_options(argc, argv, options, bench_futex_lock_pi_usage, 0); + if (argc) + goto err; + + ncpus = sysconf(_SC_NPROCESSORS_ONLN); + + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + if (!nthreads) + nthreads = ncpus; + + worker = calloc(nthreads, sizeof(*worker)); + if (!worker) + err(EXIT_FAILURE, "calloc"); + + if (!fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + printf("Run summary [PID %d]: %d threads doing pi lock/unlock pairing for %d secs.\n\n", + getpid(), nthreads, nsecs); + + init_stats(&throughput_stats); + pthread_mutex_init(&thread_lock, NULL); + pthread_cond_init(&thread_parent, NULL); + pthread_cond_init(&thread_worker, NULL); + + threads_starting = nthreads; + pthread_attr_init(&thread_attr); + gettimeofday(&start, NULL); + + create_threads(worker, thread_attr); + pthread_attr_destroy(&thread_attr); + + pthread_mutex_lock(&thread_lock); + while (threads_starting) + pthread_cond_wait(&thread_parent, &thread_lock); + pthread_cond_broadcast(&thread_worker); + pthread_mutex_unlock(&thread_lock); + + sleep(nsecs); + toggle_done(0, NULL, NULL); + + for (i = 0; i < nthreads; i++) { + ret = pthread_join(worker[i].thread, NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + + /* cleanup & report results */ + pthread_cond_destroy(&thread_parent); + pthread_cond_destroy(&thread_worker); + pthread_mutex_destroy(&thread_lock); + + for (i = 0; i < nthreads; i++) { + unsigned long t = worker[i].ops/runtime.tv_sec; + + update_stats(&throughput_stats, t); + if (!silent) + printf("[thread %3d] futex: %p [ %ld ops/sec ]\n", + worker[i].tid, worker[i].futex, t); + + if (multi) + free(worker[i].futex); + } + + print_summary(); + + free(worker); + return ret; +err: + usage_with_options(bench_futex_lock_pi_usage, options); + exit(EXIT_FAILURE); +} diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h index 7ed22ff..d44de9f 100644 --- a/tools/perf/bench/futex.h +++ b/tools/perf/bench/futex.h @@ -56,6 +56,26 @@ futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) } /** + * futex_lock_pi() - block on uaddr as a PI mutex + * @detect: whether (1) or not (0) to perform deadlock detection + */ +static inline int +futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int detect, + int opflags) +{ + return futex(uaddr, FUTEX_LOCK_PI, detect, timeout, NULL, 0, opflags); +} + +/** + * futex_unlock_pi() - release uaddr as a PI mutex, waking the top waiter + */ +static inline int +futex_unlock_pi(u_int32_t *uaddr, int opflags) +{ + return futex(uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0, opflags); +} + +/** * futex_cmp_requeue() - requeue tasks from uaddr to uaddr2 * @nr_wake: wake up to this many tasks * @nr_requeue: requeue up to this many tasks diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index b5314e4..f67934d 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -60,6 +60,8 @@ static struct bench futex_benchmarks[] = { { "wake", "Benchmark for futex wake calls", bench_futex_wake }, { "wake-parallel", "Benchmark for parallel futex wake calls", bench_futex_wake_parallel }, { "requeue", "Benchmark for futex requeue calls", bench_futex_requeue }, + /* pi-futexes */ + { "lock-pi", "Benchmark for futex lock_pi calls", bench_futex_lock_pi }, { "all", "Test all futex benchmarks", NULL }, { NULL, NULL, NULL } }; -- cgit v0.10.2 From 35318d204db83f5c1f24c281839763b271b9b323 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:21 +0200 Subject: perf test: 
Check for refcnt in thread_map test Checking also for the refcnt in the thread_map test. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index 5acf000..138a0e3 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -20,6 +20,8 @@ int test__thread_map(void) TEST_ASSERT_VAL("wrong comm", thread_map__comm(map, 0) && !strcmp(thread_map__comm(map, 0), "perf")); + TEST_ASSERT_VAL("wrong refcnt", + atomic_read(&map->refcnt) == 1); thread_map__put(map); /* test dummy pid */ @@ -33,6 +35,8 @@ int test__thread_map(void) TEST_ASSERT_VAL("wrong comm", thread_map__comm(map, 0) && !strcmp(thread_map__comm(map, 0), "dummy")); + TEST_ASSERT_VAL("wrong refcnt", + atomic_read(&map->refcnt) == 1); thread_map__put(map); return 0; } -- cgit v0.10.2 From 3de5cfb04435b82aa427d0285df996ba73d2f426 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:30 +0200 Subject: perf evlist: Force perf_evlist__set_maps to propagate maps through events Forcing perf_evlist__set_maps to propagate maps through events, so cpu/thread maps get set within evlist. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-11-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index f7d9c77..6bfcab9b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1150,6 +1150,23 @@ out_delete_threads: return -1; } +int perf_evlist__set_maps(struct perf_evlist *evlist, + struct cpu_map *cpus, + struct thread_map *threads) +{ + if (evlist->cpus) + cpu_map__put(evlist->cpus); + + evlist->cpus = cpus; + + if (evlist->threads) + thread_map__put(evlist->threads); + + evlist->threads = threads; + + return perf_evlist__propagate_maps(evlist, false); +} + int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel) { struct perf_evsel *evsel; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 037633c..406a821 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -152,14 +152,9 @@ int perf_evlist__enable_event_idx(struct perf_evlist *evlist, void perf_evlist__set_selected(struct perf_evlist *evlist, struct perf_evsel *evsel); -static inline void perf_evlist__set_maps(struct perf_evlist *evlist, - struct cpu_map *cpus, - struct thread_map *threads) -{ - evlist->cpus = cpus; - evlist->threads = threads; -} - +int perf_evlist__set_maps(struct perf_evlist *evlist, + struct cpu_map *cpus, + struct thread_map *threads); int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target); int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel); -- cgit v0.10.2 From 52361ff093c807464f5a32a587a370b360bd399a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:28 +0200 Subject: perf evlist: Use bool instead of target argument in propagate_maps() We only need the boolean information of whether the user defined her own set of cpus. Switching the target argument to bool so it can be used from places without a target object defined, in following patches. 
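To make the cpu map ownership rule concrete: an evsel keeps the cpu map it obtained via PMU sysfs unless the user supplied her own cpu list, in which case the evlist-wide map wins. A minimal self-contained model of that decision, with invented types rather than the real evlist code:

#include <stdbool.h>
#include <stdio.h>

struct fake_cpu_map { const char *desc; };

/* Model of the choice perf_evlist__propagate_maps() makes per evsel. */
static struct fake_cpu_map *pick_cpus(struct fake_cpu_map *evsel_cpus,
				      struct fake_cpu_map *evlist_cpus,
				      bool has_user_cpus)
{
	if (evsel_cpus && !has_user_cpus)
		return evsel_cpus;	/* keep the PMU sysfs provided cpus */
	return evlist_cpus;		/* user list (or no own map): inherit */
}

int main(void)
{
	struct fake_cpu_map pmu = { "pmu sysfs cpus" }, user = { "user -C list" };

	printf("%s\n", pick_cpus(&pmu, &user, false)->desc); /* pmu sysfs cpus */
	printf("%s\n", pick_cpus(&pmu, &user, true)->desc);  /* user -C list */
	return 0;
}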
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-9-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6bfcab9b..80ab942 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1102,7 +1102,7 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages, } static int perf_evlist__propagate_maps(struct perf_evlist *evlist, - struct target *target) + bool has_user_cpus) { struct perf_evsel *evsel; @@ -1111,10 +1111,10 @@ static int perf_evlist__propagate_maps(struct perf_evlist *evlist, * We already have cpus for evsel (via PMU sysfs) so * keep it, if there's no target cpu list defined. */ - if (evsel->cpus && target->cpu_list) + if (evsel->cpus && has_user_cpus) cpu_map__put(evsel->cpus); - if (!evsel->cpus || target->cpu_list) + if (!evsel->cpus || has_user_cpus) evsel->cpus = cpu_map__get(evlist->cpus); evsel->threads = thread_map__get(evlist->threads); @@ -1142,7 +1142,7 @@ int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target) if (evlist->cpus == NULL) goto out_delete_threads; - return perf_evlist__propagate_maps(evlist, target); + return perf_evlist__propagate_maps(evlist, !!target->cpu_list); out_delete_threads: thread_map__put(evlist->threads); -- cgit v0.10.2 From 0e5ffb317d7f861c309b0ab679ed2f59e9f72adf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:29 +0200 Subject: perf evlist: Tolerate NULL maps in propagate_maps Tolerating NULL maps in perf_evlist__propagate_maps, so we don't need to pass an evlist with both cpus and threads maps defined. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-10-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 80ab942..3b9f411 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1119,7 +1119,8 @@ static int perf_evlist__propagate_maps(struct perf_evlist *evlist, evsel->threads = thread_map__get(evlist->threads); - if (!evsel->cpus || !evsel->threads) + if ((evlist->cpus && !evsel->cpus) || + (evlist->threads && !evsel->threads)) return -ENOMEM; } -- cgit v0.10.2 From 768dd3f3a6af25730ed1eec458e47a3c481bc3e5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:31 +0200 Subject: perf header: Use argv style storage for cmdline feature data We will reuse the argv style data in a following change to display a counters header showing the monitored command line. 
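The layout being introduced can be shown in miniature: one contiguous buffer with the strings stored back to back, NUL-separated, plus an argv array of pointers into that buffer, so both the flat blob and per-argument access remain available. A self-contained sketch of that packing (names are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *words[] = { "perf", "record", "-a" };
	int nr = 3, i;
	size_t len = 0, total = 0;
	char *cmdline, **argv;

	for (i = 0; i < nr; i++)
		total += strlen(words[i]) + 1;		/* one NUL per string */

	cmdline = calloc(total + 1, 1);
	argv = calloc(nr + 1, sizeof(char *));
	if (!cmdline || !argv)
		return 1;

	/* Pack the strings back to back; argv[i] points into the buffer. */
	for (i = 0; i < nr; i++) {
		argv[i] = cmdline + len;
		memcpy(argv[i], words[i], strlen(words[i]) + 1);
		len += strlen(words[i]) + 1;
	}

	for (i = 0; i < nr; i++)
		printf("argv[%d] = %s\n", i, argv[i]);
	free(argv);
	free(cmdline);
	return 0;
}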
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-12-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 03ace57..179b2bd 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -923,17 +923,13 @@ static void print_cmdline(struct perf_header *ph, int fd __maybe_unused, FILE *fp) { int nr, i; - char *str; nr = ph->env.nr_cmdline; - str = ph->env.cmdline; fprintf(fp, "# cmdline : "); - for (i = 0; i < nr; i++) { - fprintf(fp, "%s ", str); - str += strlen(str) + 1; - } + for (i = 0; i < nr; i++) + fprintf(fp, "%s ", ph->env.cmdline_argv[i]); fputc('\n', fp); } @@ -1541,14 +1537,13 @@ process_event_desc(struct perf_file_section *section __maybe_unused, return 0; } -static int process_cmdline(struct perf_file_section *section __maybe_unused, +static int process_cmdline(struct perf_file_section *section, struct perf_header *ph, int fd, void *data __maybe_unused) { ssize_t ret; - char *str; - u32 nr, i; - struct strbuf sb; + char *str, *cmdline = NULL, **argv = NULL; + u32 nr, i, len = 0; ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) @@ -1558,22 +1553,32 @@ static int process_cmdline(struct perf_file_section *section __maybe_unused, nr = bswap_32(nr); ph->env.nr_cmdline = nr; - strbuf_init(&sb, 128); + + cmdline = zalloc(section->size + nr + 1); + if (!cmdline) + return -1; + + argv = zalloc(sizeof(char *) * (nr + 1)); + if (!argv) + goto error; for (i = 0; i < nr; i++) { str = do_read_string(fd, ph); if (!str) goto error; - /* include a NULL character at the end */ - strbuf_add(&sb, str, strlen(str) + 1); + argv[i] = cmdline + len; + memcpy(argv[i], str, strlen(str) + 1); + len += strlen(str) + 1; free(str); } - ph->env.cmdline = strbuf_detach(&sb, NULL); + ph->env.cmdline = cmdline; + ph->env.cmdline_argv = (const char **) argv; return 0; error: - strbuf_release(&sb); + free(argv); + free(cmdline); return -1; } diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index d4d5796..9b53b65 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -84,6 +84,7 @@ struct perf_session_env { int nr_pmu_mappings; int nr_groups; char *cmdline; + const char **cmdline_argv; char *sibling_cores; char *sibling_threads; char *numa_nodes; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index ed9dc25..fb1d525 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -180,6 +180,7 @@ static void perf_session_env__delete(struct perf_session_env *env) zfree(&env->cpuid); zfree(&env->cmdline); + zfree(&env->cmdline_argv); zfree(&env->sibling_cores); zfree(&env->sibling_threads); zfree(&env->numa_nodes); -- cgit v0.10.2 From b685ac22b436455db9c9c16b384d7aa4531cdd80 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Jul 2015 12:52:17 -0300 Subject: perf symbols: Add front end cache for DSO symbol lookup Acked-by: David Ahern Cc: Adrian Hunter Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-voo94tow8wpkcc76mlkny6sc@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 2fe98bb..c73276d 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -137,6 +137,10 @@ struct dso { struct rb_node rb_node; /* rbtree node sorted by long name */ struct rb_root symbols[MAP__NR_TYPES]; struct rb_root 
symbol_names[MAP__NR_TYPES]; + struct { + u64 addr; + struct symbol *symbol; + } last_find_result[MAP__NR_TYPES]; void *a2l; char *symsrc_filename; unsigned int a2l_fails; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 45c2e3a..725640f 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -444,7 +444,12 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols, struct symbol *dso__find_symbol(struct dso *dso, enum map_type type, u64 addr) { - return symbols__find(&dso->symbols[type], addr); + if (dso->last_find_result[type].addr != addr) { + dso->last_find_result[type].addr = addr; + dso->last_find_result[type].symbol = symbols__find(&dso->symbols[type], addr); + } + + return dso->last_find_result[type].symbol; } struct symbol *dso__first_symbol(struct dso *dso, enum map_type type) -- cgit v0.10.2 From e6ce712634ea038ce42bbb11932b692696756882 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 23 Jul 2015 11:06:16 -0300 Subject: perf symbols: Introduce map__is_(kernel,kmodule)() To, with members we already have, check if a kernel level map is for the kernel proper or for a module. Acked-by: David Ahern Cc: Adrian Hunter Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-m5ic7h0z2crmtj7vi1a1rj3b@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index b5a5e9c..ce37e95 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -224,6 +224,20 @@ struct map *map__new2(u64 start, struct dso *dso, enum map_type type) return map; } +/* + * Use this and __map__is_kmodule() for map instances that are in + * machine->kmaps, and thus have map->groups->machine all properly set, to + * disambiguate between the kernel and modules. + * + * When the need arises, introduce map__is_{kernel,kmodule)() that + * checks (map->groups != NULL && map->groups->machine != NULL && + * map->dso->kernel) before calling __map__is_{kernel,kmodule}()) + */ +bool __map__is_kernel(const struct map *map) +{ + return map->groups->machine->vmlinux_maps[map->type] == map; +} + static void map__exit(struct map *map) { BUG_ON(!RB_EMPTY_NODE(&map->rb_node)); diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index d73e687..57829e8 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -256,4 +256,11 @@ int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map, struct map *map_groups__find_by_name(struct map_groups *mg, enum map_type type, const char *name); +bool __map__is_kernel(const struct map *map); + +static inline bool __map__is_kmodule(const struct map *map) +{ + return !__map__is_kernel(map); +} + #endif /* __PERF_MAP_H */ -- cgit v0.10.2 From 33a2471cc9b7b1fb27ff2031dbaff701644b1a4d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Jul 2015 12:36:55 -0300 Subject: tools lib traceevent: Allow setting an alternative symbol resolver The perf tools have a symbol resolver that includes solving kernel symbols using either kallsyms or ELF symtabs, and it also is using libtraceevent to format the trace events fields, including via subsystem specific plugins, like the "timer" one. To solve fields like "timer:hrtimer_start"'s "function", libtraceevent needs a way to map from its value to a function name and addr. 
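The shape of such a resolver: a callback that, given an address, returns the symbol name, rewrites the address to the symbol's start, and sets the module name (or NULL for the kernel proper). A self-contained sketch of that contract; my_resolve() and its hardcoded answer are invented for illustration, only the pevent_func_resolver_t signature comes from the patch below:

#include <stdio.h>

/* Callback signature introduced by this patch (see event-parse.h below). */
typedef char *(pevent_func_resolver_t)(void *priv,
				       unsigned long long *addrp, char **modp);

static pevent_func_resolver_t my_resolve;

static char *my_resolve(void *priv, unsigned long long *addrp, char **modp)
{
	(void)priv;
	if (*addrp >= 0xffffffff81000000ULL) {
		*addrp = 0xffffffff81000000ULL;	/* symbol start address */
		*modp = NULL;			/* NULL: kernel proper, not a module */
		return (char *)"_stext";
	}
	return NULL;				/* unresolved */
}

int main(void)
{
	unsigned long long addr = 0xffffffff81234567ULL;
	char *mod = NULL;
	char *name = my_resolve(NULL, &addr, &mod);

	/* A real tool would instead register it:
	 *   pevent_set_function_resolver(pevent, my_resolve, priv);
	 */
	printf("%s+%#llx [%s]\n", name, 0xffffffff81234567ULL - addr,
	       mod ? mod : "kernel");
	return 0;
}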
This patch provides a way for tools that already have symbol resolving facilities to ask libtraceevent to use it when needing to resolve kernel symbols. Reviewed-by: Steven Rostedt Acked-by: David Ahern Cc: Adrian Hunter Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-fdx1fazols17w5py26ia3bwh@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index cc25f05..fcd8a9e 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -418,7 +418,7 @@ static int func_map_init(struct pevent *pevent) } static struct func_map * -find_func(struct pevent *pevent, unsigned long long addr) +__find_func(struct pevent *pevent, unsigned long long addr) { struct func_map *func; struct func_map key; @@ -434,6 +434,71 @@ find_func(struct pevent *pevent, unsigned long long addr) return func; } +struct func_resolver { + pevent_func_resolver_t *func; + void *priv; + struct func_map map; +}; + +/** + * pevent_set_function_resolver - set an alternative function resolver + * @pevent: handle for the pevent + * @resolver: function to be used + * @priv: resolver function private state. + * + * Some tools may have already a way to resolve kernel functions, allow them to + * keep using it instead of duplicating all the entries inside + * pevent->funclist. + */ +int pevent_set_function_resolver(struct pevent *pevent, + pevent_func_resolver_t *func, void *priv) +{ + struct func_resolver *resolver = malloc(sizeof(*resolver)); + + if (resolver == NULL) + return -1; + + resolver->func = func; + resolver->priv = priv; + + free(pevent->func_resolver); + pevent->func_resolver = resolver; + + return 0; +} + +/** + * pevent_reset_function_resolver - reset alternative function resolver + * @pevent: handle for the pevent + * + * Stop using whatever alternative resolver was set, use the default + * one instead. 
+ */ +void pevent_reset_function_resolver(struct pevent *pevent) +{ + free(pevent->func_resolver); + pevent->func_resolver = NULL; +} + +static struct func_map * +find_func(struct pevent *pevent, unsigned long long addr) +{ + struct func_map *map; + + if (!pevent->func_resolver) + return __find_func(pevent, addr); + + map = &pevent->func_resolver->map; + map->mod = NULL; + map->addr = addr; + map->func = pevent->func_resolver->func(pevent->func_resolver->priv, + &map->addr, &map->mod); + if (map->func == NULL) + return NULL; + + return map; +} + /** * pevent_find_function - find a function by a given address * @pevent: handle for the pevent @@ -6564,6 +6629,7 @@ void pevent_free(struct pevent *pevent) free(pevent->trace_clock); free(pevent->events); free(pevent->sort_events); + free(pevent->func_resolver); free(pevent); } diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index 063b197..204befb 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -453,6 +453,10 @@ struct cmdline_list; struct func_map; struct func_list; struct event_handler; +struct func_resolver; + +typedef char *(pevent_func_resolver_t)(void *priv, + unsigned long long *addrp, char **modp); struct pevent { int ref_count; @@ -481,6 +485,7 @@ struct pevent { int cmdline_count; struct func_map *func_map; + struct func_resolver *func_resolver; struct func_list *funclist; unsigned int func_count; @@ -611,6 +616,9 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x10, }; +int pevent_set_function_resolver(struct pevent *pevent, + pevent_func_resolver_t *func, void *priv); +void pevent_reset_function_resolver(struct pevent *pevent); int pevent_register_comm(struct pevent *pevent, const char *comm, int pid); int pevent_register_trace_clock(struct pevent *pevent, const char *trace_clock); int pevent_register_function(struct pevent *pevent, char *name, -- cgit v0.10.2 From c3168b0db93ad5ffeede4ecdf807dab64270f55d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Jul 2015 16:14:29 -0300 Subject: perf symbols: Provide libtraceevent callback to resolve kernel symbols That provides the function signature expected by libtraceevent's pevent_set_function_resolver(). Acked-by: David Ahern Cc: Adrian Hunter Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-ie6hvlb6u15y4ulg9j1612zg@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index d0bf1e5..22006c1 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1993,3 +1993,17 @@ struct dso *machine__findnew_dso(struct machine *machine, const char *filename) { return dsos__findnew(&machine->dsos, filename); } + +char *machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp) +{ + struct machine *machine = vmachine; + struct map *map; + struct symbol *sym = map_groups__find_symbol(&machine->kmaps, MAP__FUNCTION, *addrp, &map, NULL); + + if (sym == NULL) + return NULL; + + *modp = __map__is_kmodule(map) ? 
(char *)map->dso->short_name : NULL; + *addrp = map->unmap_ip(map, sym->start); + return sym->name; +} diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 887798e..ff5f78c 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -237,5 +237,9 @@ int machine__synthesize_threads(struct machine *machine, struct target *target, pid_t machine__get_current_tid(struct machine *machine, int cpu); int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid, pid_t tid); +/* + * For use with libtraceevent's pevent_set_function_resolver() + */ +char *machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp); #endif /* __PERF_MACHINE_H */ diff --git a/tools/perf/util/trace-event.c b/tools/perf/util/trace-event.c index 6322d37..667bd10 100644 --- a/tools/perf/util/trace-event.c +++ b/tools/perf/util/trace-event.c @@ -9,6 +9,7 @@ #include #include #include "trace-event.h" +#include "machine.h" #include "util.h" /* @@ -19,6 +20,7 @@ * there. */ static struct trace_event tevent; +static bool tevent_initialized; int trace_event__init(struct trace_event *t) { @@ -32,6 +34,32 @@ int trace_event__init(struct trace_event *t) return pevent ? 0 : -1; } +static int trace_event__init2(void) +{ + int be = traceevent_host_bigendian(); + struct pevent *pevent; + + if (trace_event__init(&tevent)) + return -1; + + pevent = tevent.pevent; + pevent_set_flag(pevent, PEVENT_NSEC_OUTPUT); + pevent_set_file_bigendian(pevent, be); + pevent_set_host_bigendian(pevent, be); + tevent_initialized = true; + return 0; +} + +int trace_event__register_resolver(struct machine *machine) +{ + if (!tevent_initialized && trace_event__init2()) + return -1; + + return pevent_set_function_resolver(tevent.pevent, + machine__resolve_kernel_addr, + machine); +} + void trace_event__cleanup(struct trace_event *t) { traceevent_unload_plugins(t->plugin_list, t->pevent); @@ -62,21 +90,8 @@ tp_format(const char *sys, const char *name) struct event_format* trace_event__tp_format(const char *sys, const char *name) { - static bool initialized; - - if (!initialized) { - int be = traceevent_host_bigendian(); - struct pevent *pevent; - - if (trace_event__init(&tevent)) - return NULL; - - pevent = tevent.pevent; - pevent_set_flag(pevent, PEVENT_NSEC_OUTPUT); - pevent_set_file_bigendian(pevent, be); - pevent_set_host_bigendian(pevent, be); - initialized = true; - } + if (!tevent_initialized && trace_event__init2()) + return NULL; return tp_format(sys, name); } diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index d5168f0..568128c 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -18,6 +18,7 @@ struct trace_event { int trace_event__init(struct trace_event *t); void trace_event__cleanup(struct trace_event *t); +int trace_event__register_resolver(struct machine *machine); struct event_format* trace_event__tp_format(const char *sys, const char *name); -- cgit v0.10.2 From 706c3da409dfc6964740ac503c040c4c621954fd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Jul 2015 16:16:16 -0300 Subject: perf trace: Provide libtraceevent with a kernel symbol resolver So that beautifiers wanting to resolve kernel function addresses to names can do their work. Now, for instance, the 'timer' tracepoint beautifier works with 'perf trace', see the "function=tick..."
part: # perf trace --event timer:hrtimer_start 0.000 timer:hrtimer_start:hrtimer=0xffff88026f3101c0 function=tick_sched_timer/0x0 expires=52098339000000 softexpires=52098339000000) 0.003 timer:hrtimer_start:hrtimer=0xffff88026f3101c0 function=tick_sched_timer/0x0 expires=52098339000000 softexpires=52098339000000) Reported-by: Thomas Gleixner Acked-by: David Ahern Cc: Adrian Hunter Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-n4i0hxpbl1tnleiqkok47fw2@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 32b4d28..282841b1 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1489,6 +1489,9 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist) if (trace->host == NULL) return -ENOMEM; + if (trace_event__register_resolver(trace->host) < 0) + return -errno; + err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, evlist->threads, trace__tool_process, false, trace->opts.proc_map_timeout); -- cgit v0.10.2 From ccb3a8294a7d346257b526702eabcb2faeb721ac Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Jul 2015 16:43:37 -0300 Subject: perf script: Switch from perf.data's kallsyms to perf's symbol resolver We were storing a copy of kallsyms inside the perf.data file so that we could resolve kernel addresses to function (start, name, mod) tuples, but that can be achieved using the symbol resolving routines we have in symbols.c, which are used elsewhere in tools/perf. So, do just like 'perf trace' did and ask libtraceevent to use perf's symbol resolution routines. The next step is to just skip whatever kallsyms data is embedded in older perf.data files and finally to stop storing kallsyms in the perf.data file, as the 20-byte build-id stored in perf.data's header is enough to find out the right symtab (be it ELF, kcore, kallsyms, etc) to use. Acked-by: David Ahern Cc: Adrian Hunter Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-d0rtb8tk9j72pz0ehw5fnp24@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 24809787..2777c64 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1830,6 +1830,13 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) else symbol_conf.use_callchain = false; + if (pevent_set_function_resolver(session->tevent.pevent, + machine__resolve_kernel_addr, + &session->machines.host) < 0) { + pr_err("%s: failed to set libtraceevent function resolver\n", __func__); + return -1; + } + if (generate_script_lang) { struct stat perf_stat; int input; -- cgit v0.10.2 From 4263cece22e3da94f16fbbcf71ce3807946d3ef3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Jul 2015 16:48:16 -0300 Subject: perf tools: Stop reading the kallsyms data from perf.data As it is not used anymore, since 'perf script' switched to asking libtraceevent to use tools/perf's symbol resolution routines.
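For context, the payload being dropped is the textual /proc/kallsyms format, one "address type name [module]" tuple per line; reading one such line could be sketched as follows (illustrative only, assuming a 'line' buffer; the actual parser removed below used strtok_r()):

	unsigned long long addr;
	char type, name[128], mod[64];
	int n = sscanf(line, "%llx %c %127s [%63[^]]]", &addr, &type, name, mod);

	if (n >= 3)	/* n == 4 when the "[module]" suffix is present */
		pevent_register_function(pevent, name, addr, n == 4 ? mod : NULL);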
Acked-by: David Ahern Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-4ilhofz4b7o8yokvutjt9yzz@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index d495741..8ff7d62 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -135,36 +135,6 @@ void event_format__print(struct event_format *event, return event_format__fprintf(event, cpu, data, size, stdout); } -void parse_proc_kallsyms(struct pevent *pevent, - char *file, unsigned int size __maybe_unused) -{ - unsigned long long addr; - char *func; - char *line; - char *next = NULL; - char *addr_str; - char *mod; - char *fmt = NULL; - - line = strtok_r(file, "\n", &next); - while (line) { - mod = NULL; - addr_str = strtok_r(line, " ", &fmt); - addr = strtoull(addr_str, NULL, 16); - /* skip character */ - strtok_r(NULL, " ", &fmt); - func = strtok_r(NULL, "\t", &fmt); - mod = strtok_r(NULL, "]", &fmt); - /* truncate the extra '[' */ - if (mod) - mod = mod + 1; - - pevent_register_function(pevent, func, addr, mod); - - line = strtok_r(NULL, "\n", &next); - } -} - void parse_ftrace_printk(struct pevent *pevent, char *file, unsigned int size __maybe_unused) { diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index 54d9e9b..b67a0cc 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -162,25 +162,23 @@ out: static int read_proc_kallsyms(struct pevent *pevent) { unsigned int size; - char *buf; size = read4(pevent); if (!size) return 0; - - buf = malloc(size + 1); - if (buf == NULL) - return -1; - - if (do_read(buf, size) < 0) { - free(buf); - return -1; - } - buf[size] = '\0'; - - parse_proc_kallsyms(pevent, buf, size); - - free(buf); + /* + * Just skip it, now that we configure libtraceevent to use the + * tools/perf/ symbol resolver. + * + * We need to skip it so that we can continue parsing old perf.data + * files, which contain this /proc/kallsyms payload. + * + * Newer perf.data files will have just the 4-byte zero "kallsyms + * payload", so that older tools can continue reading it and interpret + * it as "no kallsyms payload is present". + */ + lseek(input_fd, size, SEEK_CUR); + trace_data_size += size; return 0; } -- cgit v0.10.2 From 6e5259e9b5b711b325a8455feb3ed27cdd0af0da Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 22 Jul 2015 17:02:18 -0300 Subject: perf tools: Stop copying kallsyms into the perf.data file header Since we now ask libtraceevent, the only user of this payload, to use perf's symbol resolution routines, there is no need to carry ~4.5MB per perf.data file when we can get it from one of the places the perf symbol resolution looks for that symtab (debuginfo, ~/.debug/, /proc/kallsyms, --symfs, etc), using the kernel and module build-ids to make sure the right table is used.
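The on-disk framing that keeps old and new tools compatible is just a 4-byte size followed by that many bytes of text; schematically (condensed from the reader shown above and the writer changed below):

	/* writer, new behaviour: announce an empty payload */
	unsigned int size = 0;
	write(output_fd, &size, 4);

	/* reader: skip whatever payload is announced, 0 in new files */
	size = read4(pevent);
	if (size)
		lseek(input_fd, size, SEEK_CUR);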
Acked-by: David Ahern Cc: Adrian Hunter Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-h89ituf9rso2rv1v7kjrbeda@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index eb72716..2224598 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -341,20 +341,14 @@ out: static int record_proc_kallsyms(void) { - unsigned int size; - const char *path = "/proc/kallsyms"; - struct stat st; - int ret, err = 0; - - ret = stat(path, &st); - if (ret < 0) { - /* not found */ - size = 0; - if (write(output_fd, &size, 4) != 4) - err = -EIO; - return err; - } - return record_file(path, 4); + unsigned long long size = 0; + /* + * Just to keep older perf.data file parsers happy, record a zero + * sized kallsyms file, i.e. do the same thing that was done when + * /proc/kallsyms (or something specified via --kallsyms, in a + * different path) couldn't be read. + */ + return write(output_fd, &size, 4) != 4 ? -EIO : 0; } static int record_ftrace_printk(void) -- cgit v0.10.2 From 45ac1403f564f411c6a383a2448688ba8dd705a4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 21 Jul 2015 12:44:02 +0300 Subject: perf: Add PERF_RECORD_SWITCH to indicate context switches There are already two events for context switches, namely the tracepoint sched:sched_switch and the software event context_switches. Unfortunately neither are suitable for use by non-privileged users for the purpose of synchronizing hardware trace data (e.g. Intel PT) to the context switch. Tracepoints are no good at all for non-privileged users because they need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= -1. On the other hand, kernel software events need either CAP_SYS_ADMIN or /proc/sys/kernel/perf_event_paranoid <= 1. Now many distributions do default perf_event_paranoid to 1 making context_switches a contender, except it has another problem (which is also shared with sched:sched_switch) which is that it happens before perf schedules events out instead of after perf schedules events in. Whereas a privileged user can see all the events anyway, a non-privileged user only sees events for their own processes, in other words they see when their process was scheduled out not when it was scheduled in. That presents two problems to use the event: 1. the information comes too late, so tools have to look ahead in the event stream to find out what the current state is 2. if they are unlucky tracing might have stopped before the context-switches event is recorded. This new PERF_RECORD_SWITCH event does not have those problems and it also has a couple of other small advantages. It is easier to use because it is an auxiliary event (like mmap, comm and task events) which can be enabled by setting a single bit. It is smaller than sched:sched_switch and easier to parse. To make the event useful for privileged users also, if the context is cpu-wide then the event record will be PERF_RECORD_SWITCH_CPU_WIDE which is the same as PERF_RECORD_SWITCH except it also provides the next or previous pid/tid. 
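For a user-space consumer the new record costs a single attribute bit; a minimal sketch of opening such an event (assumes a kernel with this patch; ring-buffer setup and error handling elided):

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_switch_events(int cpu)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_SOFTWARE;
		attr.config = PERF_COUNT_SW_DUMMY;	/* no samples, just side-band records */
		attr.context_switch = 1;		/* request PERF_RECORD_SWITCH* */
		attr.sample_id_all = 1;

		/* pid == -1 plus a cpu gives a cpu-wide context, so the kernel
		 * emits PERF_RECORD_SWITCH_CPU_WIDE with next/prev pid/tid. */
		return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
	}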
Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra (Intel) Tested-by: Jiri Olsa Cc: Andi Kleen Cc: Mathieu Poirier Cc: Pawel Moll Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1437471846-26995-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d97f84c..022d0ac 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -330,7 +330,8 @@ struct perf_event_attr { mmap2 : 1, /* include mmap with inode data */ comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid : 1, /* use @clockid for time fields */ - __reserved_1 : 38; + context_switch : 1, /* context switch data */ + __reserved_1 : 37; union { __u32 wakeup_events; /* wakeup every n events */ @@ -572,9 +573,11 @@ struct perf_event_mmap_page { /* * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on * different events so can reuse the same bit position. + * Ditto PERF_RECORD_MISC_SWITCH_OUT. */ #define PERF_RECORD_MISC_MMAP_DATA (1 << 13) #define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) /* * Indicates that the content of PERF_SAMPLE_IP points to * the actual instruction that triggered the event. See also @@ -818,6 +821,32 @@ enum perf_event_type { */ PERF_RECORD_LOST_SAMPLES = 13, + /* + * Records a context switch in or out (flagged by + * PERF_RECORD_MISC_SWITCH_OUT). See also + * PERF_RECORD_SWITCH_CPU_WIDE. + * + * struct { + * struct perf_event_header header; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH = 14, + + /* + * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and + * next_prev_tid that are the next (switching out) or previous + * (switching in) pid/tid. 
+ * + * struct { + * struct perf_event_header header; + * u32 next_prev_pid; + * u32 next_prev_tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH_CPU_WIDE = 15, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index d3dae34..ce21143 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; +static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev, local_irq_restore(flags); } +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in); + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task, if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, prev, true); + if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3454,6 +3464,10 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.freq) atomic_dec(&nr_freq_events); + if (event->attr.context_switch) { + static_key_slow_dec_deferred(&perf_sched_events); + atomic_dec(&nr_switch_events); + } if (is_cgroup_event(event)) static_key_slow_dec_deferred(&perf_sched_events); if (has_branch_stack(event)) @@ -5982,6 +5996,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) } /* + * context_switch tracking + */ + +struct perf_switch_event { + struct task_struct *task; + struct task_struct *next_prev; + + struct { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; + } event_id; +}; + +static int perf_event_switch_match(struct perf_event *event) +{ + return event->attr.context_switch; +} + +static void perf_event_switch_output(struct perf_event *event, void *data) +{ + struct perf_switch_event *se = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_switch_match(event)) + return; + + /* Only CPU-wide events are allowed to see next/prev pid/tid */ + if (event->ctx->task) { + se->event_id.header.type = PERF_RECORD_SWITCH; + se->event_id.header.size = sizeof(se->event_id.header); + } else { + se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; + se->event_id.header.size = sizeof(se->event_id); + se->event_id.next_prev_pid = + perf_event_pid(event, se->next_prev); + se->event_id.next_prev_tid = + perf_event_tid(event, se->next_prev); + } + + perf_event_header__init_id(&se->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, se->event_id.header.size); + if (ret) + return; + + if (event->ctx->task) + perf_output_put(&handle, se->event_id.header); + else + perf_output_put(&handle, se->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + 
+static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in) +{ + struct perf_switch_event switch_event; + + /* N.B. caller checks nr_switch_events != 0 */ + + switch_event = (struct perf_switch_event){ + .task = task, + .next_prev = next_prev, + .event_id = { + .header = { + /* .type */ + .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, + /* .size */ + }, + /* .next_prev_pid */ + /* .next_prev_tid */ + }, + }; + + perf_event_aux(perf_event_switch_output, + &switch_event, + NULL); +} + +/* * IRQ throttle logging */ @@ -7479,6 +7578,10 @@ static void account_event(struct perf_event *event) if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_full_kick_all(); } + if (event->attr.context_switch) { + atomic_inc(&nr_switch_events); + static_key_slow_inc(&perf_sched_events.key); + } if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); if (is_cgroup_event(event)) -- cgit v0.10.2 From 0286039f777ec0b6684868c34f7b16f97a069d6e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 21 Jul 2015 12:44:03 +0300 Subject: perf tools: Add new PERF_RECORD_SWITCH event Support processing of PERF_RECORD_SWITCH events and PERF_RECORD_SWITCH_CPU_WIDE events. There is a single tools callback for them both, so the tool must check the event type before using the extra members in PERF_RECORD_SWITCH_CPU_WIDE. There is still no way to select the events, though. That is added in a subsequent patch. Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra (Intel) Tested-by: Jiri Olsa Cc: Andi Kleen Cc: Mathieu Poirier Cc: Pawel Moll Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1437471846-26995-3-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 01b0649..f62c49b 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -561,6 +561,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) .lost = perf_event__repipe, .aux = perf_event__repipe, .itrace_start = perf_event__repipe, + .context_switch = perf_event__repipe, .read = perf_event__repipe_sample, .throttle = perf_event__repipe, .unthrottle = perf_event__repipe, diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 67a977e..7ff6127 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -26,6 +26,8 @@ static const char *perf_event__names[] = { [PERF_RECORD_AUX] = "AUX", [PERF_RECORD_ITRACE_START] = "ITRACE_START", [PERF_RECORD_LOST_SAMPLES] = "LOST_SAMPLES", + [PERF_RECORD_SWITCH] = "SWITCH", + [PERF_RECORD_SWITCH_CPU_WIDE] = "SWITCH_CPU_WIDE", [PERF_RECORD_HEADER_ATTR] = "ATTR", [PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE", [PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA", @@ -749,6 +751,14 @@ int perf_event__process_lost_samples(struct perf_tool *tool __maybe_unused, return machine__process_lost_samples_event(machine, event, sample); } +int perf_event__process_switch(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine) +{ + return machine__process_switch_event(machine, event); +} + size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp) { return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %c %s\n", @@ -827,6 +837,20 @@ size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp) event->itrace_start.pid, event->itrace_start.tid); } +size_t perf_event__fprintf_switch(union
perf_event *event, FILE *fp) +{ + bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT; + const char *in_out = out ? "OUT" : "IN "; + + if (event->header.type == PERF_RECORD_SWITCH) + return fprintf(fp, " %s\n", in_out); + + return fprintf(fp, " %s %s pid/tid: %5u/%-5u\n", + in_out, out ? "next" : "prev", + event->context_switch.next_prev_pid, + event->context_switch.next_prev_tid); +} + size_t perf_event__fprintf(union perf_event *event, FILE *fp) { size_t ret = fprintf(fp, "PERF_RECORD_%s", @@ -852,6 +876,10 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp) case PERF_RECORD_ITRACE_START: ret += perf_event__fprintf_itrace_start(event, fp); break; + case PERF_RECORD_SWITCH: + case PERF_RECORD_SWITCH_CPU_WIDE: + ret += perf_event__fprintf_switch(event, fp); + break; default: ret += fprintf(fp, "\n"); } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index c53f363..4bb2ae8 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -348,6 +348,12 @@ struct itrace_start_event { u32 pid, tid; }; +struct context_switch_event { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; +}; + union perf_event { struct perf_event_header header; struct mmap_event mmap; @@ -369,6 +375,7 @@ union perf_event { struct auxtrace_error_event auxtrace_error; struct aux_event aux; struct itrace_start_event itrace_start; + struct context_switch_event context_switch; }; void perf_event__print_totals(void); @@ -418,6 +425,10 @@ int perf_event__process_itrace_start(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); +int perf_event__process_switch(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); int perf_event__process_mmap(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -480,6 +491,7 @@ size_t perf_event__fprintf_mmap2(union perf_event *event, FILE *fp); size_t perf_event__fprintf_task(union perf_event *event, FILE *fp); size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp); size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp); +size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp); size_t perf_event__fprintf(union perf_event *event, FILE *fp); u64 kallsyms__get_function_start(const char *kallsyms_filename, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 49fb7b5..9e6e6f4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1127,6 +1127,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, PRINT_ATTRf(mmap2, p_unsigned); PRINT_ATTRf(comm_exec, p_unsigned); PRINT_ATTRf(use_clockid, p_unsigned); + PRINT_ATTRf(context_switch, p_unsigned); PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned); PRINT_ATTRf(bp_type, p_unsigned); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 22006c1..be3e008 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -550,6 +550,14 @@ int machine__process_itrace_start_event(struct machine *machine __maybe_unused, return 0; } +int machine__process_switch_event(struct machine *machine __maybe_unused, + union perf_event *event) +{ + if (dump_trace) + perf_event__fprintf_switch(event, stdout); + return 0; +} + struct map *machine__findnew_module_map(struct machine *machine, u64 start, const char *filename) { @@ -1451,6 +1459,9 @@ int machine__process_event(struct machine *machine, union perf_event *event, ret = 
machine__process_itrace_start_event(machine, event); break; case PERF_RECORD_LOST_SAMPLES: ret = machine__process_lost_samples_event(machine, event, sample); break; + case PERF_RECORD_SWITCH: + case PERF_RECORD_SWITCH_CPU_WIDE: + ret = machine__process_switch_event(machine, event); break; default: ret = -1; break; diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index ff5f78c..ea5cb4a 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -87,6 +87,8 @@ int machine__process_aux_event(struct machine *machine, union perf_event *event); int machine__process_itrace_start_event(struct machine *machine, union perf_event *event); +int machine__process_switch_event(struct machine *machine __maybe_unused, + union perf_event *event); int machine__process_mmap_event(struct machine *machine, union perf_event *event, struct perf_sample *sample); int machine__process_mmap2_event(struct machine *machine, union perf_event *event, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index fb1d525..2d95747 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -333,6 +333,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->aux = perf_event__process_aux; if (tool->itrace_start == NULL) tool->itrace_start = perf_event__process_itrace_start; + if (tool->context_switch == NULL) + tool->context_switch = perf_event__process_switch; if (tool->read == NULL) tool->read = process_event_sample_stub; if (tool->throttle == NULL) @@ -471,6 +473,19 @@ static void perf_event__itrace_start_swap(union perf_event *event, swap_sample_id_all(event, &event->itrace_start + 1); } +static void perf_event__switch_swap(union perf_event *event, bool sample_id_all) +{ + if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE) { + event->context_switch.next_prev_pid = + bswap_32(event->context_switch.next_prev_pid); + event->context_switch.next_prev_tid = + bswap_32(event->context_switch.next_prev_tid); + } + + if (sample_id_all) + swap_sample_id_all(event, &event->context_switch + 1); +} + static void perf_event__throttle_swap(union perf_event *event, bool sample_id_all) { @@ -633,6 +648,8 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_AUX] = perf_event__aux_swap, [PERF_RECORD_ITRACE_START] = perf_event__itrace_start_swap, [PERF_RECORD_LOST_SAMPLES] = perf_event__all64_swap, + [PERF_RECORD_SWITCH] = perf_event__switch_swap, + [PERF_RECORD_SWITCH_CPU_WIDE] = perf_event__switch_swap, [PERF_RECORD_HEADER_ATTR] = perf_event__hdr_attr_swap, [PERF_RECORD_HEADER_EVENT_TYPE] = perf_event__event_type_swap, [PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap, @@ -1094,6 +1111,9 @@ static int machines__deliver_event(struct machines *machines, return tool->aux(tool, event, sample, machine); case PERF_RECORD_ITRACE_START: return tool->itrace_start(tool, event, sample, machine); + case PERF_RECORD_SWITCH: + case PERF_RECORD_SWITCH_CPU_WIDE: + return tool->context_switch(tool, event, sample, machine); default: ++evlist->stats.nr_unknown_events; return -1; diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index c307dd4..cab8cc2 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -46,6 +46,7 @@ struct perf_tool { lost_samples, aux, itrace_start, + context_switch, throttle, unthrottle; event_attr_op attr; -- cgit v0.10.2 From b757bb09134f479a087ece08d2cd2a6ba31c9210 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 21 Jul 2015 12:44:04 +0300 Subject: perf record: Add option --switch-events to select 
PERF_RECORD_SWITCH events Add an option to select PERF_RECORD_SWITCH events. Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra (Intel) Tested-by: Arnaldo Carvalho de Melo Tested-by: Jiri Olsa Cc: Andi Kleen Cc: Mathieu Poirier Cc: Pawel Moll Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1437471846-26995-4-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 29e5307..63ee040 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -293,6 +293,10 @@ When processing pre-existing threads /proc/XXX/mmap, it may take a long time, because the file may be huge. A time out is needed in such cases. This option sets the time out limit. The default value is 500 ms. +--switch-events:: +Record context switch events i.e. events of type PERF_RECORD_SWITCH or +PERF_RECORD_SWITCH_CPU_WIDE. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1932e27..445a64d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1075,6 +1075,8 @@ struct option __record_options[] = { "opts", "AUX area tracing Snapshot Mode", ""), OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout, "per thread proc mmap processing timeout in ms"), + OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events, + "Record context switch events"), OPT_END() }; @@ -1102,6 +1104,11 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) " system-wide mode\n"); usage_with_options(record_usage, record_options); } + if (rec->opts.record_switch_events && + !perf_can_record_switch_events()) { + ui__error("kernel does not support recording context switch events (--switch-events option)\n"); + usage_with_options(record_usage, record_options); + } if (!rec->itr) { rec->itr = auxtrace_record__init(rec->evlist, &err); diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 937b16a..cf459f8 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -57,6 +57,7 @@ struct record_opts { bool running_time; bool full_auxtrace; bool auxtrace_snapshot_mode; + bool record_switch_events; unsigned int freq; unsigned int mmap_pages; unsigned int auxtrace_mmap_pages; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 406a821..a8930b6 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -114,6 +114,7 @@ void perf_evlist__close(struct perf_evlist *evlist); void perf_evlist__set_id_pos(struct perf_evlist *evlist); bool perf_can_sample_identifier(void); +bool perf_can_record_switch_events(void); void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts); int record_opts__config(struct record_opts *opts); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9e6e6f4..71f6905 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -738,6 +738,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) attr->mmap2 = track && !perf_missing_features.mmap2; attr->comm = track; + if (opts->record_switch_events) + attr->context_switch = track; + if (opts->sample_transaction) perf_evsel__set_sample_bit(evsel, TRANSACTION); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 1f7becb..0d228a2 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -85,6 +85,11 @@ static void perf_probe_comm_exec(struct 
perf_evsel *evsel) evsel->attr.comm_exec = 1; } +static void perf_probe_context_switch(struct perf_evsel *evsel) +{ + evsel->attr.context_switch = 1; +} + bool perf_can_sample_identifier(void) { return perf_probe_api(perf_probe_sample_identifier); @@ -95,6 +100,11 @@ static bool perf_can_comm_exec(void) return perf_probe_api(perf_probe_comm_exec); } +bool perf_can_record_switch_events(void) +{ + return perf_probe_api(perf_probe_context_switch); +} + void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts) { struct perf_evsel *evsel; -- cgit v0.10.2 From 06b234ec26fde8d0fce54030fe0858e218636410 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 21 Jul 2015 12:44:05 +0300 Subject: perf script: Don't assume evsel position of tracking events The tracking event does not have to be the first event so replace perf_evlist__first() with perf_evlist__id2evsel() which uses the event ID to find the correct evsel. Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra (Intel) Tested-by: Jiri Olsa Cc: Andi Kleen Cc: Mathieu Poirier Cc: Pawel Moll Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1437471846-26995-5-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 2777c64..8a8e2ea 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -661,7 +661,7 @@ static int process_comm_event(struct perf_tool *tool, struct thread *thread; struct perf_script *script = container_of(tool, struct perf_script, tool); struct perf_session *session = script->session; - struct perf_evsel *evsel = perf_evlist__first(session->evlist); + struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); int ret = -1; thread = machine__findnew_thread(machine, event->comm.pid, event->comm.tid); @@ -695,7 +695,7 @@ static int process_fork_event(struct perf_tool *tool, struct thread *thread; struct perf_script *script = container_of(tool, struct perf_script, tool); struct perf_session *session = script->session; - struct perf_evsel *evsel = perf_evlist__first(session->evlist); + struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); if (perf_event__process_fork(tool, event, sample, machine) < 0) return -1; @@ -727,7 +727,7 @@ static int process_exit_event(struct perf_tool *tool, struct thread *thread; struct perf_script *script = container_of(tool, struct perf_script, tool); struct perf_session *session = script->session; - struct perf_evsel *evsel = perf_evlist__first(session->evlist); + struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid); if (thread == NULL) { @@ -759,7 +759,7 @@ static int process_mmap_event(struct perf_tool *tool, struct thread *thread; struct perf_script *script = container_of(tool, struct perf_script, tool); struct perf_session *session = script->session; - struct perf_evsel *evsel = perf_evlist__first(session->evlist); + struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); if (perf_event__process_mmap(tool, event, sample, machine) < 0) return -1; @@ -790,7 +790,7 @@ static int process_mmap2_event(struct perf_tool *tool, struct thread *thread; struct perf_script *script = container_of(tool, struct perf_script, tool); struct perf_session *session = script->session; - struct perf_evsel *evsel = perf_evlist__first(session->evlist); + struct perf_evsel *evsel = 
perf_evlist__id2evsel(session->evlist, sample->id); if (perf_event__process_mmap2(tool, event, sample, machine) < 0) return -1; -- cgit v0.10.2 From 7c14898ba9386ee5c939bb418643ac6baff52840 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 21 Jul 2015 12:44:06 +0300 Subject: perf script: Add option --show-switch-events Add option --show-switch-events to show switch events in a similar fashion to --show-task-events and --show-mmap-events. Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra (Intel) Tested-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Mathieu Poirier Cc: Pawel Moll Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1437471846-26995-6-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index c82df57..e2fec5f 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -222,6 +222,10 @@ OPTIONS --show-mmap-events Display mmap related events (e.g. MMAP, MMAP2). +--show-switch-events + Display context switch events i.e. events of type PERF_RECORD_SWITCH or + PERF_RECORD_SWITCH_CPU_WIDE. + --header Show perf.data header. diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8a8e2ea..bd31380 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -623,6 +623,7 @@ struct perf_script { struct perf_session *session; bool show_task_events; bool show_mmap_events; + bool show_switch_events; }; static int process_attr(struct perf_tool *tool, union perf_event *event, @@ -813,6 +814,32 @@ static int process_mmap2_event(struct perf_tool *tool, return 0; } +static int process_switch_event(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + struct thread *thread; + struct perf_script *script = container_of(tool, struct perf_script, tool); + struct perf_session *session = script->session; + struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id); + + if (perf_event__process_switch(tool, event, sample, machine) < 0) + return -1; + + thread = machine__findnew_thread(machine, sample->pid, + sample->tid); + if (thread == NULL) { + pr_debug("problem processing SWITCH event, skipping it.\n"); + return -1; + } + + print_sample_start(sample, thread, evsel); + perf_event__fprintf(event, stdout); + thread__put(thread); + return 0; +} + static void sig_handler(int sig __maybe_unused) { session_done = 1; @@ -834,6 +861,8 @@ static int __cmd_script(struct perf_script *script) script->tool.mmap = process_mmap_event; script->tool.mmap2 = process_mmap2_event; } + if (script->show_switch_events) + script->tool.context_switch = process_switch_event; ret = perf_session__process_events(script->session); @@ -1618,6 +1647,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) "Show the fork/comm/exit events"), OPT_BOOLEAN('\0', "show-mmap-events", &script.show_mmap_events, "Show the mmap events"), + OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events, + "Show context switch events (if recorded)"), OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"), OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts", "Instruction Tracing options", -- cgit v0.10.2 From 959c2199d4c0e330d17f1f5234527e87132d5e07 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 24 Jul 2015 12:13:05 -0300 Subject: perf python: 
Remove dependency on 'machine' methods The python binding still doesn't provide symbol resolving facilities, but the recent addition of the trace_event__register_resolver() function made it add as a dependency the machine__resolve_kernel_addr() method, which in turn drags all the symbol resolving code. The problem: [root@zoo ~]# perf test -v python 17: Try 'import perf' in python, checking link problems : --- start --- test child forked, pid 6853 Traceback (most recent call last): File "<stdin>", line 1, in <module> ImportError: /tmp/build/perf/python/perf.so: undefined symbol: machine__resolve_kernel_addr test child finished with -1 ---- end ---- Try 'import perf' in python, checking link problems: FAILED! [root@zoo ~]# Fix it by requiring this function to receive the resolver as a parameter, just like pevent_register_function_resolver(), i.e. do not explicitly refer to an object file not included in tools/perf/util/python-ext-sources. [root@zoo ~]# perf test python 17: Try 'import perf' in python, checking link problems : Ok [root@zoo ~]# Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Fixes: c3168b0db93a ("perf symbols: Provide libtraceevent callback to resolve kernel symbols") Link: http://lkml.kernel.org/n/tip-vxlhh95v2em9zdbgj3jm7xi5@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 282841b1..06cfa93 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1489,7 +1489,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist) if (trace->host == NULL) return -ENOMEM; - if (trace_event__register_resolver(trace->host) < 0) + if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0) return -errno; err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, diff --git a/tools/perf/util/trace-event.c b/tools/perf/util/trace-event.c index 667bd10..b90e646 100644 --- a/tools/perf/util/trace-event.c +++ b/tools/perf/util/trace-event.c @@ -50,14 +50,13 @@ static int trace_event__init2(void) return 0; } -int trace_event__register_resolver(struct machine *machine) +int trace_event__register_resolver(struct machine *machine, + pevent_func_resolver_t *func) { if (!tevent_initialized && trace_event__init2()) return -1; - return pevent_set_function_resolver(tevent.pevent, - machine__resolve_kernel_addr, - machine); + return pevent_set_function_resolver(tevent.pevent, func, machine); } void trace_event__cleanup(struct trace_event *t) diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index 568128c..da6cc4c 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -18,7 +18,8 @@ struct trace_event { int trace_event__init(struct trace_event *t); void trace_event__cleanup(struct trace_event *t); -int trace_event__register_resolver(struct machine *machine); +int trace_event__register_resolver(struct machine *machine, + pevent_func_resolver_t *func); struct event_format* trace_event__tp_format(const char *sys, const char *name); -- cgit v0.10.2 From 5865fe361a234e50b5217da3edd2c5571fffc7fd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 24 Jul 2015 13:00:03 -0300 Subject: perf python: Add macro to simplify maintenance of the constants array Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/n/tip-ffuchgsbr5mqu91xl9oggfss@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo
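The macro relies on preprocessor stringification (#name) and token pasting (PERF_##name), so each constant is spelled only once; for instance:

	#define PERF_CONST(name) { #name, PERF_##name }

	PERF_CONST(TYPE_HARDWARE)
	/* expands to: */
	{ "TYPE_HARDWARE", PERF_TYPE_HARDWARE }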
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 626422e..a851b79 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -941,76 +941,78 @@ static int pyrf_evlist__setup_types(void) return PyType_Ready(&pyrf_evlist__type); } +#define PERF_CONST(name) { #name, PERF_##name } + static struct { const char *name; int value; } perf__constants[] = { - { "TYPE_HARDWARE", PERF_TYPE_HARDWARE }, - { "TYPE_SOFTWARE", PERF_TYPE_SOFTWARE }, - { "TYPE_TRACEPOINT", PERF_TYPE_TRACEPOINT }, - { "TYPE_HW_CACHE", PERF_TYPE_HW_CACHE }, - { "TYPE_RAW", PERF_TYPE_RAW }, - { "TYPE_BREAKPOINT", PERF_TYPE_BREAKPOINT }, - - { "COUNT_HW_CPU_CYCLES", PERF_COUNT_HW_CPU_CYCLES }, - { "COUNT_HW_INSTRUCTIONS", PERF_COUNT_HW_INSTRUCTIONS }, - { "COUNT_HW_CACHE_REFERENCES", PERF_COUNT_HW_CACHE_REFERENCES }, - { "COUNT_HW_CACHE_MISSES", PERF_COUNT_HW_CACHE_MISSES }, - { "COUNT_HW_BRANCH_INSTRUCTIONS", PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - { "COUNT_HW_BRANCH_MISSES", PERF_COUNT_HW_BRANCH_MISSES }, - { "COUNT_HW_BUS_CYCLES", PERF_COUNT_HW_BUS_CYCLES }, - { "COUNT_HW_CACHE_L1D", PERF_COUNT_HW_CACHE_L1D }, - { "COUNT_HW_CACHE_L1I", PERF_COUNT_HW_CACHE_L1I }, - { "COUNT_HW_CACHE_LL", PERF_COUNT_HW_CACHE_LL }, - { "COUNT_HW_CACHE_DTLB", PERF_COUNT_HW_CACHE_DTLB }, - { "COUNT_HW_CACHE_ITLB", PERF_COUNT_HW_CACHE_ITLB }, - { "COUNT_HW_CACHE_BPU", PERF_COUNT_HW_CACHE_BPU }, - { "COUNT_HW_CACHE_OP_READ", PERF_COUNT_HW_CACHE_OP_READ }, - { "COUNT_HW_CACHE_OP_WRITE", PERF_COUNT_HW_CACHE_OP_WRITE }, - { "COUNT_HW_CACHE_OP_PREFETCH", PERF_COUNT_HW_CACHE_OP_PREFETCH }, - { "COUNT_HW_CACHE_RESULT_ACCESS", PERF_COUNT_HW_CACHE_RESULT_ACCESS }, - { "COUNT_HW_CACHE_RESULT_MISS", PERF_COUNT_HW_CACHE_RESULT_MISS }, - - { "COUNT_HW_STALLED_CYCLES_FRONTEND", PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - { "COUNT_HW_STALLED_CYCLES_BACKEND", PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, - - { "COUNT_SW_CPU_CLOCK", PERF_COUNT_SW_CPU_CLOCK }, - { "COUNT_SW_TASK_CLOCK", PERF_COUNT_SW_TASK_CLOCK }, - { "COUNT_SW_PAGE_FAULTS", PERF_COUNT_SW_PAGE_FAULTS }, - { "COUNT_SW_CONTEXT_SWITCHES", PERF_COUNT_SW_CONTEXT_SWITCHES }, - { "COUNT_SW_CPU_MIGRATIONS", PERF_COUNT_SW_CPU_MIGRATIONS }, - { "COUNT_SW_PAGE_FAULTS_MIN", PERF_COUNT_SW_PAGE_FAULTS_MIN }, - { "COUNT_SW_PAGE_FAULTS_MAJ", PERF_COUNT_SW_PAGE_FAULTS_MAJ }, - { "COUNT_SW_ALIGNMENT_FAULTS", PERF_COUNT_SW_ALIGNMENT_FAULTS }, - { "COUNT_SW_EMULATION_FAULTS", PERF_COUNT_SW_EMULATION_FAULTS }, - { "COUNT_SW_DUMMY", PERF_COUNT_SW_DUMMY }, - - { "SAMPLE_IP", PERF_SAMPLE_IP }, - { "SAMPLE_TID", PERF_SAMPLE_TID }, - { "SAMPLE_TIME", PERF_SAMPLE_TIME }, - { "SAMPLE_ADDR", PERF_SAMPLE_ADDR }, - { "SAMPLE_READ", PERF_SAMPLE_READ }, - { "SAMPLE_CALLCHAIN", PERF_SAMPLE_CALLCHAIN }, - { "SAMPLE_ID", PERF_SAMPLE_ID }, - { "SAMPLE_CPU", PERF_SAMPLE_CPU }, - { "SAMPLE_PERIOD", PERF_SAMPLE_PERIOD }, - { "SAMPLE_STREAM_ID", PERF_SAMPLE_STREAM_ID }, - { "SAMPLE_RAW", PERF_SAMPLE_RAW }, - - { "FORMAT_TOTAL_TIME_ENABLED", PERF_FORMAT_TOTAL_TIME_ENABLED }, - { "FORMAT_TOTAL_TIME_RUNNING", PERF_FORMAT_TOTAL_TIME_RUNNING }, - { "FORMAT_ID", PERF_FORMAT_ID }, - { "FORMAT_GROUP", PERF_FORMAT_GROUP }, - - { "RECORD_MMAP", PERF_RECORD_MMAP }, - { "RECORD_LOST", PERF_RECORD_LOST }, - { "RECORD_COMM", PERF_RECORD_COMM }, - { "RECORD_EXIT", PERF_RECORD_EXIT }, - { "RECORD_THROTTLE", PERF_RECORD_THROTTLE }, - { "RECORD_UNTHROTTLE", PERF_RECORD_UNTHROTTLE }, - { "RECORD_FORK", PERF_RECORD_FORK }, - { "RECORD_READ", PERF_RECORD_READ }, - { "RECORD_SAMPLE", PERF_RECORD_SAMPLE }, + PERF_CONST(TYPE_HARDWARE), + 
PERF_CONST(TYPE_SOFTWARE), + PERF_CONST(TYPE_TRACEPOINT), + PERF_CONST(TYPE_HW_CACHE), + PERF_CONST(TYPE_RAW), + PERF_CONST(TYPE_BREAKPOINT), + + PERF_CONST(COUNT_HW_CPU_CYCLES), + PERF_CONST(COUNT_HW_INSTRUCTIONS), + PERF_CONST(COUNT_HW_CACHE_REFERENCES), + PERF_CONST(COUNT_HW_CACHE_MISSES), + PERF_CONST(COUNT_HW_BRANCH_INSTRUCTIONS), + PERF_CONST(COUNT_HW_BRANCH_MISSES), + PERF_CONST(COUNT_HW_BUS_CYCLES), + PERF_CONST(COUNT_HW_CACHE_L1D), + PERF_CONST(COUNT_HW_CACHE_L1I), + PERF_CONST(COUNT_HW_CACHE_LL), + PERF_CONST(COUNT_HW_CACHE_DTLB), + PERF_CONST(COUNT_HW_CACHE_ITLB), + PERF_CONST(COUNT_HW_CACHE_BPU), + PERF_CONST(COUNT_HW_CACHE_OP_READ), + PERF_CONST(COUNT_HW_CACHE_OP_WRITE), + PERF_CONST(COUNT_HW_CACHE_OP_PREFETCH), + PERF_CONST(COUNT_HW_CACHE_RESULT_ACCESS), + PERF_CONST(COUNT_HW_CACHE_RESULT_MISS), + + PERF_CONST(COUNT_HW_STALLED_CYCLES_FRONTEND), + PERF_CONST(COUNT_HW_STALLED_CYCLES_BACKEND), + + PERF_CONST(COUNT_SW_CPU_CLOCK), + PERF_CONST(COUNT_SW_TASK_CLOCK), + PERF_CONST(COUNT_SW_PAGE_FAULTS), + PERF_CONST(COUNT_SW_CONTEXT_SWITCHES), + PERF_CONST(COUNT_SW_CPU_MIGRATIONS), + PERF_CONST(COUNT_SW_PAGE_FAULTS_MIN), + PERF_CONST(COUNT_SW_PAGE_FAULTS_MAJ), + PERF_CONST(COUNT_SW_ALIGNMENT_FAULTS), + PERF_CONST(COUNT_SW_EMULATION_FAULTS), + PERF_CONST(COUNT_SW_DUMMY), + + PERF_CONST(SAMPLE_IP), + PERF_CONST(SAMPLE_TID), + PERF_CONST(SAMPLE_TIME), + PERF_CONST(SAMPLE_ADDR), + PERF_CONST(SAMPLE_READ), + PERF_CONST(SAMPLE_CALLCHAIN), + PERF_CONST(SAMPLE_ID), + PERF_CONST(SAMPLE_CPU), + PERF_CONST(SAMPLE_PERIOD), + PERF_CONST(SAMPLE_STREAM_ID), + PERF_CONST(SAMPLE_RAW), + + PERF_CONST(FORMAT_TOTAL_TIME_ENABLED), + PERF_CONST(FORMAT_TOTAL_TIME_RUNNING), + PERF_CONST(FORMAT_ID), + PERF_CONST(FORMAT_GROUP), + + PERF_CONST(RECORD_MMAP), + PERF_CONST(RECORD_LOST), + PERF_CONST(RECORD_COMM), + PERF_CONST(RECORD_EXIT), + PERF_CONST(RECORD_THROTTLE), + PERF_CONST(RECORD_UNTHROTTLE), + PERF_CONST(RECORD_FORK), + PERF_CONST(RECORD_READ), + PERF_CONST(RECORD_SAMPLE), { .name = NULL, }, }; -- cgit v0.10.2 From 84576da2f72e51be63c5b63c0e297028fa7ac59f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 24 Jul 2015 13:04:09 -0300 Subject: perf python: Add missing PERF_RECORD_{MMAP2,AUX,etc} Those were added to the kernel and tooling but we forgot to expose them via the python binding, fix it. 
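At module init time each entry of that table ends up in a CPython constant registration call; schematically (a sketch using the documented PyModule_AddIntConstant(), not necessarily the exact loop in python.c, with 'module' already created):

	int i;

	for (i = 0; perf__constants[i].name != NULL; ++i)
		if (PyModule_AddIntConstant(module, perf__constants[i].name,
					    perf__constants[i].value) < 0)
			return;	/* module initialization failed */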
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-sg1m6t2c58gchidfce4hmitg@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index a851b79..6324fe6 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -1013,6 +1013,12 @@ static struct { PERF_CONST(RECORD_FORK), PERF_CONST(RECORD_READ), PERF_CONST(RECORD_SAMPLE), + PERF_CONST(RECORD_MMAP2), + PERF_CONST(RECORD_AUX), + PERF_CONST(RECORD_ITRACE_START), + PERF_CONST(RECORD_LOST_SAMPLES), + PERF_CONST(RECORD_SWITCH), + PERF_CONST(RECORD_SWITCH_CPU_WIDE), { .name = NULL, }, }; -- cgit v0.10.2 From 58b32c1b538f2d197ce385d6a314e83f8b787021 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 27 Jul 2015 17:13:21 -0300 Subject: perf python: Make twatch.py use soft dummy event, freq=0 We do not want to sample; what we want are just the PERF_RECORD_ lifetime events for threads. Using the default, PERF_TYPE_HARDWARE + PERF_COUNT_HW_CYCLES with freq=1 (the default), makes perf reenable irq_vectors:local_timer_entry, disabling nohz, which is not good for some use cases where all we want is to get notifications when threads come and go... Fix it by using PERF_TYPE_SOFTWARE (no counter rotation) and PERF_COUNT_SW_DUMMY (created by Adrian so that we could have access to those PERF_RECORD_ goodies). Reported-by: Luiz Fernando Capitulino Suggested-by: Peter Zijlstra Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jaroslav Skarvada Cc: Jeremy Eder Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-kfsijirfrs6xfhkcdxeoen06@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/python/twatch.py b/tools/perf/python/twatch.py index 2225162..b9d5083 100755 --- a/tools/perf/python/twatch.py +++ b/tools/perf/python/twatch.py @@ -18,10 +18,20 @@ import perf def main(): cpus = perf.cpu_map() threads = perf.thread_map() - evsel = perf.evsel(task = 1, comm = 1, mmap = 0, + evsel = perf.evsel(type = perf.TYPE_SOFTWARE, + config = perf.COUNT_SW_DUMMY, + task = 1, comm = 1, mmap = 0, freq = 0, wakeup_events = 1, watermark = 1, sample_id_all = 1, sample_type = perf.SAMPLE_PERIOD | perf.SAMPLE_TID | perf.SAMPLE_CPU) + + """What we want are just the PERF_RECORD_ lifetime events for threads, + using the default, PERF_TYPE_HARDWARE + PERF_COUNT_HW_CYCLES & freq=1 + (the default), makes perf reenable irq_vectors:local_timer_entry, when + disabling nohz, not good for some use cases where all we want is to get + notified when threads come and go...
So use (perf.TYPE_SOFTWARE, perf.COUNT_SW_DUMMY, + freq=0) instead.""" + evsel.open(cpus = cpus, threads = threads); evlist = perf.evlist(cpus, threads) evlist.add(evsel) -- cgit v0.10.2 From f785f2357673d520a0b7b468973cdd197f336494 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 28 Jul 2015 12:01:33 -0300 Subject: perf symbols: Fix mismatched declarations for elf_getphdrnum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When HAVE_ELF_GETPHDRNUM_SUPPORT is false we trip on this problem: CC /tmp/build/perf/util/symbol-elf.o util/symbol-elf.c:41:12: error: static declaration of ‘elf_getphdrnum’ follows non-static declaration static int elf_getphdrnum(Elf *elf, size_t *dst) ^ In file included from util/symbol.h:19:0, from util/symbol-elf.c:8: /usr/include/libelf.h:206:12: note: previous declaration of ‘elf_getphdrnum’ was here extern int elf_getphdrnum (Elf *__elf, size_t *__dst); ^ MKDIR /tmp/build/perf/bench/ /home/git/linux/tools/build/Makefile.build:68: recipe for target '/tmp/build/perf/util/symbol-elf.o' failed make[3]: *** [/tmp/build/perf/util/symbol-elf.o] Error 1 Fix it. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-qcmekyfedmov4sxr0wahcikr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 65f7e38..b0ad810 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -38,7 +38,7 @@ static inline char *bfd_demangle(void __maybe_unused *v, #endif #ifndef HAVE_ELF_GETPHDRNUM_SUPPORT -static int elf_getphdrnum(Elf *elf, size_t *dst) +int elf_getphdrnum(Elf *elf, size_t *dst) { GElf_Ehdr gehdr; GElf_Ehdr *ehdr; -- cgit v0.10.2 From 4c7de49a2977aa2a0f556c803afbb24848372e7a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 29 Jul 2015 12:18:24 -0300 Subject: perf session env: Rename exit method The semantic associated in tools/perf/ with foo__delete(instance) is to release all resources referenced by 'instance' members and then release the memory for 'instance' itself. The perf_session_env__delete() function isn't doing this, it just does the first part, but the space used by 'instance' itself isn't freed, as it is embedded in a larger structure, which will be freed at another stage. For these cases we use foo__exit(), i.e. the usage is: void foo__delete(foo) { if (foo) { foo__exit(foo); free(foo); } } But when we have something like: struct bar { struct foo foo; . . . } Then we can't really call foo__delete(&bar.foo), we must have this instead: void bar__exit(bar) { foo__exit(&bar.foo); /* free other bar-> resources */ } void bar__delete(bar) { if (bar) { bar__exit(bar); free(bar); } } So just rename perf_session_env__delete() to perf_session_env__exit().
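Spelled out as real C, the convention reads (a generic sketch of the naming pattern, not code from the tree):

	struct foo {
		char *buf;
	};

	void foo__exit(struct foo *f)	/* release only what 'f' references */
	{
		zfree(&f->buf);
	}

	void foo__delete(struct foo *f)	/* members first, then 'f' itself */
	{
		if (f) {
			foo__exit(f);
			free(f);
		}
	}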
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-djbgpcfo5udqptx3q0flwtmk@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 2d95747..f51eb54 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -170,7 +170,7 @@ static void perf_session__delete_threads(struct perf_session *session) machine__delete_threads(&session->machines.host); } -static void perf_session_env__delete(struct perf_session_env *env) +static void perf_session_env__exit(struct perf_session_env *env) { zfree(&env->hostname); zfree(&env->os_release); @@ -193,7 +193,7 @@ void perf_session__delete(struct perf_session *session) auxtrace_index__free(&session->auxtrace_index); perf_session__destroy_kernel_maps(session); perf_session__delete_threads(session); - perf_session_env__delete(&session->header.env); + perf_session_env__exit(&session->header.env); machines__exit(&session->machines); if (session->file) perf_data_file__close(session->file); -- cgit v0.10.2 From 930a2e29758f865e3a7b34b8b3b37c08d40f0254 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 29 Jul 2015 05:42:10 -0400 Subject: perf tools: Add support for event post configuration Add support to overload any global settings for event and force user specified term value. It will be useful for new time and backtrace terms. Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: Kan Liang Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1438162936-59698-2-git-send-email-kan.liang@intel.com Signed-off-by: Kan Liang Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 71f6905..048d61d 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -207,6 +207,7 @@ void perf_evsel__init(struct perf_evsel *evsel, evsel->unit = ""; evsel->scale = 1.0; INIT_LIST_HEAD(&evsel->node); + INIT_LIST_HEAD(&evsel->config_terms); perf_evsel__object.init(evsel); evsel->sample_size = __perf_evsel__sample_size(attr->sample_type); perf_evsel__calc_id_pos(evsel); @@ -586,6 +587,19 @@ perf_evsel__config_callgraph(struct perf_evsel *evsel, } } +static void apply_config_terms(struct perf_event_attr *attr __maybe_unused, + struct list_head *config_terms) +{ + struct perf_evsel_config_term *term; + + list_for_each_entry(term, config_terms, list) { + switch (term->type) { + default: + break; + } + } +} + /* * The enable_on_exec/disabled value strategy: * @@ -777,6 +791,12 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) attr->use_clockid = 1; attr->clockid = opts->clockid; } + + /* + * Apply event specific term settings, + * it overloads any global configuration. 
+ */ + apply_config_terms(attr, &evsel->config_terms); } static int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) @@ -900,6 +920,16 @@ static void perf_evsel__free_id(struct perf_evsel *evsel) zfree(&evsel->id); } +static void perf_evsel__free_config_terms(struct perf_evsel *evsel) +{ + struct perf_evsel_config_term *term, *h; + + list_for_each_entry_safe(term, h, &evsel->config_terms, list) { + list_del(&term->list); + free(term); + } +} + void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads) { int cpu, thread; @@ -919,6 +949,7 @@ void perf_evsel__exit(struct perf_evsel *evsel) assert(list_empty(&evsel->node)); perf_evsel__free_fd(evsel); perf_evsel__free_id(evsel); + perf_evsel__free_config_terms(evsel); close_cgroup(evsel->cgrp); cpu_map__put(evsel->cpus); thread_map__put(evsel->threads); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 1fc263a..0339819 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -31,6 +31,24 @@ struct perf_sample_id { struct cgroup_sel; +/* + * The 'struct perf_evsel_config_term' is used to pass event + * specific configuration data to perf_evsel__config routine. + * It is allocated within event parsing and attached to + * perf_evsel::config_terms list head. +*/ +enum { + PERF_EVSEL__CONFIG_TERM_MAX, +}; + +struct perf_evsel_config_term { + struct list_head list; + int type; + union { + u64 period; + } val; +}; + /** struct perf_evsel - event selector * * @name - Can be set to retain the original event name passed by the user, @@ -87,6 +105,7 @@ struct perf_evsel { struct perf_evsel *leader; char *group_name; bool cmdline_group_boundary; + struct list_head config_terms; }; union u64_swap { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4f807fc..3271d13 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -276,7 +276,8 @@ const char *event_type(int type) static struct perf_evsel * __add_event(struct list_head *list, int *idx, struct perf_event_attr *attr, - char *name, struct cpu_map *cpus) + char *name, struct cpu_map *cpus, + struct list_head *config_terms) { struct perf_evsel *evsel; @@ -291,14 +292,19 @@ __add_event(struct list_head *list, int *idx, if (name) evsel->name = strdup(name); + + if (config_terms) + list_splice(config_terms, &evsel->config_terms); + list_add_tail(&evsel->node, list); return evsel; } static int add_event(struct list_head *list, int *idx, - struct perf_event_attr *attr, char *name) + struct perf_event_attr *attr, char *name, + struct list_head *config_terms) { - return __add_event(list, idx, attr, name, NULL) ? 0 : -ENOMEM; + return __add_event(list, idx, attr, name, NULL, config_terms) ? 
0 : -ENOMEM; } static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES], int size) @@ -377,7 +383,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, memset(&attr, 0, sizeof(attr)); attr.config = cache_type | (cache_op << 8) | (cache_result << 16); attr.type = PERF_TYPE_HW_CACHE; - return add_event(list, idx, &attr, name); + return add_event(list, idx, &attr, name, NULL); } static int add_tracepoint(struct list_head *list, int *idx, @@ -539,7 +545,7 @@ int parse_events_add_breakpoint(struct list_head *list, int *idx, attr.type = PERF_TYPE_BREAKPOINT; attr.sample_period = 1; - return add_event(list, idx, &attr, NULL); + return add_event(list, idx, &attr, NULL, NULL); } static int check_type_val(struct parse_events_term *term, @@ -622,22 +628,56 @@ static int config_attr(struct perf_event_attr *attr, return 0; } +static int get_config_terms(struct list_head *head_config, + struct list_head *head_terms __maybe_unused) +{ +#define ADD_CONFIG_TERM(__type, __name, __val) \ +do { \ + struct perf_evsel_config_term *__t; \ + \ + __t = zalloc(sizeof(*__t)); \ + if (!__t) \ + return -ENOMEM; \ + \ + INIT_LIST_HEAD(&__t->list); \ + __t->type = PERF_EVSEL__CONFIG_TERM_ ## __type; \ + __t->val.__name = __val; \ + list_add_tail(&__t->list, head_terms); \ +} while (0) + + struct parse_events_term *term; + + list_for_each_entry(term, head_config, list) { + switch (term->type_term) { + default: + break; + } + } +#undef ADD_EVSEL_CONFIG + return 0; +} + int parse_events_add_numeric(struct parse_events_evlist *data, struct list_head *list, u32 type, u64 config, struct list_head *head_config) { struct perf_event_attr attr; + LIST_HEAD(config_terms); memset(&attr, 0, sizeof(attr)); attr.type = type; attr.config = config; - if (head_config && - config_attr(&attr, head_config, data->error)) - return -EINVAL; + if (head_config) { + if (config_attr(&attr, head_config, data->error)) + return -EINVAL; + + if (get_config_terms(head_config, &config_terms)) + return -ENOMEM; + } - return add_event(list, &data->idx, &attr, NULL); + return add_event(list, &data->idx, &attr, NULL, &config_terms); } static int parse_events__is_name_term(struct parse_events_term *term) @@ -664,6 +704,7 @@ int parse_events_add_pmu(struct parse_events_evlist *data, struct perf_pmu_info info; struct perf_pmu *pmu; struct perf_evsel *evsel; + LIST_HEAD(config_terms); pmu = perf_pmu__find(name); if (!pmu) @@ -678,7 +719,7 @@ int parse_events_add_pmu(struct parse_events_evlist *data, if (!head_config) { attr.type = pmu->type; - evsel = __add_event(list, &data->idx, &attr, NULL, pmu->cpus); + evsel = __add_event(list, &data->idx, &attr, NULL, pmu->cpus, NULL); return evsel ? 
0 : -ENOMEM; } @@ -692,11 +733,15 @@ int parse_events_add_pmu(struct parse_events_evlist *data, if (config_attr(&attr, head_config, data->error)) return -EINVAL; + if (get_config_terms(head_config, &config_terms)) + return -ENOMEM; + if (perf_pmu__config(pmu, &attr, head_config, data->error)) return -EINVAL; evsel = __add_event(list, &data->idx, &attr, - pmu_event_name(head_config), pmu->cpus); + pmu_event_name(head_config), pmu->cpus, + &config_terms); if (evsel) { evsel->unit = info.unit; evsel->scale = info.scale; -- cgit v0.10.2 From ee4c75887d12bcd6ecd897291797d969256f39ca Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 29 Jul 2015 05:42:11 -0400 Subject: perf tools: Force period term to overload global settings Currently the command line option settings beat the per-event period settings: With no global settings, we get the per-event configuration: $ perf record -e 'cpu/instructions,period=20000/' sleep 1 $ perf evlist -v ... { sample_period, sample_freq }: 20000 ... With the 'c' option period setup, we get the 'c' option value: $ perf record -e 'cpu/instructions,period=20000/' -c 1000 sleep 1 $ perf evlist -v ... { sample_period, sample_freq }: 1000 ... This patch makes the per-event settings overload the global 'c' option setup: $ perf record -e 'cpu/instructions,period=20000/' -c 1000 sleep 1 $ perf evlist -v ... { sample_period, sample_freq }: 20000 ... I think making the per-event settings overload any other config makes more sense than the current state. However, it breaks the current 'period' term handling, which might cause some noise... so let's see ;-). Also fix the parse events tests for the new behaviour. Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Kan Liang Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1438162936-59698-3-git-send-email-kan.liang@intel.com Signed-off-by: Kan Liang Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 63ee040..ac41350 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -46,7 +46,7 @@ OPTIONS /sys/bus/event_sources/devices//format/* There are also some params which are not defined in ...//format/*. - These params can be used to set event defaults. + These params can be used to overload default config values per event. Here is a list of the params. - 'period': Set event sampling period diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index d76963f..f65bb89 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -82,8 +82,12 @@ static int test__checkevent_symbolic_name_config(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->attr.type); TEST_ASSERT_VAL("wrong config", PERF_COUNT_HW_CPU_CYCLES == evsel->attr.config); + /* + * The period value gets configured within perf_evlist__config, + * while this test executes only parse events method.
+ */ TEST_ASSERT_VAL("wrong period", - 100000 == evsel->attr.sample_period); + 0 == evsel->attr.sample_period); TEST_ASSERT_VAL("wrong config1", 0 == evsel->attr.config1); TEST_ASSERT_VAL("wrong config2", @@ -406,7 +410,11 @@ static int test__checkevent_pmu(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong config", 10 == evsel->attr.config); TEST_ASSERT_VAL("wrong config1", 1 == evsel->attr.config1); TEST_ASSERT_VAL("wrong config2", 3 == evsel->attr.config2); - TEST_ASSERT_VAL("wrong period", 1000 == evsel->attr.sample_period); + /* + * The period value gets configured within perf_evlist__config, + * while this test executes only parse events method. + */ + TEST_ASSERT_VAL("wrong period", 0 == evsel->attr.sample_period); return 0; } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 048d61d..7d3acba 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -594,6 +594,8 @@ static void apply_config_terms(struct perf_event_attr *attr __maybe_unused, list_for_each_entry(term, config_terms, list) { switch (term->type) { + case PERF_EVSEL__CONFIG_TERM_PERIOD: + attr->sample_period = term->val.period; default: break; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 0339819..a7d2175 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -38,6 +38,7 @@ struct cgroup_sel; * perf_evsel::config_terms list head. */ enum { + PERF_EVSEL__CONFIG_TERM_PERIOD, PERF_EVSEL__CONFIG_TERM_MAX, }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 3271d13..09bee93 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -596,7 +596,6 @@ do { \ break; case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: CHECK_TYPE_VAL(NUM); - attr->sample_period = term->val.num; break; case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE: /* @@ -649,6 +648,8 @@ do { \ list_for_each_entry(term, head_config, list) { switch (term->type_term) { + case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: + ADD_CONFIG_TERM(PERIOD, period, term->val.num); default: break; } -- cgit v0.10.2 From c421e80b1073e2ed58842ee6ee493386ace2d9eb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 29 Jul 2015 05:42:12 -0400 Subject: perf tools: Introduce callgraph_set for callgraph option Introduce callgraph_set to indicate whether the callgraph option was set by user. 
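The underlying pattern, as a minimal sketch (simplified, hypothetical types; the real change to record_opts and the option callbacks is in the diff below):

	#include <stdbool.h>

	struct opts {
		bool callgraph_enabled;
		bool callgraph_set;	/* did the user say anything at all? */
	};

	/* Called only when --call-graph / --no-call-graph is on the command
	 * line, so setting the flag here lets later code tell an explicit
	 * user choice apart from the built-in default. */
	static void parse_callgraph_opt(struct opts *opts, bool unset)
	{
		opts->callgraph_set = true;
		opts->callgraph_enabled = !unset;
	}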
Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1438162936-59698-4-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 445a64d..f51131b 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -762,12 +762,14 @@ static void callchain_debug(void) callchain_param.dump_size); } -int record_parse_callchain_opt(const struct option *opt __maybe_unused, +int record_parse_callchain_opt(const struct option *opt, const char *arg, int unset) { int ret; + struct record_opts *record = (struct record_opts *)opt->value; + record->callgraph_set = true; callchain_param.enabled = !unset; /* --no-call-graph */ @@ -784,10 +786,13 @@ int record_parse_callchain_opt(const struct option *opt __maybe_unused, return ret; } -int record_callchain_opt(const struct option *opt __maybe_unused, +int record_callchain_opt(const struct option *opt, const char *arg __maybe_unused, int unset __maybe_unused) { + struct record_opts *record = (struct record_opts *)opt->value; + + record->callgraph_set = true; callchain_param.enabled = true; if (callchain_param.record_mode == CALLCHAIN_NONE) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index cf459f8..cccb4cf 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -52,6 +52,7 @@ struct record_opts { bool sample_weight; bool sample_time; bool sample_time_set; + bool callgraph_set; bool period; bool sample_intr_regs; bool running_time; -- cgit v0.10.2 From aa53c09e90a19c215549bd1ca970fddcb7c0c001 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 27 Jul 2015 20:24:17 +0200 Subject: perf tests: Adding build test for having ending double slash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pawel Moll reported a build issue caused by an extra slash (/) at the end of the prefix variable. $ make prefix=/usr/local/ CC tests/attr.o tests/attr.c: In function ‘test__attr’: tests/attr.c:168:50: error: expected ‘)’ before ‘;’ token snprintf(path_perf, PATH_MAX, "%s/perf", BINDIR); ^ tests/attr.c:176:1: error: expected ‘;’ before ‘}’ token } ^ tests/attr.c:176:1: error: control reaches end of non-void function [-Werror=return-type] } ^ cc1: all warnings being treated as errors Add an automated test case for this.
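A plausible mechanism for the error above (my reconstruction from the compiler output; the commit itself does not spell it out):

	/*
	 * With prefix=/usr/local/ the build ends up passing something like
	 * the (hypothetical) define
	 *
	 *	-DBINDIR="BUILD_STR(/usr/local//bin)"
	 *
	 * The preprocessor strips the '//...' tail as a comment before
	 * macro expansion, leaving BUILD_STR(/usr/local/ without its
	 * closing parenthesis, which matches the "expected ')'" failure
	 * reported at the snprintf() call site.
	 */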
Reported-by: Pawel Moll Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150727182417.GD20509@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 729112f..ba31c4b 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -58,7 +58,8 @@ make_install_man := install-man make_install_html := install-html make_install_info := install-info make_install_pdf := install-pdf -make_install_prefix := install prefix=/tmp/krava +make_install_prefix := install prefix=/tmp/krava +make_install_prefix_slash := install prefix=/tmp/krava/ make_static := LDFLAGS=-static # all the NO_* variable combined @@ -101,6 +102,7 @@ run += make_util_pmu_bison_o run += make_install run += make_install_bin run += make_install_prefix +run += make_install_prefix_slash # FIXME 'install-*' commented out till they're fixed # run += make_install_doc # run += make_install_man @@ -175,11 +177,14 @@ test_make_install_O := $(call test_dest_files,$(installed_files_all)) test_make_install_bin := $(call test_dest_files,$(installed_files_bin)) test_make_install_bin_O := $(call test_dest_files,$(installed_files_bin)) -# We prefix all installed files for make_install_prefix +# We prefix all installed files for make_install_prefix(_slash) # with '/tmp/krava' to match installed/prefix-ed files. installed_files_all_prefix := $(addprefix /tmp/krava/,$(installed_files_all)) -test_make_install_prefix := $(call test_dest_files,$(installed_files_all_prefix)) -test_make_install_prefix_O := $(call test_dest_files,$(installed_files_all_prefix)) +test_make_install_prefix := $(call test_dest_files,$(installed_files_all_prefix)) +test_make_install_prefix_O := $(call test_dest_files,$(installed_files_all_prefix)) + +test_make_install_prefix_slash := $(test_make_install_prefix) +test_make_install_prefix_slash_O := $(test_make_install_prefix_O) # FIXME nothing gets installed test_make_install_man := test -f $$TMP_DEST/share/man/man1/perf.1 -- cgit v0.10.2 From f231722a2b27ee99cbcd0c6bcf4c866612b78137 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:03 +0200 Subject: uprobes: Introduce get_uprobe() Cosmetic. Add the new trivial helper, get_uprobe(). It matches put_uprobe() we already have and we can simplify a couple of its users. 
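The idea, as a self-contained userspace analogue (hypothetical 'obj' type; the real helper is in the diff below):

	#include <stdatomic.h>

	struct obj {
		atomic_int ref;
	};

	/* Returning the argument lets callers grab a reference inline. */
	static struct obj *obj_get(struct obj *o)
	{
		atomic_fetch_add(&o->ref, 1);
		return o;
	}

	/* before: atomic_fetch_add(&o->ref, 1); owner = o;
	 * after:  owner = obj_get(o);
	 */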
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134003.GA4736@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cb346f2..a9847b4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -366,6 +366,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } +static struct uprobe *get_uprobe(struct uprobe *uprobe) +{ + atomic_inc(&uprobe->ref); + return uprobe; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + static int match_uprobe(struct uprobe *l, struct uprobe *r) { if (l->inode < r->inode) @@ -393,10 +405,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) while (n) { uprobe = rb_entry(n, struct uprobe, rb_node); match = match_uprobe(&u, uprobe); - if (!match) { - atomic_inc(&uprobe->ref); - return uprobe; - } + if (!match) + return get_uprobe(uprobe); if (match < 0) n = n->rb_left; @@ -432,10 +442,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) parent = *p; u = rb_entry(parent, struct uprobe, rb_node); match = match_uprobe(uprobe, u); - if (!match) { - atomic_inc(&u->ref); - return u; - } + if (!match) + return get_uprobe(u); if (match < 0) p = &parent->rb_left; @@ -472,12 +480,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static void put_uprobe(struct uprobe *uprobe) -{ - if (atomic_dec_and_test(&uprobe->ref)) - kfree(uprobe); -} - static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe, *cur_uprobe; @@ -1039,14 +1041,14 @@ static void build_probe_list(struct inode *inode, if (u->inode != inode || u->offset < min) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } } spin_unlock(&uprobes_treelock); @@ -1437,7 +1439,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; - atomic_inc(&n->uprobe->ref); + get_uprobe(n->uprobe); n->next = NULL; *p = n; @@ -1565,8 +1567,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - atomic_inc(&uprobe->ref); - ri->uprobe = uprobe; + ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; -- cgit v0.10.2 From 2bb5e840e873f8778a41801141771f54f547fa65 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:06 +0200 Subject: uprobes: Introduce free_ret_instance() We can simplify uprobe_free_utask() and handle_uretprobe_chain() if we add a simple helper which does put_uprobe/kfree and returns the ->next return_instance. 
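The simplification pattern, as a self-contained userspace analogue (hypothetical 'node' type, not the kernel code):

	#include <stdlib.h>

	struct node {
		struct node *next;
	};

	/* Free one element and hand back its successor... */
	static struct node *free_node(struct node *n)
	{
		struct node *next = n->next;

		free(n);
		return next;
	}

	/* ...so the "tmp = ri; ri = ri->next; free(tmp);" dance becomes: */
	static void free_list(struct node *n)
	{
		while (n)
			n = free_node(n);
	}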
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134006.GA4740@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a9847b4..d8c702f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1378,6 +1378,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } +static struct return_instance *free_ret_instance(struct return_instance *ri) +{ + struct return_instance *next = ri->next; + put_uprobe(ri->uprobe); + kfree(ri); + return next; +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. @@ -1385,7 +1393,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; if (!utask) return; @@ -1394,13 +1402,8 @@ void uprobe_free_utask(struct task_struct *t) put_uprobe(utask->active_uprobe); ri = utask->return_instances; - while (ri) { - tmp = ri; - ri = ri->next; - - put_uprobe(tmp->uprobe); - kfree(tmp); - } + while (ri) + ri = free_ret_instance(ri); xol_free_insn_slot(t); kfree(utask); @@ -1770,7 +1773,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) static bool handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; bool chained; utask = current->utask; @@ -1792,11 +1795,7 @@ static bool handle_trampoline(struct pt_regs *regs) handle_uretprobe_chain(ri, regs); chained = ri->chained; - put_uprobe(ri->uprobe); - - tmp = ri; - ri = ri->next; - kfree(tmp); + ri = free_ret_instance(ri); utask->depth--; if (!chained) -- cgit v0.10.2 From 0b5256c7f173258b19d98364adb57f707dda22f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:08 +0200 Subject: uprobes: Send SIGILL if handle_trampoline() fails 1. It doesn't make sense to continue if handle_trampoline() fails, change handle_swbp() to always return after this call. 2. Turn pr_warn() into uprobe_warn(), and change handle_trampoline() to send SIGILL on failure. It is pointless to return to user mode with the corrupted instruction_pointer() which we can't restore. 
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134008.GA4745@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d8c702f..eabdc21 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1770,7 +1770,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } -static bool handle_trampoline(struct pt_regs *regs) +static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri; @@ -1778,11 +1778,11 @@ static bool handle_trampoline(struct pt_regs *regs) utask = current->utask; if (!utask) - return false; + goto sigill; ri = utask->return_instances; if (!ri) - return false; + goto sigill; /* * TODO: we should throw out return_instance's invalidated by @@ -1804,8 +1804,12 @@ static bool handle_trampoline(struct pt_regs *regs) } utask->return_instances = ri; + return; + + sigill: + uprobe_warn(current, "handle uretprobe, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); - return true; } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -1824,13 +1828,8 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) { - if (handle_trampoline(regs)) - return; - - pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); - } + if (bp_vaddr == get_trampoline_vaddr()) + return handle_trampoline(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { -- cgit v0.10.2 From 6c58d0e4cc26ea8882928e64c0de9afed4fc37cb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:10 +0200 Subject: uprobes: Change prepare_uretprobe() to use uprobe_warn() Turn the last pr_warn() in uprobes.c into uprobe_warn(). While at it: - s/kzalloc/kmalloc, we initialize every member of 'ri' - remove the pointless comment above the obvious code Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134010.GA4752@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eabdc21..4c941fe 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1541,9 +1541,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } - ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - goto fail; + return; trampoline_vaddr = get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); @@ -1561,8 +1561,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) * This situation is not possible. Likely we have an * attack from user-space. 
*/ - pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); + uprobe_warn(current, "handle tail call"); goto fail; } @@ -1576,13 +1575,10 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->chained = chained; utask->depth++; - - /* add instance to the stack */ ri->next = utask->return_instances; utask->return_instances = ri; return; - fail: kfree(ri); } -- cgit v0.10.2 From a83cfeb92132c279b20bbc8ed3cef833b0fe417e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:13 +0200 Subject: uprobes: Change handle_trampoline() to find the next chain beforehand No functional changes, preparation. Add the new helper, find_next_ret_chain(), which finds the first !chained entry and returns its ->next. Yes, it is suboptimal. We probably want to turn ->chained into a ->start_of_this_chain pointer and avoid another loop. But this needs the boring changes in dup_utask(), so let's do this later. Change the main loop in handle_trampoline() to unwind the stack until ri is equal to the pointer returned by this new helper. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134013.GA4755@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4c941fe..98e4d97 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1766,11 +1766,22 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } +static struct return_instance *find_next_ret_chain(struct return_instance *ri) +{ + bool chained; + + do { + chained = ri->chained; + ri = ri->next; /* can't be NULL if chained */ + } while (chained); + + return ri; +} + static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri; - bool chained; + struct return_instance *ri, *next; utask = current->utask; if (!utask) @@ -1780,24 +1791,18 @@ static void handle_trampoline(struct pt_regs *regs) goto sigill; + next = find_next_ret_chain(ri); /* * TODO: we should throw out return_instance's invalidated by * longjmp(), currently we assume that the probed function always * returns. */ instruction_pointer_set(regs, ri->orig_ret_vaddr); - - for (;;) { + do { handle_uretprobe_chain(ri, regs); - - chained = ri->chained; ri = free_ret_instance(ri); utask->depth--; - - if (!chained) - break; - BUG_ON(!ri); - } + } while (ri != next); utask->return_instances = ri; return; -- cgit v0.10.2 From 97da89767d398c1dfa1f34e5f312eb8ebb382f7f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:16 +0200 Subject: uprobes: Export 'struct return_instance', introduce arch_uretprobe_is_alive() Add the new "weak" helper, arch_uretprobe_is_alive(), used by the next patches. It should return true if this return_instance is still valid. The arch-agnostic version just always returns true. The patch exports "struct return_instance" for the architectures which want to override this hook. We can also clean up prepare_uretprobe() if we pass the new return_instance to arch_uretprobe_hijack_return_addr().
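For readers unfamiliar with the mechanism, here is a sketch of the weak-hook pattern this patch sets up (simplified; in the kernel, __weak expands to __attribute__((weak))):

	/* generic code (kernel/events/uprobes.c): the permissive default */
	bool __weak arch_uretprobe_is_alive(struct return_instance *ret,
					    struct pt_regs *regs)
	{
		return true;	/* no arch knowledge: treat every frame as alive */
	}

	/*
	 * An architecture can then provide a strong definition in its own
	 * files, which the linker prefers over the weak one; the x86
	 * version added two patches below boils down to:
	 *
	 *	return regs->sp <= ret->stack;
	 */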
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134016.GA4762@redhat.com Signed-off-by: Ingo Molnar diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 60beb5d..50d2764 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -92,6 +92,15 @@ struct uprobe_task { unsigned int depth; }; +struct return_instance { + struct uprobe *uprobe; + unsigned long func; + unsigned long orig_ret_vaddr; /* original return address */ + bool chained; /* true, if instance is nested */ + + struct return_instance *next; /* keep as stack */ +}; + struct xol_area; struct uprobes_state { @@ -128,6 +137,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); +extern bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs); extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 98e4d97..1c71b62 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -86,15 +86,6 @@ struct uprobe { struct arch_uprobe arch; }; -struct return_instance { - struct uprobe *uprobe; - unsigned long func; - unsigned long orig_ret_vaddr; /* original return address */ - bool chained; /* true, if instance is nested */ - - struct return_instance *next; /* keep as stack */ -}; - /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -1818,6 +1809,11 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +{ + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. -- cgit v0.10.2 From 7b868e4802a86d867aad1be0471b5767d9c20e10 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:18 +0200 Subject: uprobes/x86: Reimplement arch_uretprobe_is_alive() Add the x86 specific version of arch_uretprobe_is_alive() helper. It returns true if the stack frame mangled by prepare_uretprobe() is still on stack. So if it returns false, we know that the probed function has already returned. We add the new return_instance->stack member and change the generic code to initialize it in prepare_uretprobe, but it should be equally useful for other architectures. TODO: this assumes that the probed application can't use multiple stacks (say sigaltstack). We will try to improve this logic later. 
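A short worked example may help here (my illustration, with made-up addresses; the x86 check itself is the one-liner in the diff above):

	/*
	 * x86 stacks grow down.  Suppose prepare_uretprobe() ran when the
	 * probed function was entered with a user stack pointer of
	 * 0x7fff1000, so ret->stack == 0x7fff1000.  While that frame or
	 * any of its callees is still executing, the stack pointer can
	 * only sit at or below that value, i.e. regs->sp <= 0x7fff1000,
	 * and the return_instance is alive.  Once the function has
	 * returned (e.g. via longjmp()), the stack pointer has moved back
	 * above the frame, regs->sp > 0x7fff1000, and the instance is dead.
	 */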
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134018.GA4766@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6647624..58e9b84 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -985,3 +985,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return -1; } + +bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +{ + return regs->sp <= ret->stack; +} diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 50d2764..7ab6d2c 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -95,6 +95,7 @@ struct uprobe_task { struct return_instance { struct uprobe *uprobe; unsigned long func; + unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ bool chained; /* true, if instance is nested */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1c71b62..c5f316e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1562,6 +1562,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); + ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; -- cgit v0.10.2 From 5eeb50de42fd3251845d03c556db012267c72b3f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:21 +0200 Subject: uprobes: Change handle_trampoline() to flush the frames invalidated by longjmp() Test-case:

	#include <stdio.h>
	#include <setjmp.h>

	jmp_buf jmp;

	void func_2(void)
	{
		longjmp(jmp, 1);
	}

	void func_1(void)
	{
		if (setjmp(jmp))
			return;
		func_2();
		printf("ERR!! I am running on the caller's stack\n");
	}

	int main(void)
	{
		func_1();
		return 0;
	}

This test-case fails if you probe func_1() and func_2() because handle_trampoline() assumes that the probed function must return and hit the bp installed by prepare_uretprobe(). But in this case func_2() does not return, so when func_1() returns the kernel uses the no longer valid return_instance of func_2(). Change handle_trampoline() to unwind ->return_instances until we know that the next chain is alive or NULL; this ensures that the current chain is the last we need to report and free. Alternatively, every return_instance could use a unique trampoline_vaddr; in this case we could use it as a key. And this could solve the problem with sigaltstack() automatically. But this approach needs more changes, and it puts the "hard" limit on MAX_URETPROBE_DEPTH. Plus it cannot solve another problem partially fixed by the next patch. Note: this change has no effect on !x86, the arch-agnostic version of arch_uretprobe_is_alive() just returns "true". TODO: as documented by the previous change, arch_uretprobe_is_alive() can be fooled by sigaltstack/etc.
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134021.GA4773@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5f316e..93d939c8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1774,6 +1774,7 @@ static void handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *next; + bool valid; utask = current->utask; if (!utask) @@ -1783,18 +1784,24 @@ static void handle_trampoline(struct pt_regs *regs) if (!ri) goto sigill; - next = find_next_ret_chain(ri); - /* - * TODO: we should throw out return_instance's invalidated by - * longjmp(), currently we assume that the probed function always - * returns. - */ - instruction_pointer_set(regs, ri->orig_ret_vaddr); do { - handle_uretprobe_chain(ri, regs); - ri = free_ret_instance(ri); - utask->depth--; - } while (ri != next); + /* + * We should throw out the frames invalidated by longjmp(). + * If this chain is valid, then the next one should be alive + * or NULL; the latter case means that nobody but ri->func + * could hit this trampoline on return. TODO: sigaltstack(). + */ + next = find_next_ret_chain(ri); + valid = !next || arch_uretprobe_is_alive(next, regs); + + instruction_pointer_set(regs, ri->orig_ret_vaddr); + do { + if (valid) + handle_uretprobe_chain(ri, regs); + ri = free_ret_instance(ri); + utask->depth--; + } while (ri != next); + } while (!valid); utask->return_instances = ri; return; -- cgit v0.10.2 From a5b7e1a89b820f2b9b23634ca4c59b555e8d9a0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:23 +0200 Subject: uprobes: Change prepare_uretprobe() to (try to) flush the dead frames Change prepare_uretprobe() to flush the !arch_uretprobe_is_alive() return_instance's. This is not needed correctness-wise, but can help to avoid the failure caused by MAX_URETPROBE_DEPTH. Note: in this case arch_uretprobe_is_alive() can be false positive, the stack can grow after longjmp(). Unfortunately, the kernel can't 100% solve this problem, but see the next patch. 
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134023.GA4776@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 93d939c8..7e61c8c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1511,6 +1511,16 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } +static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) +{ + struct return_instance *ri = utask->return_instances; + while (ri && !arch_uretprobe_is_alive(ri, regs)) { + ri = free_ret_instance(ri); + utask->depth--; + } + utask->return_instances = ri; +} + static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) { struct return_instance *ri; @@ -1541,6 +1551,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) if (orig_ret_vaddr == -1) goto fail; + /* drop the entries invalidated by longjmp() */ + cleanup_return_instances(utask, regs); + /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent -- cgit v0.10.2 From 86dcb702e74b8ab7d3b2d36984ef00671cea73b9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:26 +0200 Subject: uprobes: Add the "enum rp_check ctx" arg to arch_uretprobe_is_alive() arch/x86 doesn't care (so far), but as Pratyush Anand pointed out, other architectures might want to know why arch_uretprobe_is_alive() was called and use different checks depending on the context. Add the new argument to distinguish the two callers. Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134026.GA4779@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 58e9b84..acf8b90 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -986,7 +986,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return -1; } -bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) { return regs->sp <= ret->stack; } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 7ab6d2c..c0a5402 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,6 +102,11 @@ struct return_instance { struct return_instance *next; /* keep as stack */ }; +enum rp_check { + RP_CHECK_CALL, + RP_CHECK_RET, +}; + struct xol_area; struct uprobes_state { @@ -138,7 +143,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); -extern bool arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs); +extern bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs); extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void
arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7e61c8c..df5661a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1514,7 +1514,9 @@ static unsigned long get_trampoline_vaddr(void) static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances; - while (ri && !arch_uretprobe_is_alive(ri, regs)) { + enum rp_check ctx = RP_CHECK_CALL; + + while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri = free_ret_instance(ri); utask->depth--; } @@ -1805,7 +1807,7 @@ static void handle_trampoline(struct pt_regs *regs) * could hit this trampoline on return. TODO: sigaltstack(). */ next = find_next_ret_chain(ri); - valid = !next || arch_uretprobe_is_alive(next, regs); + valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { @@ -1830,7 +1832,8 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } -bool __weak arch_uretprobe_is_alive(struct return_instance *ret, struct pt_regs *regs) +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) { return true; } -- cgit v0.10.2 From db087ef69a2b155ae001665bf0b3806abde7ee34 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:28 +0200 Subject: uprobes/x86: Make arch_uretprobe_is_alive(RP_CHECK_CALL) more clever The previous change documents that cleanup_return_instances() can't always detect the dead frames: the stack can grow. But there is one special case which imho is worth fixing: arch_uretprobe_is_alive() can return true when the stack didn't actually grow, but the next "call" insn uses the already invalidated frame. Test-case:

	#include <setjmp.h>

	jmp_buf jmp;
	int nr = 1024;

	void func_2(void)
	{
		if (--nr == 0)
			return;
		longjmp(jmp, 1);
	}

	void func_1(void)
	{
		setjmp(jmp);
		func_2();
	}

	int main(void)
	{
		func_1();
		return 0;
	}

If you ret-probe func_1() and func_2(), prepare_uretprobe() hits the MAX_URETPROBE_DEPTH limit and the "return" from func_2() is not reported. When we know that the new call is not chained, we can do the more strict check. In this case "sp" points to the new ret-addr, so every frame which uses the same "sp" must be dead. The only complication is that arch_uretprobe_is_alive() needs to know whether it was chained or not, so we add the new RP_CHECK_CHAIN_CALL enum and change prepare_uretprobe() to pass RP_CHECK_CALL only if !chained. Note: arch_uretprobe_is_alive() could also re-read *sp and check if this word is still trampoline_vaddr. This could obviously improve the logic, but I would like to avoid another copy_from_user(), especially in the case when we can't avoid the false "alive == T" positives.
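To see why the stricter comparison is safe for non-chained calls, consider this sketch (my restatement of the reasoning above; the actual two-line check is in the diff below):

	/*
	 * At the time of a new, non-chained call (RP_CHECK_CALL), the CPU
	 * has just pushed the return address, so regs->sp points at the
	 * slot belonging to the *new* frame.  An older return_instance
	 * whose recorded ret->stack equals this regs->sp would occupy
	 * that very slot, so it must be dead: hence "regs->sp < ret->stack".
	 * On return (RP_CHECK_RET) and for chained calls the frame being
	 * checked may legitimately sit at the same sp, so "<=" is kept.
	 */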
Tested-by: Pratyush Anand Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Anton Arapov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134028.GA4786@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index acf8b90..bf4db6e 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -989,5 +989,8 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { - return regs->sp <= ret->stack; + if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */ + return regs->sp < ret->stack; + else + return regs->sp <= ret->stack; } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index c0a5402..0bdc72f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -104,6 +104,7 @@ struct return_instance { enum rp_check { RP_CHECK_CALL, + RP_CHECK_CHAIN_CALL, RP_CHECK_RET, }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index df5661a..0f370ef 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1511,10 +1511,11 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } -static void cleanup_return_instances(struct uprobe_task *utask, struct pt_regs *regs) +static void cleanup_return_instances(struct uprobe_task *utask, bool chained, + struct pt_regs *regs) { struct return_instance *ri = utask->return_instances; - enum rp_check ctx = RP_CHECK_CALL; + enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri = free_ret_instance(ri); @@ -1528,7 +1529,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) struct return_instance *ri; struct uprobe_task *utask; unsigned long orig_ret_vaddr, trampoline_vaddr; - bool chained = false; + bool chained; if (!get_xol_area()) return; @@ -1554,14 +1555,15 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) goto fail; /* drop the entries invalidated by longjmp() */ - cleanup_return_instances(utask, regs); + chained = (orig_ret_vaddr == trampoline_vaddr); + cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ - if (orig_ret_vaddr == trampoline_vaddr) { + if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an @@ -1570,8 +1572,6 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) uprobe_warn(current, "handle tail call"); goto fail; } - - chained = true; orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } -- cgit v0.10.2 From f58bea2fec63db72f8050ade709358257e9102ab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:31 +0200 Subject: uprobes: Fix the usage of install_special_mapping() install_special_mapping(pages) expects that "pages" is the zero- terminated array while xol_add_vma() passes &area->page, this means that special_mapping_fault() can wrongly use the next member in xol_area (vaddr) as "struct page *". Fortunately, this area is not expandable so pgoff != 0 isn't possible (modulo bugs in special_mapping_vmops), but still this does not look good. 
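The convention at issue, as a minimal kernel-style sketch (the actual fix is in the diff below):

	/*
	 * install_special_mapping() walks its 'pages' argument until it
	 * finds a NULL entry, so the caller must pass a NULL-terminated
	 * array.  Passing &area->page (a single pointer) made the walker
	 * read whatever field followed it in struct xol_area.
	 */
	struct page *pages[2];

	pages[0] = alloc_page(GFP_HIGHUSER);
	pages[1] = NULL;	/* the terminator the old code was missing */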
Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134031.GA4789@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0f370ef..4b8ac5f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -99,7 +99,7 @@ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ - struct page *page; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma @@ -1142,7 +1142,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, area->pages); if (ret) goto fail; @@ -1168,21 +1168,22 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) + area->pages[0] = alloc_page(GFP_HIGHUSER); + if (!area->pages[0]) goto free_bitmap; + area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; - __free_page(area->page); + __free_page(area->pages[0]); free_bitmap: kfree(area->bitmap); free_area: @@ -1220,7 +1221,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->page); + put_page(area->pages[0]); kfree(area->bitmap); kfree(area); } @@ -1289,7 +1290,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - arch_uprobe_copy_ixol(area->page, xol_vaddr, + arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; -- cgit v0.10.2 From 704bde3cc26a4cb34386c164107b59e09745a022 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:33 +0200 Subject: uprobes: Use vm_special_mapping to name the XOL vma Change xol_add_vma() to use _install_special_mapping(), this way we can name the vma installed by uprobes. Currently it looks like private anonymous mapping, this is confusing and complicates the debugging. With this change /proc/$pid/maps reports "[uprobes]". As a side effect this will cause core dumps to include the XOL vma and I think this is good; this can help to debug the problem if the app crashed because it was probed. Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134033.GA4796@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4b8ac5f..2d5b7bd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -96,17 +96,18 @@ struct uprobe { * allocated. 
*/ struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ - unsigned long *bitmap; /* 0 = free slot */ - struct page *pages[2]; + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct vm_special_mapping xol_mapping; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ - unsigned long vaddr; /* Page(s) of instruction slots */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; /* @@ -1125,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - int ret = -EALREADY; + struct vm_area_struct *vma; + int ret; down_write(&mm->mmap_sem); - if (mm->uprobes_state.xol_area) + if (mm->uprobes_state.xol_area) { + ret = -EALREADY; goto fail; + } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ @@ -1141,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } } - ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, area->pages); - if (ret) + vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + &area->xol_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto fail; + } + ret = 0; smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; fail: @@ -1168,6 +1176,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; + area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) goto free_bitmap; -- cgit v0.10.2 From 2a742cedcf13572999436676cbe36c3a9b733b0f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 15:40:36 +0200 Subject: uprobes: Fix the waitqueue_active() check in xol_free_insn_slot() The xol_free_insn_slot()->waitqueue_active() check is buggy. We need mb() after we set the conditon for wait_event(), or xol_take_insn_slot() can miss the wakeup. Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150721134036.GA4799@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2d5b7bd..4e5e979 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1337,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) clear_bit(slot_nr, area->bitmap); atomic_dec(&area->slot_count); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); -- cgit v0.10.2 From 3a2a7797326a4bc59b7ff0cc92c8b274abf21892 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Tue, 26 May 2015 11:47:39 -0700 Subject: perf/x86/intel/rapl: Add support for Knights Landing (KNL) Knights Landing DRAM RAPL supports PKG and DRAM RAPL domains. DRAM RAPL has a different fixed energy unit (2^-16J) similar to that of HSW. Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Peter Zijlstra (Intel) Acked-by: Stephane Eranian Acked-by: Jacob Pan Cc: Arnaldo Carvalho de Melo Cc: H. 
Peter Anvin Cc: Jacob Pan Jun Cc: Linus Torvalds Cc: Nikhil Rao Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/aa63b4a3af3160152fea1a10c807f4200527280c.1432665809.git.dasaratharaman.chandramouli@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 5cbd4e6..81431c0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -86,6 +86,10 @@ static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 1<config * any other bit is reserved @@ -486,6 +490,18 @@ static struct attribute *rapl_events_hsw_attr[] = { NULL, }; +static struct attribute *rapl_events_knl_attr[] = { + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), NULL, }; + static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = NULL, /* patched at runtime */ @@ -730,6 +746,11 @@ static int __init rapl_pmu_init(void) rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; + case 87: /* Knights Landing */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_KNL; + rapl_pmu_events_group.attrs = rapl_events_knl_attr; + break; default: /* unsupported */ -- cgit v0.10.2 From 070a7cdfa4a0a799235d79e58e7b0b2d94dff190 Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Fri, 17 Jul 2015 10:57:59 +0530 Subject: perf/x86/intel/uncore: Remove use of macro DEFINE_PCI_DEVICE_TABLE() The DEFINE_PCI_DEVICE_TABLE() macro is deprecated. Use 'struct pci_device_id' instead of DEFINE_PCI_DEVICE_TABLE(), with the goal of getting rid of this macro completely. This Coccinelle semantic patch performs this transformation: @@ identifier a; declarer name DEFINE_PCI_DEVICE_TABLE; initializer i; @@ - DEFINE_PCI_DEVICE_TABLE(a) + const struct pci_device_id a[] = i; Signed-off-by: Vaishali Thakkar Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150717052759.GA6265@vaishali-Ideapad-Z570 Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 6d6e85d..76a3feb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -2215,7 +2215,7 @@ static struct intel_uncore_type *hswep_pci_uncores[] = { NULL, }; -static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = { +static const struct pci_device_id hswep_uncore_pci_ids[] = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), -- cgit v0.10.2 From e3a13192d86048e91a2a1d534abe5ac2397d9113 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 14 Jun 2015 22:57:40 -0700 Subject: perf/x86/intel/uncore: Add support for ARB uncore PMU on Sandy/IvyBridge Add a new "ARB" uncore PMU that is used to monitor the uncore queue arbiter. This is useful to measure uncore queue occupancy and similar statistics. The registers all have the same format as the existing CBOX PMU. Also move the event constraints from the CBOX to the ARB. The 0x80+ events are ARB events and cannot be scheduled on a CBOX PMU.
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1434347862-28490-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index b005a78..f78574b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -45,6 +45,11 @@ #define SNB_UNC_CBO_0_PER_CTR0 0x706 #define SNB_UNC_CBO_MSR_OFFSET 0x10 +/* SNB ARB register */ +#define SNB_UNC_ARB_PER_CTR0 0x3b0 +#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2 +#define SNB_UNC_ARB_MSR_OFFSET 0x10 + /* NHM global control register */ #define NHM_UNC_PERF_GLOBAL_CTL 0x391 #define NHM_UNC_FIXED_CTR 0x394 @@ -115,7 +120,7 @@ static struct intel_uncore_ops snb_uncore_msr_ops = { .read_counter = uncore_msr_read_counter, }; -static struct event_constraint snb_uncore_cbox_constraints[] = { +static struct event_constraint snb_uncore_arb_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x80, 0x1), UNCORE_EVENT_CONSTRAINT(0x83, 0x1), EVENT_CONSTRAINT_END @@ -134,14 +139,28 @@ static struct intel_uncore_type snb_uncore_cbox = { .single_fixed = 1, .event_mask = SNB_UNC_RAW_EVENT_MASK, .msr_offset = SNB_UNC_CBO_MSR_OFFSET, - .constraints = snb_uncore_cbox_constraints, .ops = &snb_uncore_msr_ops, .format_group = &snb_uncore_format_group, .event_descs = snb_uncore_events, }; +static struct intel_uncore_type snb_uncore_arb = { + .name = "arb", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = SNB_UNC_ARB_PER_CTR0, + .event_ctl = SNB_UNC_ARB_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_ARB_MSR_OFFSET, + .constraints = snb_uncore_arb_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + static struct intel_uncore_type *snb_msr_uncores[] = { &snb_uncore_cbox, + &snb_uncore_arb, NULL, }; -- cgit v0.10.2 From 3a999587b4a1815cf4dadddf6b5aad470f048239 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 14 Jun 2015 22:57:41 -0700 Subject: perf/x86/intel/uncore: Use Sandy Bridge client PMU on Haswell/Broadwell Haswell and Broadwell have the same uncore CBOX/ARB PMU as Sandy Bridge. Add the respective model numbers to enable the SNB uncore PMU. 
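The model numbers in the switch below come from CPUID leaf 1; for family 6 parts the displayed model combines the basic and extended model fields. A small user-space sketch (GCC/Clang on x86):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int family, model;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	family = (eax >> 8) & 0xf;
	model  = (eax >> 4) & 0xf;
	/* Family 6/15 parts extend the model with bits 19:16. */
	if (family == 6 || family == 15)
		model |= ((eax >> 16) & 0xf) << 4;

	printf("family %u, model %u\n", family, model); /* 6, 60 on a Haswell desktop */
	return 0;
}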
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1434347862-28490-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 21b5e38..c2af967 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1209,6 +1209,11 @@ static int __init uncore_cpu_init(void) break; case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + case 69: /* Haswell */ + case 70: /* Haswell */ + case 61: /* Broadwell */ + case 71: /* Broadwell */ snb_uncore_cpu_init(); break; case 45: /* Sandy Bridge-EP */ -- cgit v0.10.2 From e9b3bd379c283577e102529bfb22484238be7c91 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 24 Jun 2015 13:05:48 +0300 Subject: perf/x86/intel/bts: Drop redundant declarations Both intel_pmu_enable_bts() and intel_pmu_disable_bts() are in perf_event.h header file, no need to have them declared again in the driver. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1435140349-32588-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index 43dd672..54690e8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -62,9 +62,6 @@ struct bts_buffer { struct pmu bts_pmu; -void intel_pmu_enable_bts(u64 config); -void intel_pmu_disable_bts(void); - static size_t buf_size(struct page *page) { return 1 << (PAGE_SHIFT + page_private(page)); -- cgit v0.10.2 From c749b3e96398fcc39286267b72fb8b85c0f757ea Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 24 Jun 2015 13:05:49 +0300 Subject: perf/x86/intel/lbr: Kill off intel_pmu_needs_lbr_smpl for good The x86_lbr_exclusive commit (4807034248be "perf/x86: Mark Intel PT and LBR/BTS as mutually exclusive") mistakenly moved intel_pmu_needs_lbr_smpl() to perf_event.h, while another commit (a46a2300019 "perf: Simplify the branch stack check") removed it in favor of needs_branch_stack(). This patch gets rid of intel_pmu_needs_lbr_smpl() for good. 
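The surviving check is far simpler than the removed helper; roughly, needs_branch_stack() of this era reduces to a single attribute test (paraphrased sketch, not the verbatim kernel source):

#include <stdbool.h>
#include <stdint.h>

/* Branch sampling is needed iff the user set any of the
 * PERF_SAMPLE_BRANCH_* selector bits in attr.branch_sample_type.
 */
static bool needs_branch_stack(uint64_t branch_sample_type)
{
	return branch_sample_type != 0;
}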
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1435140349-32588-3-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3e7fd27..3474cf2 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -808,20 +808,6 @@ static inline int amd_pmu_init(void) #ifdef CONFIG_CPU_SUP_INTEL -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) -{ - /* user explicitly requested branch sampling */ - if (has_branch_stack(event)) - return true; - - /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) - return true; - - return false; -} - static inline bool intel_pmu_has_bts(struct perf_event *event) { if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && -- cgit v0.10.2 From ae3f011fc25104a218caf4448b1d47ef1c9b3a42 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 24 Jun 2015 11:23:35 -0700 Subject: perf/x86/intel: Fix SLM MSR_OFFCORE_RSP1 valid_mask AVG_LATENCY(bit 38) is only available on MSR_OFFCORE_RSP0. So the bit should be removed from RSP1 valid_mask. Since RSP0 and RSP1 may have different valid_mask, intel_alt_er should validate the config on the alternate offcore reg before replacing it. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435170215-5017-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index b9826a9..71815cf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1114,7 +1114,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1), EVENT_EXTRA_END }; @@ -1699,18 +1699,22 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static int intel_alt_er(int idx) +static int intel_alt_er(int idx, u64 config) { + int alt_idx; if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) return idx; if (idx == EXTRA_REG_RSP_0) - return EXTRA_REG_RSP_1; + alt_idx = EXTRA_REG_RSP_1; if (idx == EXTRA_REG_RSP_1) - return EXTRA_REG_RSP_0; + alt_idx = EXTRA_REG_RSP_0; - return idx; + if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) + return idx; + + return alt_idx; } static void intel_fixup_er(struct perf_event *event, int idx) @@ -1799,7 +1803,7 @@ again: */ c = NULL; } else { - idx = intel_alt_er(idx); + idx = intel_alt_er(idx, reg->config); if (idx != reg->idx) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; -- cgit v0.10.2 From e5779e8e12299f77c2421a707855d8d124171d85 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Jul 2015 20:32:40 -0700 Subject: perf/x86/hw_breakpoints: Disallow kernel breakpoints unless kprobe-safe Code on the kprobe blacklist doesn't want unexpected int3 exceptions. It probably doesn't want unexpected debug exceptions either. 
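The effect is visible from user space: an execution breakpoint on a kernel address now fails at perf_event_open() time unless the address is kprobe-safe. A minimal probe of the new policy (the kernel address below is a placeholder, and root or equivalent privileges are needed for kernel breakpoints anyway):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.type    = PERF_TYPE_BREAKPOINT;
	attr.size    = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_X;
	attr.bp_addr = 0xffffffff81000000UL; /* placeholder kernel text address */
	attr.bp_len  = sizeof(long);         /* required length for execute breakpoints */

	/* With this patch, expect -EINVAL unless the address is kprobe-safe. */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		perror("perf_event_open");
	else
		close(fd);
	return 0;
}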
Be safe: disallow breakpoints in nokprobes code. On non-CONFIG_KPROBES kernels, there is no kprobe blacklist. In that case, disallow kernel breakpoints entirely. It will be particularly important to keep hw breakpoints out of the entry and NMI code once we move debug exceptions off the IST stack. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Brian Gerst Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e14b152af99640448d895e3c2a8c2d5ee19a1325.1438312874.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 7114ba2..78f3e90 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -243,6 +244,20 @@ static int arch_build_bp_info(struct perf_event *bp) info->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: + /* + * We don't allow kernel breakpoints in places that are not + * acceptable for kprobes. On non-kprobes kernels, we don't + * allow kernel breakpoints at all. + */ + if (bp->attr.bp_addr >= TASK_SIZE_MAX) { +#ifdef CONFIG_KPROBES + if (within_kprobe_blacklist(bp->attr.bp_addr)) + return -EINVAL; +#else + return -EINVAL; +#endif + } + info->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1ab5475..8f68490 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -267,6 +267,8 @@ extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); extern bool arch_within_kprobe_blacklist(unsigned long addr); +extern bool within_kprobe_blacklist(unsigned long addr); + struct kprobe_insn_cache { struct mutex mutex; void *(*alloc)(void); /* allocate insn page */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c90e417..d10ab6b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static bool within_kprobe_blacklist(unsigned long addr) +bool within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; -- cgit v0.10.2 From ab513927ab449af00cc70b0269e15ee80dd537f9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Jul 2015 20:32:41 -0700 Subject: perf/x86/hw_breakpoints: Improve range breakpoint validation Range breakpoints will do the wrong thing if the address isn't aligned. While we're there, add comments about why it's safe for instruction breakpoints. 
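The added validation boils down to two arithmetic tests; a stand-alone sketch:

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the AMD range-breakpoint checks described above: the length
 * must be a power of two, and the address must be aligned to it, i.e.
 * addr & (len - 1) must be zero. The alignment requirement also keeps
 * addr + len - 1 from overflowing, as a later patch in this series notes.
 */
static bool range_bp_valid(uint64_t addr, uint64_t len)
{
	bool pow2 = len && !(len & (len - 1)); /* is_power_of_2() */

	return pow2 && !(addr & (len - 1));
}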
Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Brian Gerst Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ae25d14d61f2f43b78e0a247e469f3072df7e201.1438312874.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 78f3e90..6f345d3 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -291,8 +291,18 @@ static int arch_build_bp_info(struct perf_event *bp) break; #endif default: + /* AMD range breakpoint */ if (!is_power_of_2(bp->attr.bp_len)) return -EINVAL; + if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) + return -EINVAL; + /* + * It's impossible to use a range breakpoint to fake out + * user vs kernel detection because bp_len - 1 can't + * have the high bit set. If we ever allow range instruction + * breakpoints, then we'll have to check for kprobe-blacklisted + * addresses anywhere in the range. + */ if (!cpu_has_bpext) return -EOPNOTSUPP; info->mask = bp->attr.bp_len - 1; -- cgit v0.10.2 From 27747f8bc355a2808ca9e490ab6866acd85b4c16 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Jul 2015 20:32:42 -0700 Subject: perf/x86/hw_breakpoints: Fix check for kernel-space breakpoints The check looked wrong, although I think it was actually safe. TASK_SIZE is unnecessarily small for compat tasks, and it wasn't possible to make a range breakpoint so large it started in user space and ended in kernel space. Nonetheless, let's fix up the check for the benefit of future readers. A breakpoint is in the kernel if either end is in the kernel. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Brian Gerst Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/136be387950e78f18cea60e9d1bef74465d0ee8f.1438312874.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 6f345d3..50a3fad 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -180,7 +180,11 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) va = info->address; len = bp->attr.bp_len; - return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); + /* + * We don't need to worry about va + len - 1 overflowing: + * we already require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } int arch_bp_generic_fields(int x86_len, int x86_type, -- cgit v0.10.2 From 9a6694cfa2390181dec936a17c0d9d21ef7b08d9 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Jul 2015 16:48:24 +0300 Subject: perf/x86/intel/pt: Do not force sync packets on every schedule-in Currently, the PT driver zeroes out the status register every time before starting the event. However, all the writable bits are already taken care of in pt_handle_status() function, except the new PacketByteCnt field, which in new versions of PT contains the number of packet bytes written since the last sync (PSB) packet. Zeroing it out before enabling PT forces a sync packet to be written. This means that, with the existing code, a sync packet (PSB and PSBEND, 18 bytes in total) will be generated every time a PT event is scheduled in. 
To avoid these unnecessary syncs and save a WRMSR in the fast path, this patch changes the default behavior to not clear the PacketByteCnt field, so that the sync packets will be generated with the period specified as "psb_period" attribute config field. This has little impact on the trace data as the other packets that are normally sent within PSB+ (between PSB and PSBEND) have their own generation scenarios which do not depend on the sync packets. One exception where we do need to force PSB like this is when tracing starts, so that the decoder has a clear sync point in the trace. For this purpose we already have hw::itrace_started flag, which we are currently using to output PERF_RECORD_ITRACE_START. This patch moves setting itrace_started from perf core to the pmu::start, where it should still be 0 on the very first run. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1438264104-16189-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 183de71..cc58ef8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -191,6 +191,11 @@ static void pt_config(struct perf_event *event) { u64 reg; + if (!event->hw.itrace_started) { + event->hw.itrace_started = 1; + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + } + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; if (!event->attr.exclude_kernel) @@ -910,7 +915,6 @@ void intel_pt_interrupt(void) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } } @@ -934,7 +938,6 @@ static void pt_event_start(struct perf_event *event, int mode) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } diff --git a/kernel/events/core.c b/kernel/events/core.c index a9796c8..bdea129 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6139,8 +6139,6 @@ static void perf_log_itrace_start(struct perf_event *event) event->hw.itrace_started) return; - event->hw.itrace_started = 1; - rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); -- cgit v0.10.2 From b1bf72d6691cc33fc7763fc8ec77df42ca1a8702 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Jul 2015 16:15:31 +0300 Subject: perf/x86/intel/pt: Add new timing packet enables Intel PT chapter in the new Intel Architecture SDM adds several packets, corresponding enable bits, and registers that control packet generation. Also, additional bits in the Intel PT CPUID leaf were added to enumerate presence and parameters of these new packets and features. The packets and enables are: * CYC: cycle accurate mode, provides the number of cycles elapsed since previous CYC packet; its presence and available threshold values are enumerated via CPUID; * MTC: mini time counter packets, used for tracking TSC time between full TSC packets; its presence and available resolution options are enumerated via CPUID; * PSB packet period is now configurable, available period values are enumerated via CPUID. 
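These CPUID-enumerated capabilities are also exported through sysfs by the driver (the "caps" attribute group in the diff below), so tooling need not issue CPUID itself. A user-space sketch, assuming the usual intel_pt sysfs layout and that the cap files print a bare hex value:

#include <stdio.h>

/* Read one intel_pt capability from sysfs, e.g. "psb_periods".
 * Returns 0 if the file is missing (older kernel, or no PT hardware).
 */
static unsigned long pt_cap_read(const char *cap)
{
	char path[256];
	unsigned long val = 0;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/intel_pt/caps/%s", cap);
	f = fopen(path, "r");
	if (!f)
		return 0;
	if (fscanf(f, "%lx", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

int main(void)
{
	printf("psb_cyc: %lx\n", pt_cap_read("psb_cyc"));
	printf("mtc_periods bitmap: %lx\n", pt_cap_read("mtc_periods"));
	return 0;
}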
This patch adds corresponding bit and register definitions, pmu driver capabilities based on CPUID enumeration, new attribute format bits for the new features, and extends the event configuration validation function to take these into account. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1438262131-12725-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9ebc3d0..c665d34 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -80,13 +80,21 @@ #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) #define RTIT_CTL_OS BIT(2) #define RTIT_CTL_USR BIT(3) #define RTIT_CTL_CR3EN BIT(7) #define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_EN BIT(9) #define RTIT_CTL_TSC_EN BIT(10) #define RTIT_CTL_DISRETC BIT(11) #define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) #define MSR_IA32_RTIT_STATUS 0x00000571 #define RTIT_STATUS_CONTEXTEN BIT(1) #define RTIT_STATUS_TRIGGEREN BIT(2) diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h index 1c338b0..feb293e 100644 --- a/arch/x86/kernel/cpu/intel_pt.h +++ b/arch/x86/kernel/cpu/intel_pt.h @@ -72,9 +72,15 @@ struct topa_entry { enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, + PT_CAP_psb_cyc, + PT_CAP_mtc, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, + PT_CAP_single_range_output, PT_CAP_payloads_lip, + PT_CAP_mtc_periods, + PT_CAP_cycle_thresholds, + PT_CAP_psb_periods, }; struct pt_pmu { diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index cc58ef8..e20cfac 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -65,9 +65,15 @@ static struct pt_cap_desc { } pt_caps[] = { PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), + PT_CAP(mtc, 0, CR_EBX, BIT(3)), PT_CAP(topa_output, 0, CR_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), + PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), + PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), }; static u32 pt_cap_get(enum pt_capabilities cap) @@ -94,12 +100,22 @@ static struct attribute_group pt_cap_group = { .name = "caps", }; +PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); +PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); +PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { + &format_attr_cyc.attr, + &format_attr_mtc.attr, &format_attr_tsc.attr, &format_attr_noretcomp.attr, + &format_attr_mtc_period.attr, + &format_attr_cyc_thresh.attr, + &format_attr_psb_period.attr, NULL, }; @@ -170,15 +186,65 @@ fail: return ret; 
} -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC) +#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ + RTIT_CTL_CYC_THRESH | \ + RTIT_CTL_PSB_FREQ) + +#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ + RTIT_CTL_MTC_RANGE) + +#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ + RTIT_CTL_DISRETC | \ + RTIT_CTL_CYC_PSB | \ + RTIT_CTL_MTC) static bool pt_event_valid(struct perf_event *event) { u64 config = event->attr.config; + u64 allowed, requested; if ((config & PT_CONFIG_MASK) != config) return false; + if (config & RTIT_CTL_CYC_PSB) { + if (!pt_cap_get(PT_CAP_psb_cyc)) + return false; + + allowed = pt_cap_get(PT_CAP_psb_periods); + requested = (config & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + + allowed = pt_cap_get(PT_CAP_cycle_thresholds); + requested = (config & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + } + + if (config & RTIT_CTL_MTC) { + /* + * In the unlikely case that CPUID lists valid mtc periods, + * but not the mtc capability, drop out here. + * + * Spec says that setting mtc period bits while mtc bit in + * CPUID is 0 will #GP, so better safe than sorry. + */ + if (!pt_cap_get(PT_CAP_mtc)) + return false; + + allowed = pt_cap_get(PT_CAP_mtc_periods); + if (!allowed) + return false; + + requested = (config & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET; + + if (!(allowed & BIT(requested))) + return false; + } + return true; } -- cgit v0.10.2 From a94cab2376cb35f236be14e2833cef63a8762a31 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:39 -0700 Subject: perf/x86: Add a native_perf_sched_clock_from_tsc() PEBSv3 has a raw TSC time stamp in its memory buffer that later needs to be converted to perf_clock. Add a native_sched_clock_from_tsc() that works the same as native_sched_clock(), but starts with an already given TSC value. Paravirt is ignored; it will just get the native clock. But there isn't a paravirtualized PEBS anyway. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 94605c0..aad56eb 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -51,6 +51,7 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern int check_tsc_disabled(void); extern unsigned long native_calibrate_tsc(void); +extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7437b41..88e9a38 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -296,6 +296,14 @@ u64 native_sched_clock(void) return cycles_2_ns(tsc_now); } +/* + * Generate a sched_clock if you already have a TSC value. + */ +u64 native_sched_clock_from_tsc(u64 tsc) +{ + return cycles_2_ns(tsc); +} + /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT -- cgit v0.10.2 From 2f7ebf2ec2a7b311318aae10b8373b0bd93001a7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:40 -0700 Subject: perf/x86/intel: Add support for PEBSv3 profiling PEBSv3 is the same as the existing PEBSv2 used on Haswell, but it adds a new TSC field. 
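Converting that raw TSC value is exactly what native_sched_clock_from_tsc() above does via cycles_2_ns(), which is a mult/shift scaling plus an offset. A simplified sketch of the arithmetic (placeholder calibration values, and ignoring the per-CPU seqcount the kernel uses):

#include <stdint.h>

/* ns = cycles * mult / 2^shift + offset, in integer math; mult and
 * shift are calibrated so that mult / 2^shift ~= 1e6 / tsc_khz.
 */
struct cyc2ns_params {
	uint32_t mult;
	uint32_t shift;
	uint64_t offset;
};

static uint64_t cycles_to_ns(const struct cyc2ns_params *p, uint64_t cycles)
{
	/* 128-bit intermediate so the multiply cannot overflow */
	return (uint64_t)((unsigned __int128)cycles * p->mult >> p->shift)
	       + p->offset;
}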
Add support to the generic PEBS handler to handle the new format, and overwrite the perf time stamp using the new native_sched_clock_from_tsc(). Right now the time stamp is just slightly more accurate, as it is nearer the actual event trigger point. With the PEBS threshold > 1 patchkit it will be much more accurate, avoid the problems with MMAP mismatches earlier. The accurate time stamping is only implemented for the default trace clock for now. v2: Use _skl prefix. Check for default clock_id. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-3-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 71fc402..410270a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -224,6 +224,19 @@ union hsw_tsx_tuning { #define PEBS_HSW_TSX_FLAGS 0xff00000000ULL +/* Same as HSW, plus TSC */ + +struct pebs_record_skl { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; + u64 tsc; +}; + void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -885,7 +898,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) { if (pebs->tsx_tuning) { union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; @@ -894,7 +907,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) return 0; } -static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) { u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; @@ -918,7 +931,7 @@ static void setup_pebs_sample_data(struct perf_event *event, * unconditionally access the 'extra' entries. */ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct pebs_record_hsw *pebs = __pebs; + struct pebs_record_skl *pebs = __pebs; u64 sample_type; int fll, fst, dsrc; int fl = event->hw.flags; @@ -1016,6 +1029,16 @@ static void setup_pebs_sample_data(struct perf_event *event, data->txn = intel_hsw_transaction(pebs); } + /* + * v3 supplies an accurate time stamp, so we use that + * for the time stamp. + * + * We can only do this for the default trace clock. + */ + if (x86_pmu.intel_cap.pebs_format >= 3 && + event->attr.use_clockid == 0) + data->time = native_sched_clock_from_tsc(pebs->tsc); + if (has_branch_stack(event)) data->br_stack = &cpuc->lbr_stack; } @@ -1245,6 +1268,13 @@ void __init intel_ds_init(void) x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; + case 3: + pr_cont("PEBS fmt3%c, ", pebs_type); + x86_pmu.pebs_record_size = + sizeof(struct pebs_record_skl); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + break; + default: printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; -- cgit v0.10.2 From a7b58d211ba18c9175b139e18b68c86a6bcc3c3f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2015 21:13:14 -0700 Subject: perf/x86/intel/lbr: Allow time stamp for free running PEBSv3 With PEBSv3 the PEBS record contains a time stamp. That means we can allow free-running PEBS without a PMI even if the user program requested a time stamp. 
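The eligibility test is a subset check on attr.sample_type: free running is allowed only when every requested sample field can be supplied from the PEBS record itself, so with PEBSv3 a time stamp request alone no longer forces a PMI per sample. Schematically:

#include <stdbool.h>
#include <stdint.h>

/* Allow free running (no PMI per sample) only if the user requested
 * nothing outside the fields the PEBS record itself can supply.
 */
static bool can_use_freerunning(uint64_t sample_type,
				uint64_t free_running_flags)
{
	return (sample_type & ~free_running_flags) == 0;
}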
This avoids the need to use -T to get free running PEBS, and also avoids any problems with mis-identifying MMAPs later. Move the free_running_flags state into a variable in x86_pmu and use it. This only works when no explicit clock_id is set. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: eranian@google.com Cc: jolsa@redhat.com Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1432786398-23861-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3474cf2..7378b10 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -594,6 +594,7 @@ struct x86_pmu { struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); int max_pebs_events; + unsigned long free_running_flags; /* * Intel LBR diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 71815cf..cb112bf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2257,6 +2257,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event) } } +static unsigned long intel_pmu_free_running_flags(struct perf_event *event) +{ + unsigned long flags = x86_pmu.free_running_flags; + + if (event->attr.use_clockid) + flags &= ~PERF_SAMPLE_TIME; + return flags; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -2267,7 +2276,8 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip) { if (!event->attr.freq) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; - if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) + if (!(event->attr.sample_type & + ~intel_pmu_free_running_flags(event))) event->hw.flags |= PERF_X86_EVENT_FREERUNNING; } if (x86_pmu.pebs_aliases) @@ -2689,6 +2699,8 @@ static __initconst const struct x86_pmu core_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, + /* * Intel PMCs cannot be accessed sanely above 32-bit width, * so we install an artificial 1<<31 period regardless of @@ -2727,6 +2739,7 @@ static __initconst const struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 410270a..03773c2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1273,6 +1273,7 @@ void __init intel_ds_init(void) x86_pmu.pebs_record_size = sizeof(struct pebs_record_skl); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; break; default: -- cgit v0.10.2 From b83ff1c8617aac03a1cf807aafa848fe0f0908f2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:41 -0700 Subject: x86: Add new MSRs and MSR bits used for Intel Skylake PMU support Add new MSRs (LBR_INFO) and some new MSR bits used by the Intel Skylake PMU driver. 
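For orientation, one MSR_LBR_INFO_x value unpacks as follows under the bit layout added below (bit 63 mispredict, bit 62 in-transaction, bit 61 abort, low 16 bits elapsed cycles); a stand-alone decode sketch:

#include <stdint.h>

struct lbr_info_fields {
	unsigned int mispred;
	unsigned int in_tx;
	unsigned int abort;
	uint16_t     cycles;
};

static struct lbr_info_fields decode_lbr_info(uint64_t info)
{
	struct lbr_info_fields d = {
		.mispred = (unsigned int)(info >> 63) & 1,
		.in_tx   = (unsigned int)(info >> 62) & 1,
		.abort   = (unsigned int)(info >> 61) & 1,
		.cycles  = (uint16_t)(info & 0xffff),
	};
	return d;
}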
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-4-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c665d34..fcd17c1 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -73,6 +73,12 @@ #define MSR_LBR_CORE_FROM 0x00000040 #define MSR_LBR_CORE_TO 0x00000060 +#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */ +#define LBR_INFO_MISPRED BIT_ULL(63) +#define LBR_INFO_IN_TX BIT_ULL(62) +#define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYCLES 0xffff + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index dc0f6ed..7bcb861 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -159,6 +159,13 @@ struct x86_pmu_capability { */ #define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) +#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) +#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62) +#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) +#define GLOBAL_STATUS_ASIF BIT_ULL(60) +#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) +#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) + /* * IBS cpuid feature detection */ -- cgit v0.10.2 From 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:42 -0700 Subject: perf: Add cycles to branch_info Intel Skylake supports reporting the time in cycles a branch in the LBR took, to give a rough indication of the basic block performance. Export the cycle information in the branch_info structure. This can be done by just reusing some currently zero padding. This is just the generic header change. The architecture still needs to fill it in. There's no attempt to convert to real time, as we really want cycles here. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-5-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 022d0ac..2881145 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -951,6 +951,7 @@ union perf_mem_data_src { * * in_tx: running in a hardware transaction * abort: aborting a hardware transaction + * cycles: cycles from last branch (or 0 if not supported) */ struct perf_branch_entry { __u64 from; @@ -959,7 +960,8 @@ struct perf_branch_entry { predicted:1,/* target predicted */ in_tx:1, /* in transaction */ abort:1, /* transaction abort */ - reserved:60; + cycles:16, /* cycle count to last branch */ + reserved:44; }; #endif /* _UAPI_LINUX_PERF_EVENT_H */ -- cgit v0.10.2 From 50eab8f6ecd77ae4f9742f8e21ea50705ce0f830 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:43 -0700 Subject: perf/x86/intel/lbr: Add support for LBRv5 Add support for the new LBRv5 format used on Intel Skylake CPUs. The flags for mispredict, abort, in_tx etc. moved to range of separate LBR_INFO_* MSRs. Teach the LBR code to read those. The original LBR registers stay the same, except they have full sign extension now. LBR_INFO also reports a cycle count to the last branch. 
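Once filled in, the per-branch cycle count reaches user space through struct perf_branch_entry, which the previous patch extended with a 16-bit cycles field. A consumer-side sketch (assumes a uapi header new enough to carry that field; entries come from a decoded PERF_SAMPLE_BRANCH_STACK sample):

#include <stdio.h>
#include <linux/perf_event.h>

static void print_branch_cycles(const struct perf_branch_entry *entries,
				unsigned long nr)
{
	unsigned long i;

	for (i = 0; i < nr; i++)
		printf("%#llx -> %#llx: %u cycles\n",
		       (unsigned long long)entries[i].from,
		       (unsigned long long)entries[i].to,
		       (unsigned int)entries[i].cycles); /* 0 on pre-Skylake hardware */
}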
Report the cycle information using the new "cycles" branch_info output field. In addition we have to context switch and clear the new INFO MSRs to avoid any information leaks. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-6-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7378b10..e9c5bdf 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -625,6 +625,7 @@ struct x86_pmu { struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; + u64 lbr_info[MAX_LBR_ENTRIES]; int lbr_callstack_users; int lbr_stack_state; }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 452a7bd..2fb5737 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -13,7 +13,8 @@ enum { LBR_FORMAT_EIP = 0x02, LBR_FORMAT_EIP_FLAGS = 0x03, LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, }; static enum { @@ -186,6 +187,8 @@ static void intel_pmu_lbr_reset_64(void) for (i = 0; i < x86_pmu.lbr_nr; i++) { wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + i, 0); } } @@ -234,6 +237,8 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) lbr_idx = (tos - i) & mask; wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]); } task_ctx->lbr_stack_state = LBR_NONE; } @@ -255,6 +260,8 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) lbr_idx = (tos - i) & mask; rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + rdmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]); } task_ctx->lbr_stack_state = LBR_VALID; } @@ -416,11 +423,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) unsigned long lbr_idx = (tos - i) & mask; u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; int skip = 0; + u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + if (lbr_format == LBR_FORMAT_INFO) { + u64 info; + + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + mis = !!(info & LBR_INFO_MISPRED); + pred = !mis; + in_tx = !!(info & LBR_INFO_IN_TX); + abort = !!(info & LBR_INFO_ABORT); + cycles = (info & LBR_INFO_CYCLES); + } if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; @@ -450,6 +468,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_entries[out].predicted = pred; cpuc->lbr_entries[out].in_tx = in_tx; cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].cycles = cycles; cpuc->lbr_entries[out].reserved = 0; out++; } -- cgit v0.10.2 From d8020bee1d0caa90e7b9d6f39ac1fdfaaee7f67f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:45 -0700 Subject: perf/x86/intel: Handle new arch perfmon v4 status bits ArchPerfmon v4 has some new status bits in 
GLOBAL_STATUS. These need to be ignored when deciding whether a NMI was an NMI, to avoid eating all NMIs when they stay set, see: b292d7a10487 ("perf/x86/intel: ignore CondChgd bit to avoid false NMI handling") This patch ignores the new ASIF bit, which indicates that SGX interfered with the PMU, and also the new LBR freezing bits, which are set when the LBRs get frozen, plus the existing CondChange (set by JTAG debuggers and some buggy BIOSes) Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-8-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cb112bf..52c9ded 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1611,13 +1611,14 @@ again: intel_pmu_lbr_read(); /* - * CondChgd bit 63 doesn't mean any overflow status. Ignore - * and clear the bit. + * Ignore a range of extra bits in status that do not indicate + * overflow by themselves. */ - if (__test_and_clear_bit(63, (unsigned long *)&status)) { - if (!status) - goto done; - } + status &= ~(GLOBAL_STATUS_COND_CHG | + GLOBAL_STATUS_ASIF | + GLOBAL_STATUS_LBRS_FROZEN); + if (!status) + goto done; /* * PEBS overflow sets bit 62 in the global status register -- cgit v0.10.2 From 0f29e573dd32bb8598e74271454e97c962da5e05 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:47 -0700 Subject: perf/x86/intel: Move PMU ACK to after LBR read With Arch Perfmon v4 the PMU ack unfreezes the LBRs. So we need to do the PMU ack after the LBR reading, otherwise the LBRs would be polluted by the PMI handler. This is a minimal change. In principle the ACK could be moved much later. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-10-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 52c9ded..da93b4b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1594,6 +1594,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: + intel_pmu_lbr_read(); intel_pmu_ack_status(status); if (++loops > 100) { static bool warned = false; @@ -1608,7 +1609,6 @@ again: inc_irq_stat(apic_perf_irqs); - intel_pmu_lbr_read(); /* * Ignore a range of extra bits in status that do not indicate -- cgit v0.10.2 From 425507fa5f321bb5ce1b5eb57a9586e0cf0b9802 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:46 -0700 Subject: perf/x86/intel/lbr: Optimize v4 LBR unfreezing In Arch perfmon v4 the GLOBAL_STATUS reset automatically unfreezes LBRs. So no need to do it manually in the LBR code. Add a check to skip it. v2: Move test up to beginning of function. 
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-9-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 2fb5737..769a42f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -141,6 +141,13 @@ static void __intel_pmu_lbr_enable(bool pmi) u64 debugctl, lbr_select = 0, orig_debugctl; /* + * No need to unfreeze manually, as v4 can do that as part + * of the GLOBAL_STATUS ack. + */ + if (pmi && x86_pmu.version >= 4) + return; + + /* * No need to reprogram LBR_SELECT in a PMI, as it * did not change. */ -- cgit v0.10.2 From 9a92e16fd7b4ccd9aabcbc4d42a3fb5f9a3cf4a1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 10 May 2015 12:22:44 -0700 Subject: perf/x86/intel: Add Intel Skylake PMU support Add perf core PMU support for future Intel Skylake CPU cores. The code is based on Haswell/Broadwell. There is a new cache event list, based on the updated Haswell event list. Skylake has removed most counter constraints on basic events, so the basic constraints table now only has a single entry (plus the fixed counters). TSX support and various other setups are all shared with Haswell. Skylake has 32 LBR entries. Add a new LBR init function to set this up. The filters are all the same as Haswell. It also has a new LBR format with a separate LBR_INFO_* MSR, but that has been already added earlier. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1431285767-27027-7-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index e9c5bdf..8ad9241 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -165,7 +165,7 @@ struct intel_excl_cntrs { unsigned core_id; /* per-core: core id */ }; -#define MAX_LBR_ENTRIES 16 +#define MAX_LBR_ENTRIES 32 enum { X86_PERF_KFREE_SHARED = 0, @@ -861,6 +861,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[]; extern struct event_constraint intel_hsw_pebs_event_constraints[]; +extern struct event_constraint intel_skl_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_enable(struct perf_event *event); @@ -899,6 +901,8 @@ void intel_pmu_lbr_init_snb(void); void intel_pmu_lbr_init_hsw(void); +void intel_pmu_lbr_init_skl(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index da93b4b..28fc272 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +struct event_constraint intel_skl_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + EVENT_CONSTRAINT_END +}; + static struct extra_reg intel_snb_extra_regs[] 
__read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), @@ -193,6 +201,13 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_skl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -244,6 +259,200 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts. + * - icache miss does not include decoded icache + */ + +#define SKL_DEMAND_DATA_RD BIT_ULL(0) +#define SKL_DEMAND_RFO BIT_ULL(1) +#define SKL_ANY_RESPONSE BIT_ULL(16) +#define SKL_SUPPLIER_NONE BIT_ULL(17) +#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26) +#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27) +#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28) +#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29) +#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \ + SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) +#define SKL_SPL_HIT BIT_ULL(30) +#define SKL_SNOOP_NONE BIT_ULL(31) +#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32) +#define SKL_SNOOP_MISS BIT_ULL(33) +#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define SKL_SNOOP_HITM BIT_ULL(36) +#define SKL_SNOOP_NON_DRAM BIT_ULL(37) +#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) +#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD +#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SPL_HIT) +#define SKL_DEMAND_WRITE SKL_DEMAND_RFO +#define SKL_LLC_ACCESS SKL_ANY_RESPONSE +#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) + +static __initconst const u64 skl_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE 
*/ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 skl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + #define SNB_DMND_DATA_RD (1ULL << 0) #define SNB_DMND_RFO (1ULL << 1) #define SNB_DMND_IFETCH (1ULL << 2) @@ -3278,6 +3487,29 @@ __init int intel_pmu_init(void) pr_cont("Broadwell events, "); break; + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_skl_event_constraints; + x86_pmu.pebs_constraints = 
intel_skl_pebs_event_constraints; + x86_pmu.extra_regs = intel_skl_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + WARN_ON(!x86_pmu.format_attrs); + x86_pmu.cpu_events = hsw_events_attrs; + pr_cont("Skylake events, "); + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 03773c2..2f7ee05 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -688,6 +688,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_skl_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 769a42f..b432c47 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -973,6 +973,26 @@ void intel_pmu_lbr_init_hsw(void) pr_cont("16-deep LBR, "); } +/* skylake */ +__init void intel_pmu_lbr_init_skl(void) +{ + x86_pmu.lbr_nr = 32; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. 
+ * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("32-deep LBR, "); +} + /* atom */ void __init intel_pmu_lbr_init_atom(void) { -- cgit v0.10.2 From e0573364b8c5b17401569ef581f1625803210f4d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2015 21:13:17 -0700 Subject: perf/x86/intel/lbr: Use correct index to save/restore LBR_INFO with call stack Use the correct index to save/restore the LBR_INFO_x MSR in callstack mode. This is more a cleanup, as even with the wrong index the register was correctly saved/restored, and also LBR callgraph mode in perf tools do not really need anything in LBR_INFO. But still better to use the right index. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1432786398-23861-5-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index b432c47..a5bc424 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -245,7 +245,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]); + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->lbr_stack_state = LBR_NONE; } @@ -268,7 +268,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + i, task_ctx->lbr_info[i]); + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->lbr_stack_state = LBR_VALID; } -- cgit v0.10.2 From 90405aa02247c1a6313c33e2253f9fd2299ae60b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 27 May 2015 21:13:18 -0700 Subject: perf/x86/intel/lbr: Limit LBR accesses to TOS in callstack mode In callstack mode the LBR is not a ring buffer, but a stack that grows up and down. This means in this case we don't need to access all LBRs, only the ones up to TOS. Do this optimization for the normal LBR read, and the context switch save/restore code. For save/restore it can be done unconditionally, as it only runs when call stack mode is active. This recovers some of the cost of going to 32 LBRs on Skylake. 
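The indexing is the key detail: (tos - i) & mask walks backwards from the newest entry, and in callstack mode only the slots below the top-of-stack are live. A stand-alone sketch of the traversal:

#include <stdint.h>

#define LBR_NR 32 /* Skylake depth, per this series */

/* Visit LBR slots newest-first; the mask wraps around the ring.
 * In callstack mode the LBR is a stack, so only 'tos' entries are live;
 * otherwise every slot may hold a valid entry.
 */
static void walk_lbrs(uint64_t tos, int callstack_mode,
		      void (*visit)(unsigned int idx))
{
	unsigned int mask = LBR_NR - 1;
	uint64_t num = callstack_mode ? tos : LBR_NR;
	uint64_t i;

	for (i = 0; i < num; i++)
		visit((unsigned int)((tos - i) & mask));
}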
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1432786398-23861-6-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index a5bc424..b2c9475 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -240,7 +240,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { + for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); @@ -263,7 +263,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { + for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); @@ -425,8 +425,12 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; + int num = x86_pmu.lbr_nr; - for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (cpuc->lbr_sel->config & LBR_CALL_STACK) + num = tos; + + for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; int skip = 0; -- cgit v0.10.2 From 47732d886385af769449022a02c7cf0ce45d8a5c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 29 Jun 2015 14:22:13 -0700 Subject: perf/x86: Make merge_attr() global to use from perf_event_intel merge_attr() allows to merge two sysfs attribute tables. Export it to be usable by other files too. Next patch is going to use that to extend the sysfs format attributes for a CPU. 
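merge_attr() operates on NULL-terminated arrays of attribute pointers. A generic user-space sketch of the same operation (illustrative code, not the kernel implementation):

#include <stdlib.h>
#include <string.h>

/* Concatenate two NULL-terminated pointer arrays into a new allocation. */
static void **merge_ptr_arrays(void * const *a, void * const *b)
{
	size_t na = 0, nb = 0;
	void **merged;

	while (a && a[na])
		na++;
	while (b && b[nb])
		nb++;

	merged = malloc((na + nb + 1) * sizeof(*merged));
	if (!merged)
		return NULL;

	if (na)
		memcpy(merged, a, na * sizeof(*merged));
	if (nb)
		memcpy(merged + na, b, nb * sizeof(*merged));
	merged[na + nb] = NULL;	/* preserve the terminator */

	return merged;
}

The caller owns the returned array while the pointed-to attributes stay shared, which is also how the sysfs attribute tables get reused.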
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1435612935-24425-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3658de4..8bac4bb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1551,7 +1551,7 @@ static void __init filter_events(struct attribute **attrs) } /* Merge two pointer arrays */ -static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) { struct attribute **new; int j, i; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8ad9241..5edf6d8 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -795,6 +795,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); +struct attribute **merge_attr(struct attribute **a, struct attribute **b); + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); @@ -926,6 +928,7 @@ static inline int is_ht_workaround_enabled(void) { return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); } + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) -- cgit v0.10.2 From 8c4fe7095d633dd5543690ea5c3d522c5cd989b6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 30 Jun 2015 16:33:24 -0700 Subject: perf/x86/intel: Use 0x11 as extra reg test value The next patch adds a new perf extra register where 0x1ff is not a valid value. Use 0x11 instead. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435707205-6676-3-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 28fc272..a478e3c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3579,7 +3579,7 @@ __init int intel_pmu_init(void) */ if (x86_pmu.extra_regs) { for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x1ffUL); + er->extra_msr_access = check_msr(er->msr, 0x11UL); /* Disable LBR select mapping */ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) x86_pmu.lbr_sel_map = NULL; -- cgit v0.10.2 From 070e98873cf7196cad58f8b6e5278dd5533c81f0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 2 Jul 2015 08:12:52 -0400 Subject: perf/x86/intel/uncore: Add Broadwell-DE uncore support The uncore subsystem for Broadwell-DE is similar to Haswell-EP. There are some differences in pci device IDs, box number and constraints. 
Please refer to the public document: http://www.intel.com/content/www/us/en/processors/xeon/xeon-d-1500-uncore-performance-monitoring.html Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/1435839172-15114-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c2af967..560e525 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -911,6 +911,9 @@ static int __init uncore_pci_init(void) case 63: /* Haswell-EP */ ret = hswep_uncore_pci_init(); break; + case 86: /* BDX-DE */ + ret = bdx_uncore_pci_init(); + break; case 42: /* Sandy Bridge */ ret = snb_uncore_pci_init(); break; @@ -1229,6 +1232,9 @@ static int __init uncore_cpu_init(void) case 63: /* Haswell-EP */ hswep_uncore_cpu_init(); break; + case 86: /* BDX-DE */ + bdx_uncore_cpu_init(); + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 0f77f0a..72c54c2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -336,6 +336,8 @@ int ivbep_uncore_pci_init(void); void ivbep_uncore_cpu_init(void); int hswep_uncore_pci_init(void); void hswep_uncore_cpu_init(void); +int bdx_uncore_pci_init(void); +void bdx_uncore_cpu_init(void); /* perf_event_intel_uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 76a3feb..694510a8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -2321,3 +2321,167 @@ int hswep_uncore_pci_init(void) return 0; } /* end of Haswell-EP uncore support */ + +/* BDX-DE uncore support */ + +static struct intel_uncore_type bdx_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct event_constraint bdx_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = bdx_uncore_cbox_constraints, + .ops = &hswep_uncore_cbox_ops, + .format_group = &hswep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type *bdx_msr_uncores[] = { + &bdx_uncore_ubox, + &bdx_uncore_cbox, + &hswep_uncore_pcu, + NULL, +}; + +void bdx_uncore_cpu_init(void) +{ + if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = bdx_msr_uncores; +} 
+ +static struct intel_uncore_type bdx_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &hswep_uncore_irp_ops, + .format_group = &snbep_uncore_format_group, +}; + + +static struct event_constraint bdx_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + BDX_PCI_UNCORE_HA, + BDX_PCI_UNCORE_IMC, + BDX_PCI_UNCORE_IRP, + BDX_PCI_UNCORE_R2PCIE, +}; + +static struct intel_uncore_type *bdx_pci_uncores[] = { + [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, + [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, + [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, + [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver bdx_uncore_pci_driver = { + .name = "bdx_uncore", + .id_table = bdx_uncore_pci_ids, +}; + +int bdx_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x6f1e); + + if (ret) + return ret; + uncore_pci_uncores = bdx_pci_uncores; + uncore_pci_driver = &bdx_uncore_pci_driver; + return 0; +} + +/* end of BDX-DE uncore support */ -- cgit v0.10.2 From b7b7c7821d932ba18ef6c8eafc8536066b4c2ef4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Jul 2015 11:49:06 -0400 Subject: perf/x86: Add an MSR PMU driver This patch adds an MSR PMU to support free running MSR counters. Such as time and freq related counters includes TSC, IA32_APERF, IA32_MPERF and IA32_PPERF, but also SMI_COUNT. The events are exposed in sysfs for use by perf stat and other tools. The files are under /sys/devices/msr/events/ Signed-off-by: Andy Lutomirski Signed-off-by: Kan Liang [ s/freq/msr/, added SMI_COUNT, fixed bugs. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: adrian.hunter@intel.com Cc: dsahern@gmail.com Cc: eranian@google.com Cc: jolsa@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/1437407346-31186-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9bff687..4eb065c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ perf_event_intel_uncore_snbep.o \ perf_event_intel_uncore_nhmex.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c new file mode 100644 index 0000000..af216e9 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -0,0 +1,242 @@ +#include + +enum perf_msr_id { + PERF_MSR_TSC = 0, + PERF_MSR_APERF = 1, + PERF_MSR_MPERF = 2, + PERF_MSR_PPERF = 3, + PERF_MSR_SMI = 4, + + PERF_MSR_EVENT_MAX, +}; + +struct perf_msr { + int id; + u64 msr; +}; + +static struct perf_msr msr[] = { + { PERF_MSR_TSC, 0 }, + { PERF_MSR_APERF, MSR_IA32_APERF }, + { PERF_MSR_MPERF, MSR_IA32_MPERF }, + { PERF_MSR_PPERF, MSR_PPERF }, + { PERF_MSR_SMI, MSR_SMI_COUNT }, +}; + +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); + +static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { + &evattr_tsc.attr.attr, +}; + +static struct attribute_group events_attr_group = { + .name = "events", + .attrs = events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); +static struct attribute *format_attrs[] = { + &format_attr_event.attr, + NULL, +}; +static struct attribute_group format_attr_group = { + .name = "format", + .attrs = format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { + &events_attr_group, + &format_attr_group, + NULL, +}; + +static int msr_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (cfg >= PERF_MSR_EVENT_MAX) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + event->hw.idx = -1; + event->hw.event_base = msr[cfg].msr; + event->hw.config = cfg; + + return 0; +} + +static inline u64 msr_read_counter(struct perf_event *event) +{ + u64 now; + + if (event->hw.event_base) + rdmsrl(event->hw.event_base, now); + else + now = rdtsc(); + + return now; +} +static void msr_event_update(struct perf_event *event) +{ + u64 prev, now; + s64 delta; + + /* Careful, an NMI might modify the previous event value. 
*/ +again: + prev = local64_read(&event->hw.prev_count); + now = msr_read_counter(event); + + if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { + delta <<= 32; + delta >>= 32; /* sign extend */ + } + local64_add(delta, &event->count); +} + +static void msr_event_start(struct perf_event *event, int flags) +{ + u64 now; + + now = msr_read_counter(event); + local64_set(&event->hw.prev_count, now); +} + +static void msr_event_stop(struct perf_event *event, int flags) +{ + msr_event_update(event); +} + +static void msr_event_del(struct perf_event *event, int flags) +{ + msr_event_stop(event, PERF_EF_UPDATE); +} + +static int msr_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + msr_event_start(event, flags); + + return 0; +} + +static struct pmu pmu_msr = { + .task_ctx_nr = perf_sw_context, + .attr_groups = attr_groups, + .event_init = msr_event_init, + .add = msr_event_add, + .del = msr_event_del, + .start = msr_event_start, + .stop = msr_event_stop, + .read = msr_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static int __init intel_msr_init(int idx) +{ + if (boot_cpu_data.x86 != 6) + return 0; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + events_attrs[idx++] = &evattr_smi.attr.attr; + break; + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + events_attrs[idx++] = &evattr_pperf.attr.attr; + events_attrs[idx++] = &evattr_smi.attr.attr; + break; + + case 55: /* 22nm Atom "Silvermont" */ + case 76: /* 14nm Atom "Airmont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + events_attrs[idx++] = &evattr_smi.attr.attr; + break; + } + + events_attrs[idx] = NULL; + + return 0; +} + +static int __init amd_msr_init(int idx) +{ + return 0; +} + +static int __init msr_init(void) +{ + int err; + int idx = 1; + + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) { + events_attrs[idx++] = &evattr_aperf.attr.attr; + events_attrs[idx++] = &evattr_mperf.attr.attr; + events_attrs[idx] = NULL; + } + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + err = intel_msr_init(idx); + break; + + case X86_VENDOR_AMD: + err = amd_msr_init(idx); + break; + + default: + err = -ENOTSUPP; + } + + if (err != 0) { + pr_cont("no msr PMU driver.\n"); + return 0; + } + + perf_pmu_register(&pmu_msr, "msr", -1); + + return 0; +} +device_initcall(msr_init); -- cgit v0.10.2 From 2a853e1123cbbb43ff74cc47b4ec582ce34df262 Mon Sep 17 00:00:00 2001 From: "Liang, Kan" Date: Fri, 3 Jul 2015 20:08:27 +0000 Subject: perf/x86/intel/pebs: Fix event disable PEBS buffer drain When disabling a PEBS event, we need to drain the buffer. Doing so requires a correct cpuc->pebs_active mask. The current code clears the pebs_active bit before draining the buffer.
Fix that. Signed-off-by: "Liang, Kan" Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/37D7C6CF3E00A74B8858931C1DB2F07701885A65@SHSMSX103.ccr.corp.intel.com [ Fixed the SOB. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 2f7ee05..8e564f8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -789,6 +789,11 @@ void intel_pmu_pebs_disable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; + bool large_pebs = ds->pebs_interrupt_threshold > + ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + if (large_pebs) + intel_pmu_drain_pebs_buffer(); cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -797,12 +802,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); - if (ds->pebs_interrupt_threshold > - ds->pebs_buffer_base + x86_pmu.pebs_record_size) { - intel_pmu_drain_pebs_buffer(); - if (!pebs_is_enabled(cpuc)) - perf_sched_cb_dec(event->ctx->pmu); - } + if (large_pebs && !pebs_is_enabled(cpuc)) + perf_sched_cb_dec(event->ctx->pmu); if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); -- cgit v0.10.2 From 75f80859b130a1cc84e59e71295ce2dd51fe1c81 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Jul 2015 14:35:46 +0200 Subject: perf/x86/intel/pebs: Robustify PEBS buffer drain Vince Weaver and Stephane Eranian reported warnings in the PEBS code when running the perf fuzzer. Stephane wrote: > I can reproduce the problem on my HSW running the fuzzer. > > I can see why this could be happening if you are mixing PEBS and non PEBS events > in the bottom 4 counters. I suspect: > for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { > if ((counts[bit] == 0) && (error[bit] == 0)) > continue; > > This test is not correct when you have non-PEBS events mixed with > PEBS events and they overflow at the same time. They will have > counts[i] != 0 but error[i] == 0, and thus you fall thru the loop > and hit the assert. Or it is something along those lines. The only way I can make this work is if ->status only has !PEBS events set, because if it has both set we'll take that slow path which masks out the !PEBS bits. After masking there are 3 options: - there is one bit set, and its @bit, we increment counts[bit]. - there are multiple bits set, we increment error[] for each set bit, we do not increment counts[]. - there are no bits set, we do nothing. The intent was to never increment counts[] for !PEBS events. Now if we start out with only a single !PEBS event set, we'll pass the test and increment counts[] for a !PEBS and hit the warn. 
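The masking described above condenses into a small stand-alone sketch (values and names here are made up; only the mask-then-classify logic reflects the fix):

#include <stdint.h>
#include <stdio.h>

static void classify_record(uint64_t status, uint64_t pebs_enabled,
			    unsigned int max_pebs_events)
{
	/* keep only bits that belong to enabled PEBS counters */
	uint64_t masked = status & pebs_enabled;

	masked &= (1ULL << max_pebs_events) - 1;

	if (!masked)
		printf("no PEBS bit set: drop silently\n");
	else if (!(masked & (masked - 1)))	/* exactly one bit set */
		printf("count event %d\n", __builtin_ctzll(masked));
	else
		printf("collision: bump error[] once per set bit\n");
}

int main(void)
{
	/* bit 2 belongs to a !PEBS event, so only bit 0 is counted */
	classify_record(0x5, 0x1, 4);
	return 0;
}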
Reported-by: Vince Weaver Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kan.liang@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 8e564f8..84f236a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1188,6 +1188,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; + u64 pebs_status; /* PEBS v3 has accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { @@ -1198,12 +1199,17 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; } - bit = find_first_bit((unsigned long *)&p->status, + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + + bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); - if (bit >= x86_pmu.max_pebs_events) - continue; - if (!test_bit(bit, cpuc->active_mask)) + if (WARN(bit >= x86_pmu.max_pebs_events, + "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx", + (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled, + *(unsigned long long *)cpuc->active_mask)) continue; + /* * The PEBS hardware does not deal well with the situation * when events happen near to each other and multiple bits @@ -1218,27 +1224,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * one, and it's not possible to reconstruct all events * that caused the PEBS record. It's called collision. * If collision happened, the record will be dropped. - * */ - if (p->status != (1 << bit)) { - u64 pebs_status; - - /* slow path */ - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; - if (pebs_status != (1 << bit)) { - for_each_set_bit(i, (unsigned long *)&pebs_status, - MAX_PEBS_EVENTS) - error[i]++; - continue; - } + if (p->status != (1ULL << bit)) { + for_each_set_bit(i, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) + error[i]++; + continue; } + counts[bit]++; } for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { if ((counts[bit] == 0) && (error[bit] == 0)) continue; + event = cpuc->events[bit]; WARN_ON_ONCE(!event); WARN_ON_ONCE(!event->attr.precise_ip); -- cgit v0.10.2 From 9ee67421fe3b67a0796d6ea620fd5fcb037d3f89 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 3 Aug 2015 16:27:40 -0300 Subject: perf script: No tracepoints? Don't call libtraceevent. The libtraceevent handler (session->tevent) is only initialized when there are tracepoints in a perf.data event list, so do not call pevent_set_function_resolve() in those cases, fixing a segfault. 
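The shape of the fix, reduced to a sketch (illustrative struct, not perf's actual types): an optional subsystem handle is dereferenced only when it was actually initialized.

struct session {
	void *tevent;	/* stays NULL unless tracepoints were present */
};

static int set_resolver(struct session *s)
{
	if (s->tevent == NULL)
		return 0;	/* nothing to configure, not an error */

	/* only now is it safe to call into libtraceevent */
	return 0;
}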
Reported-by: Jiri Olsa Tested-by: Jiri Olsa Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-xyynkucl5p4bcs13zi4i4b1f@git.kernel.org Report-link: http://lkml.kernel.org/r/20150803174113.GA20282@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index bd31380..7912feb 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1861,7 +1861,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) else symbol_conf.use_callchain = false; - if (pevent_set_function_resolver(session->tevent.pevent, + if (session->tevent.pevent && + pevent_set_function_resolver(session->tevent.pevent, machine__resolve_kernel_addr, &session->machines.host) < 0) { pr_err("%s: failed to set libtraceevent function resolver\n", __func__); -- cgit v0.10.2 From 2e5e5f876143b147b98d0c0854b5fc577412b05b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 3 Aug 2015 17:12:29 -0300 Subject: perf trace: Do not show syscall tracepoint filter in the --no-syscalls case We were accessing trace->syscalls.events members even when that struct wasn't initialized, i.e. --no-syscalls was specified on the command line, fix it to show that, still in debug mode, when we have an event qualifier list, i.e. when we actually are doing subset syscall tracing. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Stephane Eranian Fixes: 19867b6186f3 ("perf trace: Use event filters for the event qualifier list") Link: http://lkml.kernel.org/n/tip-7980ym6vujgh3yiai0cqzc88@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 06cfa93..3cfca93 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2394,9 +2394,10 @@ static int trace__run(struct trace *trace, int argc, const char **argv) err = trace__set_ev_qualifier_filter(trace); if (err < 0) goto out_errno; - } - pr_debug("%s\n", trace->syscalls.events.sys_exit->filter); + pr_debug("event qualifier tracepoint filter: %s\n", + trace->syscalls.events.sys_exit->filter); + } err = perf_evlist__apply_filters(evlist, &evsel); if (err < 0) -- cgit v0.10.2 From 08c987763ad3bc1aef133bcac073198ff86efa85 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 4 Aug 2015 17:01:04 -0300 Subject: perf trace: Remember if the vfs_getname tracepoint/kprobe is in place So that we can later decide if we will store where to expand the pathname once we are handling vfs_getname or if we should instead just go on and straight away print the pointer. 
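A sketch of how such a flag is meant to be consumed (illustrative names, not the perf sources): when the pathname tracepoint/kprobe could not be set up, print the raw pointer right away instead of reserving space for a translation that will never arrive.

#include <stdbool.h>
#include <stdio.h>

struct tracer {
	bool vfs_getname;	/* set by the add function above */
};

static int print_filename_arg(struct tracer *t, char *bf, size_t size,
			      unsigned long ptr)
{
	if (!t->vfs_getname)
		return snprintf(bf, size, "%#lx", ptr); /* raw pointer */

	return 0; /* leave a hole; contents arrive with vfs_getname */
}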
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-ytxk5s5jpc50wahffmlxgxuw@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 3cfca93..aa1e288 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1312,6 +1312,7 @@ struct trace { bool show_tool_stats; bool trace_syscalls; bool force; + bool vfs_getname; int trace_pgfaults; }; @@ -2188,19 +2189,20 @@ static int trace__record(struct trace *trace, int argc, const char **argv) static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp); -static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist) +static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) { struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname"); if (evsel == NULL) - return; + return false; if (perf_evsel__field(evsel, "pathname") == NULL) { perf_evsel__delete(evsel); - return; + return false; } evsel->handler = trace__vfs_getname; perf_evlist__add(evlist, evsel); + return true; } static int perf_evlist__add_pgfault(struct perf_evlist *evlist, @@ -2330,7 +2332,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_error_raw_syscalls; if (trace->trace_syscalls) - perf_evlist__add_vfs_getname(evlist); + trace->vfs_getname = perf_evlist__add_vfs_getname(evlist); if ((trace->trace_pgfaults & TRACE_PFMAJ) && perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) { -- cgit v0.10.2 From e4d44e830a4bbeb0cf6fdd24c24794a2e599e26d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 4 Aug 2015 22:17:29 -0300 Subject: perf trace: Use a constant for the syscall formatting buffer We were using it as a magic number, 1024, fix that. Eventually we need to stop doing it per line, and do it per arg, traversing the args at output time, to avoid the memmove() calls that will be used in the next cset to replace pointers present at raw_syscalls:sys_enter time with its contents that appear at probe:vfs_getname time, before raw_syscalls:sys_exit time. 
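A sketch of the bounded-append pattern this constant feeds (generic code, not the perf sources). One pitfall worth noting: plain snprintf() returns the length that would have been written, so its result must be clamped before being added to the running offset, which is exactly what the tree's scnprintf() does:

#include <stdio.h>

static const size_t entry_str_size = 2048;

/* Append to buf at offset 'printed'; assumes printed < entry_str_size. */
static size_t append(char *buf, size_t printed, const char *str)
{
	size_t room = entry_str_size - printed;
	int n = snprintf(buf + printed, room, "%s", str);

	if (n < 0)
		return printed;
	if ((size_t)n >= room)
		n = room - 1;	/* clamp, like scnprintf() */
	return printed + n;
}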
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-4sz3wid39egay1pp8qmbur4u@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index aa1e288..12d6fc0 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1262,6 +1262,8 @@ fail: #define TRACE_PFMAJ (1 << 0) #define TRACE_PFMIN (1 << 1) +static const size_t trace__entry_str_size = 2048; + struct trace { struct perf_tool tool; struct { @@ -1822,7 +1824,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, args = perf_evsel__sc_tp_ptr(evsel, args, sample); if (ttrace->entry_str == NULL) { - ttrace->entry_str = malloc(1024); + ttrace->entry_str = malloc(trace__entry_str_size); if (!ttrace->entry_str) goto out_put; } @@ -1832,9 +1834,9 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, ttrace->entry_time = sample->time; msg = ttrace->entry_str; - printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name); + printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); - printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed, + printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed, args, trace, thread); if (sc->is_exit) { -- cgit v0.10.2 From f994592d9386548d15cea597cde37a858b61c762 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 4 Aug 2015 22:30:09 -0300 Subject: perf trace: Deref sys_enter pointer args with contents from probe:vfs_getname To work like strace and dereference syscall pointer args we need to insert probes (or tracepoints) right after we copy those bytes from userspace. Since we're formatting the syscall args at raw_syscalls:sys_enter time, we need to have a formatter that just stores the position where, later, when we get the probe:vfs_getname, we can insert the pointer contents. 
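The splice itself is ordinary memmove()/memcpy() work. A stand-alone sketch with made-up data (the real code additionally guards against overflowing the entry buffer):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char entry[64] = "open(filename: , flags: CLOEXEC)";
	size_t pos = strlen("open(filename: ");	/* remembered at sys_enter time */
	const char *filename = "/etc/passwd";		/* arrives with vfs_getname */
	size_t flen = strlen(filename);
	size_t to_move = strlen(entry) - pos + 1;	/* include the '\0' */

	memmove(entry + pos + flen, entry + pos, to_move);	/* shift the tail right */
	memcpy(entry + pos, filename, flen);			/* drop the string in */

	printf("%s\n", entry);	/* open(filename: /etc/passwd, flags: CLOEXEC) */
	return 0;
}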
Now, if a probe:vfs_getname with this format is in place: # perf probe -l probe:vfs_getname (on getname_flags:72@/home/git/linux/fs/namei.c with pathname) That was, in this case, put in place with: # perf probe 'vfs_getname=getname_flags:72 pathname=filename:string' Added new event: probe:vfs_getname (on getname_flags:72 with pathname=filename:string) You can now use it in all perf tools, such as: perf record -e probe:vfs_getname -aR sleep 1 # Then 'perf trace' will notice that and do the pointer -> contents expansion: # trace -e open touch /tmp/bla 0.165 (0.010 ms): touch/17752 open(filename: /etc/ld.so.cache, flags: CLOEXEC) = 3 0.195 (0.011 ms): touch/17752 open(filename: /lib64/libc.so.6, flags: CLOEXEC) = 3 0.512 (0.012 ms): touch/17752 open(filename: /usr/lib/locale/locale-archive, flags: CLOEXEC) = 3 0.582 (0.012 ms): touch/17752 open(filename: /tmp/bla, flags: CREAT|NOCTTY|NONBLOCK|WRONLY, mode: 438) = 3 # Roughly equivalent to strace's output: # strace -rT -e open touch /tmp/bla 0.000000 open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3 <0.000039> 0.000317 open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3 <0.000102> 0.001461 open("/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3 <0.000072> 0.000405 open("/tmp/bla", O_WRONLY|O_CREAT|O_NOCTTY|O_NONBLOCK, 0666) = 3 <0.000055> 0.000641 +++ exited with 0 +++ # Next we need to look at all syscalls that are marked as pointers and have well-known names ("filename", "pathname", etc.) and set the arg formatter to the one used for the "open" syscall in this patch; see the dispatch sketch below. This implementation works for syscalls with just one string being copied from userspace. To handle syscalls that copy more than one string via the same probe/tracepoint (vfs_getname) we need to extend the vfs_getname probe spec to include the pointer too, but there are some problems with that in 'perf probe' or in the kernel kprobes code, so this needs investigating before multiple strings per syscall can be supported.
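The per-argument hook boils down to a dispatch table: each syscall may attach one formatter per argument slot, and empty slots fall back to a default printer. A simplified sketch (types and names are illustrative, not the perf sources):

#include <stdio.h>

typedef size_t (*arg_fmt_t)(char *bf, size_t size, unsigned long val);

static size_t fmt_hex(char *bf, size_t size, unsigned long val)
{
	return (size_t)snprintf(bf, size, "%#lx", val);
}

struct sc_fmt {
	const char *name;
	arg_fmt_t arg_fmt[6];	/* one slot per syscall argument */
};

/* designated initializers pick the slots, as in the table below */
static const struct sc_fmt open_fmt = {
	.name	 = "open",
	.arg_fmt = { [1] = fmt_hex },	/* flags slot */
};

static size_t format_arg(const struct sc_fmt *sc, int idx,
			 char *bf, size_t size, unsigned long val)
{
	if (sc->arg_fmt[idx])
		return sc->arg_fmt[idx](bf, size, val);
	return (size_t)snprintf(bf, size, "%lu", val);	/* default */
}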
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-xvuwx6nuj8cf389kf9s2ue2s@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 12d6fc0..0255dd8 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -744,6 +744,11 @@ static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size, #define SCA_ACCMODE syscall_arg__scnprintf_access_mode +static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, + struct syscall_arg *arg); + +#define SCA_FILENAME syscall_arg__scnprintf_filename + static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, struct syscall_arg *arg) { @@ -1088,7 +1093,8 @@ static struct syscall_fmt { { .name = "newfstatat", .errmsg = true, .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, { .name = "open", .errmsg = true, - .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, }, + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ + [1] = SCA_OPEN_FLAGS, /* flags */ }, }, { .name = "open_by_handle_at", .errmsg = true, .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ [2] = SCA_OPEN_FLAGS, /* flags */ }, }, @@ -1208,6 +1214,11 @@ static size_t fprintf_duration(unsigned long t, FILE *fp) return printed + fprintf(fp, "): "); } +/** + * filename.ptr: The filename char pointer that will be vfs_getname'd + * filename.entry_str_pos: Where to insert the string translated from + * filename.ptr by the vfs_getname tracepoint/kprobe. + */ struct thread_trace { u64 entry_time; u64 exit_time; @@ -1216,6 +1227,10 @@ struct thread_trace { unsigned long pfmaj, pfmin; char *entry_str; double runtime_ms; + struct { + unsigned long ptr; + int entry_str_pos; + } filename; struct { int max; char **table; @@ -1418,6 +1433,27 @@ static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, return printed; } +static void thread__set_filename_pos(struct thread *thread, const char *bf, + unsigned long ptr) +{ + struct thread_trace *ttrace = thread__priv(thread); + + ttrace->filename.ptr = ptr; + ttrace->filename.entry_str_pos = bf - ttrace->entry_str; +} + +static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, + struct syscall_arg *arg) +{ + unsigned long ptr = arg->val; + + if (!arg->trace->vfs_getname) + return scnprintf(bf, size, "%#x", ptr); + + thread__set_filename_pos(arg->thread, bf, ptr); + return 0; +} + static bool trace__filter_duration(struct trace *trace, double t) { return t < (trace->duration_filter * NSEC_PER_MSEC); @@ -1938,7 +1974,45 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) { + struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); + struct thread_trace *ttrace; + size_t filename_len, entry_str_len, to_move; + ssize_t remaining_space; + char *pos; + const char *filename; + trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname"); + + if (!thread) + goto out; + + ttrace = thread__priv(thread); + if (!ttrace) + goto out; + + if (!ttrace->filename.ptr) + goto out; + + entry_str_len = strlen(ttrace->entry_str); + remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */ + if (remaining_space <= 0) + goto out; + + filename = trace->last_vfs_getname; + filename_len = strlen(filename); + if (filename_len > (size_t)remaining_space) { + filename += 
filename_len - remaining_space; + filename_len = remaining_space; + } + + to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */ + pos = ttrace->entry_str + ttrace->filename.entry_str_pos; + memmove(pos + filename_len, pos, to_move); + memcpy(pos, filename, filename_len); + + ttrace->filename.ptr = 0; + ttrace->filename.entry_str_pos = 0; +out: return 0; } -- cgit v0.10.2 From 34221118cb13f8b37480bce7286a1b44e90773d8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 4 Aug 2015 23:31:25 -0300 Subject: perf trace: Use vfs_getname syscall arg beautifier in more syscalls Those were covered and tested in this cset: access, chdir, chmod, chown, chroot, creat, getxattr, inotify_add_watch, lchown, lgetxattr, listxattr, lsetxattr, mkdir, mkdirat, mknod, rmdir, faccessat, newfstatat, openat, readlink, readlinkat, removexattr, setxattr, statfs, swapon, swapoff, truncate, unlinkat, utime, utimes, utimensat. E.g.: # trace -e statfs,access,mkdir mkdir /tmp/bla 0.285 (0.020 ms): mkdir/2799 access(filename: /etc/ld.so.preload, mode: R ) = -1 ENOENT No such file or directory 1.070 (0.032 ms): mkdir/2799 statfs(pathname: /sys/fs/selinux, buf: 0x7ffeafbdc930) = 0 1.087 (0.013 ms): mkdir/2799 statfs(pathname: /sys/fs/selinux, buf: 0x7ffeafbdc820) = 0 1.189 (0.014 ms): mkdir/2799 access(filename: /etc/selinux/config ) = 0 1.905 (0.610 ms): mkdir/2799 mkdir(pathname: /tmp/bla, mode: 511 ) = 0 # Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-wbqtnlktquun3wtpjdz3okul@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 0255dd8..98d423e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -976,14 +976,23 @@ static struct syscall_fmt { bool hexret; } syscall_fmts[] = { { .name = "access", .errmsg = true, - .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, }, + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ + [1] = SCA_ACCMODE, /* mode */ }, }, { .name = "arch_prctl", .errmsg = true, .alias = "prctl", }, { .name = "brk", .hexret = true, .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, }, + { .name = "chdir", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, + { .name = "chmod", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, + { .name = "chroot", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), }, { .name = "close", .errmsg = true, .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, }, { .name = "connect", .errmsg = true, }, + { .name = "creat", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "dup", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "dup2", .errmsg = true, @@ -994,7 +1003,8 @@ static struct syscall_fmt { { .name = "eventfd2", .errmsg = true, .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, }, { .name = "faccessat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ + [1] = SCA_FILENAME, /* filename */ }, }, { .name = "fadvise64", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "fallocate", .errmsg = true, @@ -1008,7 +1018,8 @@ static struct syscall_fmt { { .name = "fchown",
.errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "fchownat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ + [1] = SCA_FILENAME, /* filename */ }, }, { .name = "fcntl", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ [1] = SCA_STRARRAY, /* cmd */ }, @@ -1023,7 +1034,8 @@ static struct syscall_fmt { { .name = "fstat", .errmsg = true, .alias = "newfstat", .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "fstatat", .errmsg = true, .alias = "newfstatat", - .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ + [1] = SCA_FILENAME, /* filename */ }, }, { .name = "fstatfs", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "fsync", .errmsg = true, @@ -1040,6 +1052,10 @@ static struct syscall_fmt { .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), }, { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, + { .name = "getxattr", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, + { .name = "inotify_add_watch", .errmsg = true, + .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, }, { .name = "ioctl", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ #if defined(__i386__) || defined(__x86_64__) @@ -1054,18 +1070,33 @@ static struct syscall_fmt { #endif { .name = "kill", .errmsg = true, .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, + { .name = "lchown", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, + { .name = "lgetxattr", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "linkat", .errmsg = true, .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, + { .name = "listxattr", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "lseek", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ [2] = SCA_STRARRAY, /* whence */ }, .arg_parm = { [2] = &strarray__whences, /* whence */ }, }, + { .name = "lsetxattr", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "lstat", .errmsg = true, .alias = "newlstat", }, + { .name = "lsxattr", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "madvise", .errmsg = true, .arg_scnprintf = { [0] = SCA_HEX, /* start */ [2] = SCA_MADV_BHV, /* behavior */ }, }, + { .name = "mkdir", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "mkdirat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ + [1] = SCA_FILENAME, /* pathname */ }, }, + { .name = "mknod", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "mknodat", .errmsg = true, .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, { .name = "mlock", .errmsg = true, @@ -1091,7 +1122,8 @@ static struct syscall_fmt { { .name = "name_to_handle_at", .errmsg = true, .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, { .name = "newfstatat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ + [1] = SCA_FILENAME, /* filename */ }, }, { .name = "open", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ [1] = SCA_OPEN_FLAGS, /* flags */ }, }, @@ -1100,6 +1132,7 @@ static struct syscall_fmt { [2] = SCA_OPEN_FLAGS, /* flags 
*/ }, }, { .name = "openat", .errmsg = true, .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ + [1] = SCA_FILENAME, /* filename */ [2] = SCA_OPEN_FLAGS, /* flags */ }, }, { .name = "perf_event_open", .errmsg = true, .arg_scnprintf = { [1] = SCA_INT, /* pid */ @@ -1121,8 +1154,11 @@ static struct syscall_fmt { .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "read", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, + { .name = "readlink", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, }, { .name = "readlinkat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ + [1] = SCA_FILENAME, /* pathname */ }, }, { .name = "readv", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "recvfrom", .errmsg = true, @@ -1131,8 +1167,12 @@ static struct syscall_fmt { .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, { .name = "recvmsg", .errmsg = true, .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, }, + { .name = "removexattr", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "renameat", .errmsg = true, .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, + { .name = "rmdir", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "rt_sigaction", .errmsg = true, .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, }, { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), }, @@ -1149,6 +1189,8 @@ static struct syscall_fmt { .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), }, { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, + { .name = "setxattr", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "shutdown", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "socket", .errmsg = true, @@ -1160,17 +1202,31 @@ static struct syscall_fmt { [1] = SCA_SK_TYPE, /* type */ }, .arg_parm = { [0] = &strarray__socket_families, /* family */ }, }, { .name = "stat", .errmsg = true, .alias = "newstat", }, + { .name = "statfs", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, + { .name = "swapoff", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, }, + { .name = "swapon", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, }, { .name = "symlinkat", .errmsg = true, .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, { .name = "tgkill", .errmsg = true, .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, { .name = "tkill", .errmsg = true, .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, + { .name = "truncate", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, }, { .name = "uname", .errmsg = true, .alias = "newuname", }, { .name = "unlinkat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ + [1] = SCA_FILENAME, /* pathname */ }, }, + { .name = "utime", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "utimensat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ + [1] = SCA_FILENAME, /* filename */ }, }, + { .name = "utimes", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "write", .errmsg = true, .arg_scnprintf = { [0] 
= SCA_FD, /* fd */ }, }, { .name = "writev", .errmsg = true, -- cgit v0.10.2 From 320677123905fd1dd122895cd5fb870ee9e1380b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 4 Aug 2015 04:30:19 -0400 Subject: perf tools: Per-event time support This patchkit adds the ability to turn off time stamps per event. One useful case for partial time is to work with per-event callgraphs to enable "PEBS threshold > 1" (https://lkml.org/lkml/2015/5/10/196), which can significantly reduce the sampling overhead. Note that event samples with time stamps off will not be ordered. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1438677022-34296-2-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index ac41350..0d852d1 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -49,7 +49,9 @@ OPTIONS These params can be used to overload default config values per event. Here is a list of the params. - 'period': Set event sampling period - + - 'time': Disable/enable time stamping. Acceptable values are 1 for + enabling time stamping and 0 for disabling it. + The default is 1. Note: If user explicitly sets options which conflict with the params, the value set by the params will be overridden. diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 7d3acba..7febfe2 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -587,15 +587,23 @@ perf_evsel__config_callgraph(struct perf_evsel *evsel, } } -static void apply_config_terms(struct perf_event_attr *attr __maybe_unused, - struct list_head *config_terms) +static void apply_config_terms(struct perf_evsel *evsel) { struct perf_evsel_config_term *term; + struct list_head *config_terms = &evsel->config_terms; + struct perf_event_attr *attr = &evsel->attr; list_for_each_entry(term, config_terms, list) { switch (term->type) { case PERF_EVSEL__CONFIG_TERM_PERIOD: attr->sample_period = term->val.period; + break; + case PERF_EVSEL__CONFIG_TERM_TIME: + if (term->val.time) + perf_evsel__set_sample_bit(evsel, TIME); + else + perf_evsel__reset_sample_bit(evsel, TIME); + break; default: break; } @@ -798,7 +806,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) * Apply event specific term settings, * it overloads any global configuration.
*/ - apply_config_terms(attr, &evsel->config_terms); + apply_config_terms(evsel); } static int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a7d2175..6a12908 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -39,6 +39,7 @@ struct cgroup_sel; */ enum { PERF_EVSEL__CONFIG_TERM_PERIOD, + PERF_EVSEL__CONFIG_TERM_TIME, PERF_EVSEL__CONFIG_TERM_MAX, }; @@ -47,6 +48,7 @@ struct perf_evsel_config_term { int type; union { u64 period; + bool time; } val; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 09bee93..a6cb9af 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -603,6 +603,14 @@ do { \ * attr->branch_sample_type = term->val.num; */ break; + case PARSE_EVENTS__TERM_TYPE_TIME: + CHECK_TYPE_VAL(NUM); + if (term->val.num > 1) { + err->str = strdup("expected 0 or 1"); + err->idx = term->err_val; + return -EINVAL; + } + break; case PARSE_EVENTS__TERM_TYPE_NAME: CHECK_TYPE_VAL(STR); break; @@ -650,6 +658,10 @@ do { \ switch (term->type_term) { case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: ADD_CONFIG_TERM(PERIOD, period, term->val.num); + break; + case PARSE_EVENTS__TERM_TYPE_TIME: + ADD_CONFIG_TERM(TIME, time, term->val.num); + break; default: break; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 2063048..e6f9aacc 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -63,6 +63,7 @@ enum { PARSE_EVENTS__TERM_TYPE_NAME, PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE, + PARSE_EVENTS__TERM_TYPE_TIME, }; struct parse_events_term { diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 13cef3c..f542750 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -183,6 +183,7 @@ config2 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG2); } name { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NAME); } period { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD); } branch_type { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE); } +time { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_TIME); } , { return ','; } "/" { BEGIN(INITIAL); return '/'; } {name_minus} { return str(yyscanner, PE_NAME); } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 7bcb8c3..b615cdf 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -607,7 +607,7 @@ static char *formats_error_string(struct list_head *formats) { struct perf_pmu_format *format; char *err, *str; - static const char *static_terms = "config,config1,config2,name,period,branch_type\n"; + static const char *static_terms = "config,config1,config2,name,period,branch_type,time\n"; unsigned i = 0; if (!asprintf(&str, "valid terms:")) -- cgit v0.10.2 From c3a6a8c40538f609923acf9473250266283269a5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 4 Aug 2015 04:30:20 -0400 Subject: perf tools: Refine parse/config callchain functions Pass global callchain_param into parse_callchain_record_opt and perf_evsel__config_callgraph as parameter. So we can reuse these functions to parse/config local param for callchain. 
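The refactoring pattern, reduced to a sketch (illustrative names): a function that used to write into a global now takes the destination as a parameter, and the old entry point becomes a thin wrapper, so the same parser can later fill a per-event copy.

struct callchain_cfg {
	int record_mode;
	unsigned long dump_size;
};

static struct callchain_cfg global_cfg;	/* stands in for callchain_param */

static int parse_cfg(const char *arg, struct callchain_cfg *cfg)
{
	(void)arg;		/* parsing elided in this sketch */
	cfg->record_mode = 0;
	return 0;
}

static int parse_global(const char *arg)
{
	return parse_cfg(arg, &global_cfg);	/* old call sites unchanged */
}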
Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1438677022-34296-3-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f51131b..25cf6b4 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -779,7 +779,7 @@ int record_parse_callchain_opt(const struct option *opt, return 0; } - ret = parse_callchain_record_opt(arg); + ret = parse_callchain_record_opt(arg, &callchain_param); if (!ret) callchain_debug(); diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 9f643ee..931cca8 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -53,7 +53,7 @@ static int get_stack_size(const char *str, unsigned long *_size) } #endif /* HAVE_DWARF_UNWIND_SUPPORT */ -int parse_callchain_record_opt(const char *arg) +int parse_callchain_record_opt(const char *arg, struct callchain_param *param) { char *tok, *name, *saveptr = NULL; char *buf; @@ -73,7 +73,7 @@ int parse_callchain_record_opt(const char *arg) /* Framepointer style */ if (!strncmp(name, "fp", sizeof("fp"))) { if (!strtok_r(NULL, ",", &saveptr)) { - callchain_param.record_mode = CALLCHAIN_FP; + param->record_mode = CALLCHAIN_FP; ret = 0; } else pr_err("callchain: No more arguments " @@ -86,20 +86,20 @@ int parse_callchain_record_opt(const char *arg) const unsigned long default_stack_dump_size = 8192; ret = 0; - callchain_param.record_mode = CALLCHAIN_DWARF; - callchain_param.dump_size = default_stack_dump_size; + param->record_mode = CALLCHAIN_DWARF; + param->dump_size = default_stack_dump_size; tok = strtok_r(NULL, ",", &saveptr); if (tok) { unsigned long size = 0; ret = get_stack_size(tok, &size); - callchain_param.dump_size = size; + param->dump_size = size; } #endif /* HAVE_DWARF_UNWIND_SUPPORT */ } else if (!strncmp(name, "lbr", sizeof("lbr"))) { if (!strtok_r(NULL, ",", &saveptr)) { - callchain_param.record_mode = CALLCHAIN_LBR; + param->record_mode = CALLCHAIN_LBR; ret = 0; } else pr_err("callchain: No more arguments " @@ -219,7 +219,7 @@ int perf_callchain_config(const char *var, const char *value) var += sizeof("call-graph.") - 1; if (!strcmp(var, "record-mode")) - return parse_callchain_record_opt(value); + return parse_callchain_record_opt(value, &callchain_param); #ifdef HAVE_DWARF_UNWIND_SUPPORT if (!strcmp(var, "dump-size")) { unsigned long size = 0; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 679c2c6..68a32c2 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -177,7 +177,7 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node * bool hide_unresolved); extern const char record_callchain_help[]; -int parse_callchain_record_opt(const char *arg); +int parse_callchain_record_opt(const char *arg, struct callchain_param *param); int parse_callchain_report_opt(const char *arg); int perf_callchain_config(const char *var, const char *value); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 7febfe2..f572f46 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -545,14 +545,15 @@ int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size) static void perf_evsel__config_callgraph(struct perf_evsel *evsel, - struct record_opts *opts) + struct record_opts *opts, + struct callchain_param *param) { bool function = perf_evsel__is_function_event(evsel); struct perf_event_attr *attr = 
&evsel->attr; perf_evsel__set_sample_bit(evsel, CALLCHAIN); - if (callchain_param.record_mode == CALLCHAIN_LBR) { + if (param->record_mode == CALLCHAIN_LBR) { if (!opts->branch_stack) { if (attr->exclude_user) { pr_warning("LBR callstack option is only available " @@ -568,12 +569,12 @@ perf_evsel__config_callgraph(struct perf_evsel *evsel, "Falling back to framepointers.\n"); } - if (callchain_param.record_mode == CALLCHAIN_DWARF) { + if (param->record_mode == CALLCHAIN_DWARF) { if (!function) { perf_evsel__set_sample_bit(evsel, REGS_USER); perf_evsel__set_sample_bit(evsel, STACK_USER); attr->sample_regs_user = PERF_REGS_MASK; - attr->sample_stack_user = callchain_param.dump_size; + attr->sample_stack_user = param->dump_size; attr->exclude_callchain_user = 1; } else { pr_info("Cannot use DWARF unwind for function trace event," @@ -714,7 +715,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) evsel->attr.exclude_callchain_user = 1; if (callchain_param.enabled && !evsel->no_aux_samples) - perf_evsel__config_callgraph(evsel, opts); + perf_evsel__config_callgraph(evsel, opts, &callchain_param); if (opts->sample_intr_regs) { attr->sample_regs_intr = PERF_REGS_MASK; -- cgit v0.10.2 From 8011de7ab3b10c5352f3f0708f517de2722b0957 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 3 Aug 2015 17:50:01 -0700 Subject: perf tools: Remove trail argument to color vsprintf Seems like it's always '\n' through color_fprintf_ln, which is not used at all, removing.. ;-) Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1438649408-20807-2-git-send-email-andi@firstfloor.org Signed-off-by: Andi Kleen Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c index 55355b3..ff051d2 100644 --- a/tools/perf/util/color.c +++ b/tools/perf/util/color.c @@ -68,7 +68,7 @@ static int __color_vsnprintf(char *bf, size_t size, const char *color, } static int __color_vfprintf(FILE *fp, const char *color, const char *fmt, - va_list args, const char *trail) + va_list args) { int r = 0; @@ -87,8 +87,6 @@ static int __color_vfprintf(FILE *fp, const char *color, const char *fmt, r += vfprintf(fp, fmt, args); if (perf_use_color_default && *color) r += fprintf(fp, "%s", PERF_COLOR_RESET); - if (trail) - r += fprintf(fp, "%s", trail); return r; } @@ -100,7 +98,7 @@ int color_vsnprintf(char *bf, size_t size, const char *color, int color_vfprintf(FILE *fp, const char *color, const char *fmt, va_list args) { - return __color_vfprintf(fp, color, fmt, args, NULL); + return __color_vfprintf(fp, color, fmt, args); } int color_snprintf(char *bf, size_t size, const char *color, @@ -126,16 +124,6 @@ int color_fprintf(FILE *fp, const char *color, const char *fmt, ...) return r; } -int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...) -{ - va_list args; - int r; - va_start(args, fmt); - r = __color_vfprintf(fp, color, fmt, args, "\n"); - va_end(args); - return r; -} - /* * This function splits the buffer by newlines and colors the lines individually. 
* diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h index 38146f9..a93997f 100644 --- a/tools/perf/util/color.h +++ b/tools/perf/util/color.h @@ -35,7 +35,6 @@ int color_vsnprintf(char *bf, size_t size, const char *color, int color_vfprintf(FILE *fp, const char *color, const char *fmt, va_list args); int color_fprintf(FILE *fp, const char *color, const char *fmt, ...); int color_snprintf(char *bf, size_t size, const char *color, const char *fmt, ...); -int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...); int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf); int value_color_snprintf(char *bf, size_t size, const char *fmt, double value); int percent_color_snprintf(char *bf, size_t size, const char *fmt, ...); -- cgit v0.10.2 From b7a001d2067830a98e65d1bbbf99a6d435d70616 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 3 Aug 2015 17:50:02 -0700 Subject: perf tools: Do not include escape sequences in color_vfprintf return color_vprintf was including the length of the invisible escape sequences in its return argument. Don't include them to make the return value usable for indentation calculations. v2: Add comment, rebase Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1438649408-20807-3-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c index ff051d2..9b95654 100644 --- a/tools/perf/util/color.c +++ b/tools/perf/util/color.c @@ -67,6 +67,7 @@ static int __color_vsnprintf(char *bf, size_t size, const char *color, return r; } +/* Colors are not included in return value */ static int __color_vfprintf(FILE *fp, const char *color, const char *fmt, va_list args) { @@ -83,10 +84,10 @@ static int __color_vfprintf(FILE *fp, const char *color, const char *fmt, } if (perf_use_color_default && *color) - r += fprintf(fp, "%s", color); + fprintf(fp, "%s", color); r += vfprintf(fp, fmt, args); if (perf_use_color_default && *color) - r += fprintf(fp, "%s", PERF_COLOR_RESET); + fprintf(fp, "%s", PERF_COLOR_RESET); return r; } -- cgit v0.10.2 From 007d66a0bd43d886eb3e4aceaf1a96b8743ccaff Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Wed, 5 Aug 2015 16:52:23 -0300 Subject: perf trace: Write to stderr by default Without this patch, it is cumbersome to read the trace output but ignoring the normal, potentially verbose, output of the debuggee. One common example is doing something like the following: perf trace -s find /tmp > /dev/null Without this patch, the trace summary will be lost. Now, it will still be printed at the end. This behavior is also applied by strace. 
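For example (assuming a POSIX shell), the two streams can now be separated:

	perf trace -s find /tmp > /dev/null    # find's output dropped, summary kept
	perf trace -s find /tmp 2> trace.txt   # trace output captured, find's output on stdout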
Cc: Milian Wolff Cc: David Ahern Link: http://lkml.kernel.org/n/tip-tqnks6y2cnvm5f9g2dsfr7zl@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 98d423e..a474970 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2965,7 +2965,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) .mmap_pages = UINT_MAX, .proc_map_timeout = 500, }, - .output = stdout, + .output = stderr, .show_comm = true, .trace_syscalls = true, }; -- cgit v0.10.2 From f151f53aa4f54a647353e1935e4c6cef7f094dd4 Mon Sep 17 00:00:00 2001 From: Petri Gynther Date: Tue, 4 Aug 2015 17:38:01 -0700 Subject: perf tools: Fix build errors with mipsel-linux-uclibc compiler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit linux/tools$ make ARCH=mips CROSS_COMPILE=mipsel-linux- perf ... config/Makefile:256: *** No gnu/libc-version.h found, please install glibc-dev[el]. Stop. make[1]: *** [all] Error 2 make: *** [perf] Error 2 ... In file included from builtin-sched.c:13:0: util/cloexec.h:8:12: error: redundant redeclaration of ‘sched_getcpu’ [-Werror=redundant-decls] extern int sched_getcpu(void) __THROW; mipsel-buildroot-linux-uclibc/sysroot/usr/include/bits/sched.h:88:12: note: previous declaration of ‘sched_getcpu’ was here extern int sched_getcpu (void) __THROW; uclibc info: sysroot/usr/include/bits/uClibc_config.h __UCLIBC_MAJOR__ 0 __UCLIBC_MINOR__ 9 __UCLIBC_SUBLEVEL__ 33 sysroot/usr/include/features.h __UCLIBC__ 1 __GLIBC__ 2 __GLIBC_MINOR__ 2 Signed-off-by: Petri Gynther Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1438735081-24131-1-git-send-email-pgynther@google.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/feature/test-glibc.c b/tools/build/feature/test-glibc.c index b082034..9367f758 100644 --- a/tools/build/feature/test-glibc.c +++ b/tools/build/feature/test-glibc.c @@ -1,8 +1,19 @@ +#include + +#if !defined(__UCLIBC__) #include +#else +#define XSTR(s) STR(s) +#define STR(s) #s +#endif int main(void) { +#if !defined(__UCLIBC__) const char *version = gnu_get_libc_version(); +#else + const char *version = XSTR(__GLIBC__) "." XSTR(__GLIBC_MINOR__); +#endif return (long)version; } diff --git a/tools/perf/util/cloexec.h b/tools/perf/util/cloexec.h index 68888c2..3bee677 100644 --- a/tools/perf/util/cloexec.h +++ b/tools/perf/util/cloexec.h @@ -4,7 +4,7 @@ unsigned long perf_event_open_cloexec_flag(void); #ifdef __GLIBC_PREREQ -#if !__GLIBC_PREREQ(2, 6) +#if !__GLIBC_PREREQ(2, 6) && !defined(__UCLIBC__) extern int sched_getcpu(void) __THROW; #endif #endif -- cgit v0.10.2 From 834fd46ddb50953cf3fd9caa3f35485715c62ea3 Mon Sep 17 00:00:00 2001 From: Milian Wolff Date: Thu, 6 Aug 2015 11:24:29 +0200 Subject: perf trace: Add total time column to summary. It is cumbersome to manually calculate the total time spent in a given syscall by multiplying the average value with the number of calls. Instead, we now do this directly inside perf trace. Note that this is also done by 'strace', which even adds a column with relative numbers - something we could do in the future. 
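The new column is simply calls * avg. As a rough sanity check against the getdents row in the example below:

	total = calls * avg
	      = 68393 * 0.0021 msec
	      ~ 143.600 msec	(the table rounds avg down to 0.002)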
Example: perf trace -s find /some/folder > /dev/null Summary of events: find (19976), 700123 events, 100.0%, 0.000 msec syscall calls total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- --------- --------- --------- --------- ------ read 4 0.006 0.001 0.002 0.003 27.42% write 8046 9.617 0.001 0.001 0.035 0.56% open 34196 40.384 0.001 0.001 0.071 0.30% close 68375 57.104 0.001 0.001 0.076 0.25% stat 4 0.004 0.001 0.001 0.001 3.14% fstat 34189 27.518 0.001 0.001 0.060 0.34% mmap 13 0.029 0.001 0.002 0.003 10.74% mprotect 6 0.018 0.002 0.003 0.005 17.04% munmap 3 0.014 0.003 0.005 0.006 24.87% brk 87 0.490 0.001 0.006 0.016 6.50% ioctl 3 0.004 0.001 0.001 0.003 36.39% access 1 0.004 0.004 0.004 0.004 0.00% uname 1 0.001 0.001 0.001 0.001 0.00% getdents 68393 143.600 0.001 0.002 0.187 0.95% fchdir 68371 56.980 0.001 0.001 0.111 0.39% arch_prctl 1 0.001 0.001 0.001 0.001 0.00% openat 34184 41.737 0.001 0.001 0.102 0.41% newfstatat 34184 41.180 0.001 0.001 0.064 0.34% Signed-off-by: Milian Wolff Tested-by: Arnaldo Carvalho de Melo LPU-Reference: 1438853069-5902-1-git-send-email-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index a474970..a25048c 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2773,9 +2773,9 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, printed += fprintf(fp, "\n"); - printed += fprintf(fp, " syscall calls min avg max stddev\n"); - printed += fprintf(fp, " (msec) (msec) (msec) (%%)\n"); - printed += fprintf(fp, " --------------- -------- --------- --------- --------- ------\n"); + printed += fprintf(fp, " syscall calls total min avg max stddev\n"); + printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); + printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n"); /* each int_node is a syscall */ while (inode) { @@ -2792,8 +2792,8 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, sc = &trace->syscalls.table[inode->i]; printed += fprintf(fp, " %-15s", sc->name); - printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f", - n, min, avg); + printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f", + n, avg * n, min, avg); printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); } -- cgit v0.10.2 From 098d2164e3441c252eaa28906d45e16b7bf1bd2b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:49 +0000 Subject: bpf: Use correct #ifdef controller for trace_call_bpf() Commit e1abf2cc8d5d80b41c4419368ec743ccadbb131e ("bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING kernels, make it more configurable") updated the building condition of bpf_trace.o from CONFIG_BPF_SYSCALL to CONFIG_BPF_EVENTS, but the corresponding #ifdef controller in trace_events.h for trace_call_bpf() was not changed. Which, in theory, is incorrect. With current Kconfigs, we can create a .config with CONFIG_BPF_SYSCALL=y and CONFIG_BPF_EVENTS=n by unselecting CONFIG_KPROBE_EVENT and selecting CONFIG_BPF_SYSCALL. With these options, trace_call_bpf() will be defined as an extern function, but if anyone calls it a symbol missing error will be triggered since bpf_trace.o was not built. This patch changes the #ifdef controller for trace_call_bpf() from CONFIG_BPF_SYSCALL to CONFIG_BPF_EVENTS. 
I'll show its correctness: Before this patch: BPF_SYSCALL BPF_EVENTS trace_call_bpf bpf_trace.o y y normal compiled n n inline not compiled y n normal not compiled (incorrect) n y impossible (BPF_EVENTS depends on BPF_SYSCALL) After this patch: BPF_SYSCALL BPF_EVENTS trace_call_bpf bpf_trace.o y y normal compiled n n inline not compiled y n inline not compiled (fixed) n y impossible (BPF_EVENTS depends on BPF_SYSCALL) So this patch doesn't break anything. QED. Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 1063c85..180dbf8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -542,7 +542,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file, event_triggers_post_call(file, tt); } -#ifdef CONFIG_BPF_SYSCALL +#ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); #else static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) -- cgit v0.10.2 From 04a22fae4cbc1f7d3f7471e9b36359f98bd3f043 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:50 +0000 Subject: tracing, perf: Implement BPF programs attached to uprobes By copying BPF related operation to uprobe processing path, this patch allow users attach BPF programs to uprobes like what they are already doing on kprobes. After this patch, users are allowed to use PERF_EVENT_IOC_SET_BPF on a uprobe perf event. Which make it possible to profile user space programs and kernel events together using BPF. Because of this patch, CONFIG_BPF_EVENTS should be selected by CONFIG_UPROBE_EVENT to ensure trace_call_bpf() is compiled even if KPROBE_EVENT is not set. 
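A minimal userspace sketch of the sequence this enables -- both fds are assumed to have been set up elsewhere, the uprobe perf event via perf_event_open() and the program via bpf(BPF_PROG_LOAD) (BPF_PROG_TYPE_KPROBE covers uprobes as well):

	#include <sys/ioctl.h>
	#include <linux/perf_event.h>

	/* before this patch the TRACE_EVENT_FL_KPROBE-only check
	 * made this fail with -EINVAL for uprobe events */
	if (ioctl(uprobe_event_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd) < 0)
		perror("PERF_EVENT_IOC_SET_BPF");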
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 180dbf8..ed27917 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -243,6 +243,7 @@ enum { TRACE_EVENT_FL_USE_CALL_FILTER_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, TRACE_EVENT_FL_KPROBE_BIT, + TRACE_EVENT_FL_UPROBE_BIT, }; /* @@ -257,6 +258,7 @@ enum { * USE_CALL_FILTER - For trace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint * KPROBE - Event is a kprobe + * UPROBE - Event is a uprobe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -267,8 +269,11 @@ enum { TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), + TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), }; +#define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) + struct trace_event_call { struct list_head list; struct trace_event_class *class; diff --git a/kernel/events/core.c b/kernel/events/core.c index bdea129..77f9e5d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6846,8 +6846,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) - /* bpf programs can only be attached to kprobes */ + if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) + /* bpf programs can only be attached to u/kprobes */ return -EINVAL; prog = bpf_prog_get(prog_fd); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3b9a48a..1153c43 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT + depends on KPROBE_EVENT || UPROBE_EVENT bool default y help diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index aa1ea7b..f97479f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1095,11 +1095,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; + struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; @@ -1289,6 +1293,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } + call->flags = TRACE_EVENT_FL_UPROBE; call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); -- cgit v0.10.2 From 0af0885ef69c182d1fa6bb201cd0570e9aa384eb Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 31 Jul 2015 10:35:33 -0300 Subject: perf tools: Introduce veprintf va_args alternative to eprintf(). 
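A va_list entry point lets new varargs helpers reuse the debug machinery without rebuilding it; a hypothetical wrapper:

	/* hypothetical helper built on the new veprintf() */
	static int debug_warn(const char *fmt, ...)
	{
		va_list args;
		int ret;

		va_start(args, fmt);
		ret = veprintf(1, verbose, fmt, args);	/* level 1, gated on 'verbose' */
		va_end(args);
		return ret;
	}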
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/n/1436445342-1402-19-git-send-email-wangnan0@huawei.com [ split from another patch ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index 2da5581..86d9c73 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -36,6 +36,11 @@ static int _eprintf(int level, int var, const char *fmt, va_list args) return ret; } +int veprintf(int level, int var, const char *fmt, va_list args) +{ + return _eprintf(level, var, fmt, args); +} + int eprintf(int level, int var, const char *fmt, ...) { va_list args; diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index caac2fd..8b9a088 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -50,6 +50,7 @@ void pr_stat(const char *fmt, ...); int eprintf(int level, int var, const char *fmt, ...) __attribute__((format(printf, 3, 4))); int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__((format(printf, 4, 5))); +int veprintf(int level, int var, const char *fmt, va_list args); int perf_debug_option(const char *str); -- cgit v0.10.2 From 5a023b57a8e96327925a39312bccc443a7c540b6 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 19 Jun 2015 08:42:48 +0000 Subject: perf tools: Add missing forward declaration of struct map to probe-event.h Commit 7b6ff0bdbf4f7f429c2116cca92a6d171217449e ("perf probe ppc64le: Fixup function entry if using kallsyms lookup") adds 'struct map' into probe-event.h but not forward declares it. This patch fixes it. Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Fixes: 7b6ff0bdbf4f ("perf probe ppc64le: Fixup function entry if using kallsyms lookup") Link: http://lkml.kernel.org/n/1436445342-1402-30-git-send-email-wangnan0@huawei.com [ No need to include map.h, just forward declare 'struct map' ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 20f555d..83ee95e 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -106,6 +106,8 @@ struct variable_list { struct strlist *vars; /* Available variables */ }; +struct map; + /* Command string to events */ extern int parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev); -- cgit v0.10.2 From 421a50f3fafaf271bb3293378eaafca71337dfec Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:22 +0200 Subject: perf stat: Introduce struct perf_stat_config Moving 'aggr_mode' into new struct. The point is to centralize the base stat config so it could be used localy together with other stat routines in other parts of perf code. 
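A sketch of the intended end state -- a hypothetical caller outside builtin-stat.c building a local config instead of poking file-scope globals (AGGR_THREAD is just an illustrative choice; later patches in this series grow the struct and pass &config into the stat routines):

	struct perf_stat_config config = {
		.aggr_mode = AGGR_THREAD,
	};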
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d99d850..bafb830 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -102,7 +102,6 @@ static struct target target = { static int run_count = 1; static bool no_inherit = false; static bool scale = true; -static enum aggr_mode aggr_mode = AGGR_GLOBAL; static volatile pid_t child_pid = -1; static bool null_run = false; static int detailed_run = 0; @@ -126,6 +125,10 @@ static int (*aggr_get_id)(struct cpu_map *m, int cpu); static volatile int done = 0; +static struct perf_stat_config stat_config = { + .aggr_mode = AGGR_GLOBAL, +}; + static inline void diff_timespec(struct timespec *r, struct timespec *a, struct timespec *b) { @@ -230,7 +233,7 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread, if (skip) count = &zero; - switch (aggr_mode) { + switch (stat_config.aggr_mode) { case AGGR_THREAD: case AGGR_CORE: case AGGR_SOCKET: @@ -238,7 +241,7 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread, if (!evsel->snapshot) perf_evsel__compute_deltas(evsel, cpu, thread, count); perf_counts_values__scale(count, scale, NULL); - if (aggr_mode == AGGR_NONE) + if (stat_config.aggr_mode == AGGR_NONE) perf_stat__update_shadow_stats(evsel, count->values, cpu); break; case AGGR_GLOBAL: @@ -291,7 +294,7 @@ static int process_counter(struct perf_evsel *counter) if (ret) return ret; - if (aggr_mode != AGGR_GLOBAL) + if (stat_config.aggr_mode != AGGR_GLOBAL) return 0; if (!counter->snapshot) @@ -578,7 +581,7 @@ static void print_noise(struct perf_evsel *evsel, double avg) static void aggr_printout(struct perf_evsel *evsel, int id, int nr) { - switch (aggr_mode) { + switch (stat_config.aggr_mode) { case AGGR_CORE: fprintf(output, "S%d-C%*d%s%*d%s", cpu_map__id_to_socket(id), @@ -670,7 +673,7 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) aggr_printout(evsel, id, nr); - if (aggr_mode == AGGR_GLOBAL) + if (stat_config.aggr_mode == AGGR_GLOBAL) cpu = 0; fprintf(output, fmt, avg, csv_sep); @@ -688,7 +691,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) if (csv_output || interval) return; - perf_stat__print_shadow_stats(output, evsel, avg, cpu, aggr_mode); + perf_stat__print_shadow_stats(output, evsel, avg, cpu, + stat_config.aggr_mode); } static void print_aggr(char *prefix) @@ -909,7 +913,7 @@ static void print_interval(char *prefix, struct timespec *ts) sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep); if (num_print_interval == 0 && !csv_output) { - switch (aggr_mode) { + switch (stat_config.aggr_mode) { case AGGR_SOCKET: fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit"); break; @@ -985,7 +989,7 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) else print_header(argc, argv); - switch (aggr_mode) { + switch (stat_config.aggr_mode) { case AGGR_CORE: case AGGR_SOCKET: print_aggr(prefix); @@ -1064,7 +1068,7 @@ static int stat__set_big_num(const struct option *opt __maybe_unused, static int perf_stat_init_aggr_mode(void) { - switch (aggr_mode) { + switch (stat_config.aggr_mode) { case AGGR_SOCKET: if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) { perror("cannot build socket map"); @@ -1286,7 +1290,7 @@ int cmd_stat(int argc, const char 
**argv, const char *prefix __maybe_unused) stat__set_big_num), OPT_STRING('C', "cpu", &target.cpu_list, "cpu", "list of cpus to monitor in system-wide"), - OPT_SET_UINT('A', "no-aggr", &aggr_mode, + OPT_SET_UINT('A', "no-aggr", &stat_config.aggr_mode, "disable CPU count aggregation", AGGR_NONE), OPT_STRING('x', "field-separator", &csv_sep, "separator", "print counts with custom separator"), @@ -1302,11 +1306,11 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) "command to run after to the measured command"), OPT_UINTEGER('I', "interval-print", &interval, "print counts at regular interval in ms (>= 100)"), - OPT_SET_UINT(0, "per-socket", &aggr_mode, + OPT_SET_UINT(0, "per-socket", &stat_config.aggr_mode, "aggregate counts per processor socket", AGGR_SOCKET), - OPT_SET_UINT(0, "per-core", &aggr_mode, + OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode, "aggregate counts per physical processor core", AGGR_CORE), - OPT_SET_UINT(0, "per-thread", &aggr_mode, + OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode, "aggregate counts per thread", AGGR_THREAD), OPT_UINTEGER('D', "delay", &initial_delay, "ms to wait before starting measurement after program start"), @@ -1399,7 +1403,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) run_count = 1; } - if ((aggr_mode == AGGR_THREAD) && !target__has_task(&target)) { + if ((stat_config.aggr_mode == AGGR_THREAD) && !target__has_task(&target)) { fprintf(stderr, "The --per-thread option is only available " "when monitoring via -p -t options.\n"); parse_options_usage(NULL, options, "p", 1); @@ -1411,7 +1415,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) * no_aggr, cgroup are for system-wide only * --per-thread is aggregated per thread, we dont mix it with cpu mode */ - if (((aggr_mode != AGGR_GLOBAL && aggr_mode != AGGR_THREAD) || nr_cgroups) && + if (((stat_config.aggr_mode != AGGR_GLOBAL && + stat_config.aggr_mode != AGGR_THREAD) || nr_cgroups) && !target__has_cpu(&target)) { fprintf(stderr, "both cgroup and no-aggregation " "modes only available in system-wide mode\n"); @@ -1444,7 +1449,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) * Initialize thread_map with comm names, * so we could print it out on output. */ - if (aggr_mode == AGGR_THREAD) + if (stat_config.aggr_mode == AGGR_THREAD) thread_map__read_comms(evsel_list->threads); if (interval && interval < 100) { diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 1cfbe0a..078bee4 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -50,6 +50,10 @@ struct perf_counts { struct xyarray *values; }; +struct perf_stat_config { + enum aggr_mode aggr_mode; +}; + static inline struct perf_counts_values* perf_counts(struct perf_counts *counts, int cpu, int thread) { -- cgit v0.10.2 From 711a572ea8ae7e9ab6575403c6d632d058d5cb3d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:23 +0200 Subject: perf stat: Move 'scale' into struct perf_stat_config Moving 'scale' into struct perf_stat_config. The point is to centralize the base stat config so it could be used localy together with other stat routines in other parts of perf code. 
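For reference, 'scale' gates the multiplexing normalization done in perf_counts_values__scale(); simplified from that implementation:

	if (count->run == 0) {
		count->val = 0;			/* never scheduled in */
	} else if (count->run < count->ena) {
		/* extrapolate: val * time_enabled / time_running */
		count->val = (u64)((double)count->val *
				   count->ena / count->run + 0.5);
	}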
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index bafb830..3fb2865 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -101,7 +101,6 @@ static struct target target = { static int run_count = 1; static bool no_inherit = false; -static bool scale = true; static volatile pid_t child_pid = -1; static bool null_run = false; static int detailed_run = 0; @@ -127,6 +126,7 @@ static volatile int done = 0; static struct perf_stat_config stat_config = { .aggr_mode = AGGR_GLOBAL, + .scale = true, }; static inline void diff_timespec(struct timespec *r, struct timespec *a, @@ -151,7 +151,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) { struct perf_event_attr *attr = &evsel->attr; - if (scale) + if (stat_config.scale) attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; @@ -240,13 +240,13 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread, case AGGR_NONE: if (!evsel->snapshot) perf_evsel__compute_deltas(evsel, cpu, thread, count); - perf_counts_values__scale(count, scale, NULL); + perf_counts_values__scale(count, stat_config.scale, NULL); if (stat_config.aggr_mode == AGGR_NONE) perf_stat__update_shadow_stats(evsel, count->values, cpu); break; case AGGR_GLOBAL: aggr->val += count->val; - if (scale) { + if (stat_config.scale) { aggr->ena += count->ena; aggr->run += count->run; } @@ -299,7 +299,7 @@ static int process_counter(struct perf_evsel *counter) if (!counter->snapshot) perf_evsel__compute_deltas(counter, -1, -1, aggr); - perf_counts_values__scale(aggr, scale, &counter->counts->scaled); + perf_counts_values__scale(aggr, stat_config.scale, &counter->counts->scaled); for (i = 0; i < 3; i++) update_stats(&ps->res_stats[i], count[i]); @@ -1274,7 +1274,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) "system-wide collection from all CPUs"), OPT_BOOLEAN('g', "group", &group, "put the counters into a counter group"), - OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"), + OPT_BOOLEAN('c', "scale", &stat_config.scale, "scale/normalize counters"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), OPT_INTEGER('r', "repeat", &run_count, diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 078bee4..0a1d83f 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -52,6 +52,7 @@ struct perf_counts { struct perf_stat_config { enum aggr_mode aggr_mode; + bool scale; }; static inline struct perf_counts_values* -- cgit v0.10.2 From 5821522e9484a8b503f89aa546085900b99589e9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:24 +0200 Subject: perf stat: Move 'output' into struct perf_stat_config Moving 'output' into struct perf_stat_config. The point is to centralize the base stat config so it could be used localy together with other stat routines in other parts of perf code. 
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 3fb2865..e3ea8b6 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -110,7 +110,6 @@ static int big_num_opt = -1; static const char *csv_sep = NULL; static bool csv_output = false; static bool group = false; -static FILE *output = NULL; static const char *pre_cmd = NULL; static const char *post_cmd = NULL; static bool sync_run = false; @@ -305,7 +304,7 @@ static int process_counter(struct perf_evsel *counter) update_stats(&ps->res_stats[i], count[i]); if (verbose) { - fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", + fprintf(stat_config.output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", perf_evsel__name(counter), count[0], count[1], count[2]); } @@ -548,13 +547,13 @@ static int run_perf_stat(int argc, const char **argv) static void print_running(u64 run, u64 ena) { if (csv_output) { - fprintf(output, "%s%" PRIu64 "%s%.2f", + fprintf(stat_config.output, "%s%" PRIu64 "%s%.2f", csv_sep, run, csv_sep, ena ? 100.0 * run / ena : 100.0); } else if (run != ena) { - fprintf(output, " (%.2f%%)", 100.0 * run / ena); + fprintf(stat_config.output, " (%.2f%%)", 100.0 * run / ena); } } @@ -563,9 +562,9 @@ static void print_noise_pct(double total, double avg) double pct = rel_stddev_stats(total, avg); if (csv_output) - fprintf(output, "%s%.2f%%", csv_sep, pct); + fprintf(stat_config.output, "%s%.2f%%", csv_sep, pct); else if (pct) - fprintf(output, " ( +-%6.2f%% )", pct); + fprintf(stat_config.output, " ( +-%6.2f%% )", pct); } static void print_noise(struct perf_evsel *evsel, double avg) @@ -583,7 +582,7 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr) { switch (stat_config.aggr_mode) { case AGGR_CORE: - fprintf(output, "S%d-C%*d%s%*d%s", + fprintf(stat_config.output, "S%d-C%*d%s%*d%s", cpu_map__id_to_socket(id), csv_output ? 0 : -8, cpu_map__id_to_cpu(id), @@ -593,7 +592,7 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr) csv_sep); break; case AGGR_SOCKET: - fprintf(output, "S%*d%s%*d%s", + fprintf(stat_config.output, "S%*d%s%*d%s", csv_output ? 0 : -5, id, csv_sep, @@ -602,12 +601,12 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr) csv_sep); break; case AGGR_NONE: - fprintf(output, "CPU%*d%s", + fprintf(stat_config.output, "CPU%*d%s", csv_output ? 0 : -4, perf_evsel__cpus(evsel)->map[id], csv_sep); break; case AGGR_THREAD: - fprintf(output, "%*s-%*d%s", + fprintf(stat_config.output, "%*s-%*d%s", csv_output ? 0 : 16, thread_map__comm(evsel->threads, id), csv_output ? 
0 : -8, @@ -622,6 +621,7 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr) static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) { + FILE *output = stat_config.output; double msecs = avg / 1e6; const char *fmt_v, *fmt_n; char name[25]; @@ -658,6 +658,7 @@ static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) { + FILE *output = stat_config.output; double sc = evsel->scale; const char *fmt; int cpu = cpu_map__id_to_cpu(id); @@ -697,6 +698,7 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) static void print_aggr(char *prefix) { + FILE *output = stat_config.output; struct perf_evsel *counter; int cpu, cpu2, s, s2, id, nr; double uval; @@ -765,6 +767,7 @@ static void print_aggr(char *prefix) static void print_aggr_thread(struct perf_evsel *counter, char *prefix) { + FILE *output = stat_config.output; int nthreads = thread_map__nr(counter->threads); int ncpus = cpu_map__nr(counter->cpus); int cpu, thread; @@ -803,6 +806,7 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix) */ static void print_counter_aggr(struct perf_evsel *counter, char *prefix) { + FILE *output = stat_config.output; struct perf_stat *ps = counter->priv; double avg = avg_stats(&ps->res_stats[0]); int scaled = counter->counts->scaled; @@ -854,6 +858,7 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix) */ static void print_counter(struct perf_evsel *counter, char *prefix) { + FILE *output = stat_config.output; u64 ena, run, val; double uval; int cpu; @@ -908,6 +913,7 @@ static void print_counter(struct perf_evsel *counter, char *prefix) static void print_interval(char *prefix, struct timespec *ts) { + FILE *output = stat_config.output; static int num_print_interval; sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep); @@ -938,6 +944,7 @@ static void print_interval(char *prefix, struct timespec *ts) static void print_header(int argc, const char **argv) { + FILE *output = stat_config.output; int i; fflush(stdout); @@ -967,6 +974,8 @@ static void print_header(int argc, const char **argv) static void print_footer(void) { + FILE *output = stat_config.output; + if (!null_run) fprintf(output, "\n"); fprintf(output, " %17.9f seconds time elapsed", @@ -1013,7 +1022,7 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) if (!interval && !csv_output) print_footer(); - fflush(output); + fflush(stat_config.output); } static volatile int signr = -1; @@ -1322,6 +1331,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) }; int status = -EINVAL, run_idx; const char *mode; + FILE *output = stderr; setlocale(LC_ALL, ""); @@ -1332,7 +1342,6 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) argc = parse_options(argc, argv, options, stat_usage, PARSE_OPT_STOP_AT_NON_OPTION); - output = stderr; if (output_name && strcmp(output_name, "-")) output = NULL; @@ -1369,6 +1378,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) } } + stat_config.output = output; + if (csv_sep) { csv_output = true; if (!strcmp(csv_sep, "\\t")) diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 0a1d83f..ed0e058 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -53,6 +53,7 @@ struct perf_counts { struct perf_stat_config { enum aggr_mode aggr_mode; bool scale; + FILE *output; }; static 
inline struct perf_counts_values* -- cgit v0.10.2 From ec0d3d1fd292adb80372193c03d859e9cbefd367 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:25 +0200 Subject: perf stat: Move 'interval' into struct perf_stat_config Moving 'interval' into struct perf_stat_config. The point is to centralize the base stat config so it could be used localy together with other stat routines in other parts of perf code. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e3ea8b6..1bdfec8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -113,7 +113,6 @@ static bool group = false; static const char *pre_cmd = NULL; static const char *post_cmd = NULL; static bool sync_run = false; -static unsigned int interval = 0; static unsigned int initial_delay = 0; static unsigned int unit_width = 4; /* strlen("unit") */ static bool forever = false; @@ -404,6 +403,7 @@ static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *inf static int __run_perf_stat(int argc, const char **argv) { + int interval = stat_config.interval; char msg[512]; unsigned long long t0, t1; struct perf_evsel *counter; @@ -646,7 +646,7 @@ static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) if (evsel->cgrp) fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); - if (csv_output || interval) + if (csv_output || stat_config.interval) return; if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) @@ -689,7 +689,7 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) if (evsel->cgrp) fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); - if (csv_output || interval) + if (csv_output || stat_config.interval) return; perf_stat__print_shadow_stats(output, evsel, avg, cpu, @@ -990,6 +990,7 @@ static void print_footer(void) static void print_counters(struct timespec *ts, int argc, const char **argv) { + int interval = stat_config.interval; struct perf_evsel *counter; char buf[64], *prefix = NULL; @@ -1029,7 +1030,7 @@ static volatile int signr = -1; static void skip_signal(int signo) { - if ((child_pid == -1) || interval) + if ((child_pid == -1) || stat_config.interval) done = 1; signr = signo; @@ -1313,7 +1314,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) "command to run prior to the measured command"), OPT_STRING(0, "post", &post_cmd, "command", "command to run after to the measured command"), - OPT_UINTEGER('I', "interval-print", &interval, + OPT_UINTEGER('I', "interval-print", &stat_config.interval, "print counts at regular interval in ms (>= 100)"), OPT_SET_UINT(0, "per-socket", &stat_config.aggr_mode, "aggregate counts per processor socket", AGGR_SOCKET), @@ -1332,6 +1333,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) int status = -EINVAL, run_idx; const char *mode; FILE *output = stderr; + unsigned int interval; setlocale(LC_ALL, ""); @@ -1342,6 +1344,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) argc = parse_options(argc, argv, options, stat_usage, PARSE_OPT_STOP_AT_NON_OPTION); + interval = stat_config.interval; + if (output_name && strcmp(output_name, "-")) output = NULL; diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index ed0e058..1da706d 100644 --- a/tools/perf/util/stat.h +++ 
b/tools/perf/util/stat.h @@ -54,6 +54,7 @@ struct perf_stat_config { enum aggr_mode aggr_mode; bool scale; FILE *output; + unsigned int interval; }; static inline struct perf_counts_values* -- cgit v0.10.2 From 5e5fe748bec771a810b1f44ec9c19e4b92685246 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:26 +0200 Subject: perf stat: Pass 'struct perf_stat_config' into process_counter() Passing 'struct perf_stat_config' into process_counter(), so that we can make process_counter() non static and use it from other places. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-7-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 1bdfec8..5a78171 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -216,7 +216,8 @@ static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip) } static int -process_counter_values(struct perf_evsel *evsel, int cpu, int thread, +process_counter_values(struct perf_stat_config *config, struct perf_evsel *evsel, + int cpu, int thread, struct perf_counts_values *count) { struct perf_counts_values *aggr = &evsel->counts->aggr; @@ -231,20 +232,20 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread, if (skip) count = &zero; - switch (stat_config.aggr_mode) { + switch (config->aggr_mode) { case AGGR_THREAD: case AGGR_CORE: case AGGR_SOCKET: case AGGR_NONE: if (!evsel->snapshot) perf_evsel__compute_deltas(evsel, cpu, thread, count); - perf_counts_values__scale(count, stat_config.scale, NULL); - if (stat_config.aggr_mode == AGGR_NONE) + perf_counts_values__scale(count, config->scale, NULL); + if (config->aggr_mode == AGGR_NONE) perf_stat__update_shadow_stats(evsel, count->values, cpu); break; case AGGR_GLOBAL: aggr->val += count->val; - if (stat_config.scale) { + if (config->scale) { aggr->ena += count->ena; aggr->run += count->run; } @@ -255,7 +256,8 @@ process_counter_values(struct perf_evsel *evsel, int cpu, int thread, return 0; } -static int process_counter_maps(struct perf_evsel *counter) +static int process_counter_maps(struct perf_stat_config *config, + struct perf_evsel *counter) { int nthreads = thread_map__nr(counter->threads); int ncpus = perf_evsel__nr_cpus(counter); @@ -266,7 +268,7 @@ static int process_counter_maps(struct perf_evsel *counter) for (thread = 0; thread < nthreads; thread++) { for (cpu = 0; cpu < ncpus; cpu++) { - if (process_counter_values(counter, cpu, thread, + if (process_counter_values(config, counter, cpu, thread, perf_counts(counter->counts, cpu, thread))) return -1; } @@ -275,7 +277,8 @@ static int process_counter_maps(struct perf_evsel *counter) return 0; } -static int process_counter(struct perf_evsel *counter) +static int process_counter(struct perf_stat_config *config, + struct perf_evsel *counter) { struct perf_counts_values *aggr = &counter->counts->aggr; struct perf_stat *ps = counter->priv; @@ -288,22 +291,22 @@ static int process_counter(struct perf_evsel *counter) if (counter->per_pkg) zero_per_pkg(counter); - ret = process_counter_maps(counter); + ret = process_counter_maps(&stat_config, counter); if (ret) return ret; - if (stat_config.aggr_mode != AGGR_GLOBAL) + if (config->aggr_mode != AGGR_GLOBAL) return 0; if (!counter->snapshot) perf_evsel__compute_deltas(counter, -1, -1, aggr); - perf_counts_values__scale(aggr, stat_config.scale, &counter->counts->scaled); + perf_counts_values__scale(aggr, 
config->scale, &counter->counts->scaled); for (i = 0; i < 3; i++) update_stats(&ps->res_stats[i], count[i]); if (verbose) { - fprintf(stat_config.output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", + fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", perf_evsel__name(counter), count[0], count[1], count[2]); } @@ -352,7 +355,7 @@ static void read_counters(bool close_counters) if (read_counter(counter)) pr_warning("failed to read counter %s\n", counter->name); - if (process_counter(counter)) + if (process_counter(&stat_config, counter)) pr_warning("failed to process counter %s\n", counter->name); if (close_counters) { -- cgit v0.10.2 From f80010eb230b94e8d9cf5bf83373a097fb5b2dcc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 21 Jul 2015 14:31:27 +0200 Subject: perf stat: Move counter processing code into stat object Moving counter processing code into stat object as perf_stat__process_counter. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1437481927-29538-8-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5a78171..a054ddc 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -179,145 +179,6 @@ static inline int nsec_counter(struct perf_evsel *evsel) return 0; } -static void zero_per_pkg(struct perf_evsel *counter) -{ - if (counter->per_pkg_mask) - memset(counter->per_pkg_mask, 0, MAX_NR_CPUS); -} - -static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip) -{ - unsigned long *mask = counter->per_pkg_mask; - struct cpu_map *cpus = perf_evsel__cpus(counter); - int s; - - *skip = false; - - if (!counter->per_pkg) - return 0; - - if (cpu_map__empty(cpus)) - return 0; - - if (!mask) { - mask = zalloc(MAX_NR_CPUS); - if (!mask) - return -ENOMEM; - - counter->per_pkg_mask = mask; - } - - s = cpu_map__get_socket(cpus, cpu); - if (s < 0) - return -1; - - *skip = test_and_set_bit(s, mask) == 1; - return 0; -} - -static int -process_counter_values(struct perf_stat_config *config, struct perf_evsel *evsel, - int cpu, int thread, - struct perf_counts_values *count) -{ - struct perf_counts_values *aggr = &evsel->counts->aggr; - static struct perf_counts_values zero; - bool skip = false; - - if (check_per_pkg(evsel, cpu, &skip)) { - pr_err("failed to read per-pkg counter\n"); - return -1; - } - - if (skip) - count = &zero; - - switch (config->aggr_mode) { - case AGGR_THREAD: - case AGGR_CORE: - case AGGR_SOCKET: - case AGGR_NONE: - if (!evsel->snapshot) - perf_evsel__compute_deltas(evsel, cpu, thread, count); - perf_counts_values__scale(count, config->scale, NULL); - if (config->aggr_mode == AGGR_NONE) - perf_stat__update_shadow_stats(evsel, count->values, cpu); - break; - case AGGR_GLOBAL: - aggr->val += count->val; - if (config->scale) { - aggr->ena += count->ena; - aggr->run += count->run; - } - default: - break; - } - - return 0; -} - -static int process_counter_maps(struct perf_stat_config *config, - struct perf_evsel *counter) -{ - int nthreads = thread_map__nr(counter->threads); - int ncpus = perf_evsel__nr_cpus(counter); - int cpu, thread; - - if (counter->system_wide) - nthreads = 1; - - for (thread = 0; thread < nthreads; thread++) { - for (cpu = 0; cpu < ncpus; cpu++) { - if (process_counter_values(config, counter, cpu, thread, - perf_counts(counter->counts, cpu, thread))) - return -1; - } - } - - return 0; -} - -static int process_counter(struct perf_stat_config *config, - struct 
perf_evsel *counter) -{ - struct perf_counts_values *aggr = &counter->counts->aggr; - struct perf_stat *ps = counter->priv; - u64 *count = counter->counts->aggr.values; - int i, ret; - - aggr->val = aggr->ena = aggr->run = 0; - init_stats(ps->res_stats); - - if (counter->per_pkg) - zero_per_pkg(counter); - - ret = process_counter_maps(&stat_config, counter); - if (ret) - return ret; - - if (config->aggr_mode != AGGR_GLOBAL) - return 0; - - if (!counter->snapshot) - perf_evsel__compute_deltas(counter, -1, -1, aggr); - perf_counts_values__scale(aggr, config->scale, &counter->counts->scaled); - - for (i = 0; i < 3; i++) - update_stats(&ps->res_stats[i], count[i]); - - if (verbose) { - fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", - perf_evsel__name(counter), count[0], count[1], count[2]); - } - - /* - * Save the full runtime - to allow normalization during printout: - */ - perf_stat__update_shadow_stats(counter, count, 0); - - return 0; -} - /* * Read out the results of a single counter: * do not aggregate counts across CPUs in system-wide mode @@ -355,7 +216,7 @@ static void read_counters(bool close_counters) if (read_counter(counter)) pr_warning("failed to read counter %s\n", counter->name); - if (process_counter(&stat_config, counter)) + if (perf_stat_process_counter(&stat_config, counter)) pr_warning("failed to process counter %s\n", counter->name); if (close_counters) { diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index f2a0d15..c5c709c 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -238,3 +238,142 @@ void perf_evlist__reset_stats(struct perf_evlist *evlist) perf_evsel__reset_counts(evsel); } } + +static void zero_per_pkg(struct perf_evsel *counter) +{ + if (counter->per_pkg_mask) + memset(counter->per_pkg_mask, 0, MAX_NR_CPUS); +} + +static int check_per_pkg(struct perf_evsel *counter, int cpu, bool *skip) +{ + unsigned long *mask = counter->per_pkg_mask; + struct cpu_map *cpus = perf_evsel__cpus(counter); + int s; + + *skip = false; + + if (!counter->per_pkg) + return 0; + + if (cpu_map__empty(cpus)) + return 0; + + if (!mask) { + mask = zalloc(MAX_NR_CPUS); + if (!mask) + return -ENOMEM; + + counter->per_pkg_mask = mask; + } + + s = cpu_map__get_socket(cpus, cpu); + if (s < 0) + return -1; + + *skip = test_and_set_bit(s, mask) == 1; + return 0; +} + +static int +process_counter_values(struct perf_stat_config *config, struct perf_evsel *evsel, + int cpu, int thread, + struct perf_counts_values *count) +{ + struct perf_counts_values *aggr = &evsel->counts->aggr; + static struct perf_counts_values zero; + bool skip = false; + + if (check_per_pkg(evsel, cpu, &skip)) { + pr_err("failed to read per-pkg counter\n"); + return -1; + } + + if (skip) + count = &zero; + + switch (config->aggr_mode) { + case AGGR_THREAD: + case AGGR_CORE: + case AGGR_SOCKET: + case AGGR_NONE: + if (!evsel->snapshot) + perf_evsel__compute_deltas(evsel, cpu, thread, count); + perf_counts_values__scale(count, config->scale, NULL); + if (config->aggr_mode == AGGR_NONE) + perf_stat__update_shadow_stats(evsel, count->values, cpu); + break; + case AGGR_GLOBAL: + aggr->val += count->val; + if (config->scale) { + aggr->ena += count->ena; + aggr->run += count->run; + } + default: + break; + } + + return 0; +} + +static int process_counter_maps(struct perf_stat_config *config, + struct perf_evsel *counter) +{ + int nthreads = thread_map__nr(counter->threads); + int ncpus = perf_evsel__nr_cpus(counter); + int cpu, thread; + + if (counter->system_wide) + nthreads = 1; 
+ + for (thread = 0; thread < nthreads; thread++) { + for (cpu = 0; cpu < ncpus; cpu++) { + if (process_counter_values(config, counter, cpu, thread, + perf_counts(counter->counts, cpu, thread))) + return -1; + } + } + + return 0; +} + +int perf_stat_process_counter(struct perf_stat_config *config, + struct perf_evsel *counter) +{ + struct perf_counts_values *aggr = &counter->counts->aggr; + struct perf_stat *ps = counter->priv; + u64 *count = counter->counts->aggr.values; + int i, ret; + + aggr->val = aggr->ena = aggr->run = 0; + init_stats(ps->res_stats); + + if (counter->per_pkg) + zero_per_pkg(counter); + + ret = process_counter_maps(config, counter); + if (ret) + return ret; + + if (config->aggr_mode != AGGR_GLOBAL) + return 0; + + if (!counter->snapshot) + perf_evsel__compute_deltas(counter, -1, -1, aggr); + perf_counts_values__scale(aggr, config->scale, &counter->counts->scaled); + + for (i = 0; i < 3; i++) + update_stats(&ps->res_stats[i], count[i]); + + if (verbose) { + fprintf(config->output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", + perf_evsel__name(counter), count[0], count[1], count[2]); + } + + /* + * Save the full runtime - to allow normalization during printout: + */ + perf_stat__update_shadow_stats(counter, count, 0); + + return 0; +} diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 1da706d..0b897b0 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -116,4 +116,7 @@ int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw); int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw); void perf_evlist__free_stats(struct perf_evlist *evlist); void perf_evlist__reset_stats(struct perf_evlist *evlist); + +int perf_stat_process_counter(struct perf_stat_config *config, + struct perf_evsel *counter); #endif -- cgit v0.10.2 From 93df8a1ed6231727c5db94a80b1a6bd5ee67cec3 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 4 Aug 2015 17:10:27 +0100 Subject: perf tools: Add empty Build files for architectures lacking them perf currently fails to build on MIPS as there is no tools/perf/arch/mips/Build file. Adding an empty file fixes this as there are no MIPS-specific sources to build. It looks like the same is needed for Alpha and PA-RISC, though I haven't been able to test those. Signed-off-by: Ben Hutchings Fixes: 5e8c0fb6a957 ("perf build: Add arch x86 objects building") Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1438704627.7315.2.camel@decadent.org.uk Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/alpha/Build b/tools/perf/arch/alpha/Build new file mode 100644 index 0000000..1bb8bf6 --- /dev/null +++ b/tools/perf/arch/alpha/Build @@ -0,0 +1 @@ +# empty diff --git a/tools/perf/arch/mips/Build b/tools/perf/arch/mips/Build new file mode 100644 index 0000000..1bb8bf6 --- /dev/null +++ b/tools/perf/arch/mips/Build @@ -0,0 +1 @@ +# empty diff --git a/tools/perf/arch/parisc/Build b/tools/perf/arch/parisc/Build new file mode 100644 index 0000000..1bb8bf6 --- /dev/null +++ b/tools/perf/arch/parisc/Build @@ -0,0 +1 @@ +# empty -- cgit v0.10.2 From 0e332f033a8216fa03792fde69882f66500848c7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 18 Jul 2015 08:24:46 -0700 Subject: perf tools: Add support for cycles, weight branch_info field cycles is a new branch_info field available on some CPUs that indicates the time deltas between branches in the LBR. Add a sort key and output code for the cycles to allow to display the basic block cycles individually in perf report. 
We also pass in the cycles for weight when LBRs are processed, which allows to get global and local weight, to get an estimate of the total cost. And also print the cycles information for perf report -D. I also added printing for the previously missing LBR flags (mispredict etc.) Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1437233094-12844-2-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index c33b69f..960da20 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -109,6 +109,7 @@ OPTIONS - mispredict: "N" for predicted branch, "Y" for mispredicted branch - in_tx: branch in TSX transaction - abort: TSX transaction abort. + - cycles: Cycles in basic block And default sort keys are changed to comm, dso_from, symbol_from, dso_to and symbol_to, see '--branch-stack'. diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 4bb2ae8..f729df5 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -134,7 +134,8 @@ struct branch_flags { u64 predicted:1; u64 in_tx:1; u64 abort:1; - u64 reserved:60; + u64 cycles:16; + u64 reserved:44; }; struct branch_entry { diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 6f28d53..54fc003 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -618,7 +618,8 @@ iter_add_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *a * and not events sampled. Thus we use a pseudo period of 1. */ he = __hists__add_entry(hists, al, iter->parent, &bi[i], NULL, - 1, 1, 0, true); + 1, bi->flags.cycles ? bi->flags.cycles : 1, + 0, true); if (he == NULL) return -ENOMEM; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 5ed8d9c..3881d98 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -47,6 +47,7 @@ enum hist_column { HISTC_MEM_SNOOP, HISTC_MEM_DCACHELINE, HISTC_TRANSACTION, + HISTC_CYCLES, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f51eb54..18722e7 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -784,10 +784,18 @@ static void branch_stack__printf(struct perf_sample *sample) printf("... branch stack: nr:%" PRIu64 "\n", sample->branch_stack->nr); - for (i = 0; i < sample->branch_stack->nr; i++) - printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 "\n", - i, sample->branch_stack->entries[i].from, - sample->branch_stack->entries[i].to); + for (i = 0; i < sample->branch_stack->nr; i++) { + struct branch_entry *e = &sample->branch_stack->entries[i]; + + printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n", + i, e->from, e->to, + e->flags.cycles, + e->flags.mispred ? "M" : " ", + e->flags.predicted ? "P" : " ", + e->flags.abort ? "A" : " ", + e->flags.in_tx ? 
"T" : " ", + (unsigned)e->flags.reserved); + } } static void regs_dump__printf(u64 mask, u64 *regs) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 4c65a14..5b7a50c 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -526,6 +526,29 @@ static int hist_entry__mispredict_snprintf(struct hist_entry *he, char *bf, return repsep_snprintf(bf, size, "%-*.*s", width, width, out); } +static int64_t +sort__cycles_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return left->branch_info->flags.cycles - + right->branch_info->flags.cycles; +} + +static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + if (he->branch_info->flags.cycles == 0) + return repsep_snprintf(bf, size, "%-*s", width, "-"); + return repsep_snprintf(bf, size, "%-*hd", width, + he->branch_info->flags.cycles); +} + +struct sort_entry sort_cycles = { + .se_header = "Basic Block Cycles", + .se_cmp = sort__cycles_cmp, + .se_snprintf = hist_entry__cycles_snprintf, + .se_width_idx = HISTC_CYCLES, +}; + /* --sort daddr_sym */ static int64_t sort__daddr_cmp(struct hist_entry *left, struct hist_entry *right) @@ -1190,6 +1213,7 @@ static struct sort_dimension bstack_sort_dimensions[] = { DIM(SORT_MISPREDICT, "mispredict", sort_mispredict), DIM(SORT_IN_TX, "in_tx", sort_in_tx), DIM(SORT_ABORT, "abort", sort_abort), + DIM(SORT_CYCLES, "cycles", sort_cycles), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index e97cd47..bc6c87a 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -185,6 +185,7 @@ enum sort_type { SORT_MISPREDICT, SORT_ABORT, SORT_IN_TX, + SORT_CYCLES, /* memory mode specific sort keys */ __SORT_MEMORY_MODE, -- cgit v0.10.2 From 98df858ed46ddaaf9be3573eb2b63b57a68c6af7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 18 Jul 2015 08:24:47 -0700 Subject: perf report: Add flag for non ANY branch mode Later patches need to cheaply check that the branch mode is in ANY. Add a new function to check all event attrs and add a flag to the report state, which is then initialized. v2: Rename flag Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1437233094-12844-3-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 95a4771..3ba0e97 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -53,6 +53,7 @@ struct report { bool mem_mode; bool header; bool header_only; + bool nonany_branch_mode; int max_stack; struct perf_read_values show_threads_values; const char *pretty_printing_style; @@ -258,6 +259,12 @@ static int report__setup_sample_type(struct report *rep) else callchain_param.record_mode = CALLCHAIN_FP; } + + /* ??? handle more cases than just ANY? 
*/ + if (!(perf_evlist__combined_branch_type(session->evlist) & + PERF_SAMPLE_BRANCH_ANY)) + rep->nonany_branch_mode = true; + return 0; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 3b9f411..373f65b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1273,6 +1273,16 @@ u64 perf_evlist__combined_sample_type(struct perf_evlist *evlist) return __perf_evlist__combined_sample_type(evlist); } +u64 perf_evlist__combined_branch_type(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + u64 branch_type = 0; + + evlist__for_each(evlist, evsel) + branch_type |= evsel->attr.branch_sample_type; + return branch_type; +} + bool perf_evlist__valid_read_format(struct perf_evlist *evlist) { struct perf_evsel *first = perf_evlist__first(evlist), *pos = first; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index a8930b6..3977570 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -165,6 +165,7 @@ void perf_evlist__set_leader(struct perf_evlist *evlist); u64 perf_evlist__read_format(struct perf_evlist *evlist); u64 __perf_evlist__combined_sample_type(struct perf_evlist *evlist); u64 perf_evlist__combined_sample_type(struct perf_evlist *evlist); +u64 perf_evlist__combined_branch_type(struct perf_evlist *evlist); bool perf_evlist__sample_id_all(struct perf_evlist *evlist); u16 perf_evlist__id_hdr_size(struct perf_evlist *evlist); -- cgit v0.10.2 From d4957633bf9dab70e566e7dbb2b8d0c61c3a2f1e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 18 Jul 2015 08:24:48 -0700 Subject: perf report: Add infrastructure for a cycles histogram This adds the basic infrastructure to keep track of cycle counts per basic block for annotate. We allocate an array similar to the normal accounting, and then account branch cycles there. We handle two cases: cycles per basic block with start and cycles per branch (these are later used for either IPC or just cycles per BB) In the start case we cannot handle overlaps, so always the longest basic block wins. For the cycles per branch case everything is accurately accounted. v2: Remove unnecessary checks. Slight restructure. Move symbol__get_annotation to another patch. Move histogram allocation. v3: Merged with current tree Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1437233094-12844-4-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 2c1bec3..467a23b 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -187,6 +187,7 @@ find_next: * symbol, free he->ms.sym->src to signal we already * processed this symbol. */ + zfree(¬es->src->cycles_hist); zfree(¬es->src); } } diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 03b7bc70..e0b6146 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -473,17 +473,73 @@ int symbol__alloc_hist(struct symbol *sym) return 0; } +/* The cycles histogram is lazily allocated. 
*/ +static int symbol__alloc_hist_cycles(struct symbol *sym) +{ + struct annotation *notes = symbol__annotation(sym); + const size_t size = symbol__size(sym); + + notes->src->cycles_hist = calloc(size, sizeof(struct cyc_hist)); + if (notes->src->cycles_hist == NULL) + return -1; + return 0; +} + void symbol__annotate_zero_histograms(struct symbol *sym) { struct annotation *notes = symbol__annotation(sym); pthread_mutex_lock(¬es->lock); - if (notes->src != NULL) + if (notes->src != NULL) { memset(notes->src->histograms, 0, notes->src->nr_histograms * notes->src->sizeof_sym_hist); + if (notes->src->cycles_hist) + memset(notes->src->cycles_hist, 0, + symbol__size(sym) * sizeof(struct cyc_hist)); + } pthread_mutex_unlock(¬es->lock); } +static int __symbol__account_cycles(struct annotation *notes, + u64 start, + unsigned offset, unsigned cycles, + unsigned have_start) +{ + struct cyc_hist *ch; + + ch = notes->src->cycles_hist; + /* + * For now we can only account one basic block per + * final jump. But multiple could be overlapping. + * Always account the longest one. So when + * a shorter one has been already seen throw it away. + * + * We separately always account the full cycles. + */ + ch[offset].num_aggr++; + ch[offset].cycles_aggr += cycles; + + if (!have_start && ch[offset].have_start) + return 0; + if (ch[offset].num) { + if (have_start && (!ch[offset].have_start || + ch[offset].start > start)) { + ch[offset].have_start = 0; + ch[offset].cycles = 0; + ch[offset].num = 0; + if (ch[offset].reset < 0xffff) + ch[offset].reset++; + } else if (have_start && + ch[offset].start < start) + return 0; + } + ch[offset].have_start = have_start; + ch[offset].start = start; + ch[offset].cycles += cycles; + ch[offset].num++; + return 0; +} + static int __symbol__inc_addr_samples(struct symbol *sym, struct map *map, struct annotation *notes, int evidx, u64 addr) { @@ -506,7 +562,7 @@ static int __symbol__inc_addr_samples(struct symbol *sym, struct map *map, return 0; } -static struct annotation *symbol__get_annotation(struct symbol *sym) +static struct annotation *symbol__get_annotation(struct symbol *sym, bool cycles) { struct annotation *notes = symbol__annotation(sym); @@ -514,6 +570,10 @@ static struct annotation *symbol__get_annotation(struct symbol *sym) if (symbol__alloc_hist(sym) < 0) return NULL; } + if (!notes->src->cycles_hist && cycles) { + if (symbol__alloc_hist_cycles(sym) < 0) + return NULL; + } return notes; } @@ -524,12 +584,73 @@ static int symbol__inc_addr_samples(struct symbol *sym, struct map *map, if (sym == NULL) return 0; - notes = symbol__get_annotation(sym); + notes = symbol__get_annotation(sym, false); if (notes == NULL) return -ENOMEM; return __symbol__inc_addr_samples(sym, map, notes, evidx, addr); } +static int symbol__account_cycles(u64 addr, u64 start, + struct symbol *sym, unsigned cycles) +{ + struct annotation *notes; + unsigned offset; + + if (sym == NULL) + return 0; + notes = symbol__get_annotation(sym, true); + if (notes == NULL) + return -ENOMEM; + if (addr < sym->start || addr >= sym->end) + return -ERANGE; + + if (start) { + if (start < sym->start || start >= sym->end) + return -ERANGE; + if (start >= addr) + start = 0; + } + offset = addr - sym->start; + return __symbol__account_cycles(notes, + start ? 
start - sym->start : 0, + offset, cycles, + !!start); +} + +int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, + struct addr_map_symbol *start, + unsigned cycles) +{ + unsigned long saddr = 0; + int err; + + if (!cycles) + return 0; + + /* + * Only set start when IPC can be computed. We can only + * compute it when the basic block is completely in a single + * function. + * Special case the case when the jump is elsewhere, but + * it starts on the function start. + */ + if (start && + (start->sym == ams->sym || + (ams->sym && + start->addr == ams->sym->start + ams->map->start))) + saddr = start->al_addr; + if (saddr == 0) + pr_debug2("BB with bad start: addr %lx start %lx sym %lx saddr %lx\n", + ams->addr, + start ? start->addr : 0, + ams->sym ? ams->sym->start + ams->map->start : 0, + saddr); + err = symbol__account_cycles(ams->al_addr, saddr, ams->sym, cycles); + if (err) + pr_debug2("account_cycles failed %d\n", err); + return err; +} + int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, int evidx) { return symbol__inc_addr_samples(ams->sym, ams->map, evidx, ams->al_addr); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 7e78e6c..a06518d 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -79,6 +79,17 @@ struct sym_hist { u64 addr[0]; }; +struct cyc_hist { + u64 start; + u64 cycles; + u64 cycles_aggr; + u32 num; + u32 num_aggr; + u8 have_start; + /* 1 byte padding */ + u16 reset; +}; + struct source_line_samples { double percent; double percent_sum; @@ -97,6 +108,7 @@ struct source_line { * @histogram: Array of addr hit histograms per event being monitored * @lines: If 'print_lines' is specified, per source code line percentages * @source: source parsed from a disassembler like objdump -dS + * @cyc_hist: Average cycles per basic block * * lines is allocated, percentages calculated and all sorted by percentage * when the annotation is about to be presented, so the percentages are for @@ -109,6 +121,7 @@ struct annotated_source { struct source_line *lines; int nr_histograms; int sizeof_sym_hist; + struct cyc_hist *cycles_hist; struct sym_hist histograms[0]; }; @@ -130,6 +143,10 @@ static inline struct annotation *symbol__annotation(struct symbol *sym) int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, int evidx); +int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, + struct addr_map_symbol *start, + unsigned cycles); + int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 addr); int symbol__alloc_hist(struct symbol *sym); -- cgit v0.10.2 From 57849998e2cd24d50295076a1bbd2f029e2d7c38 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 18 Jul 2015 08:24:49 -0700 Subject: perf report: Add processing for cycle histograms Call the cycle histogram infrastructure added earlier from the perf report hist iter callback. For this we walk the branch records. This allows cycle histograms to be used when browsing perf report annotate.
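For illustration, the traversal pattern used by the accounting, as a standalone sketch (the type and both function names are made up, standing in for branch_entry and addr_map_symbol__account_cycles(); the one hard fact, noted in the hist.c hunk below, is that perf stores branch entries newest-first):

    #include <stddef.h>

    struct bstack_ent_sketch {
            unsigned long from, to;
            unsigned int cycles;
    };

    /* Hypothetical accounting hook. */
    static void account_one(struct bstack_ent_sketch *e,
                            struct bstack_ent_sketch *prev)
    {
            (void)e;
            (void)prev;
    }

    /* Walk from the tail to visit branches in program order; "prev"
     * then remembers the previously executed branch. */
    static void walk_branch_stack(struct bstack_ent_sketch *entries,
                                  size_t nr)
    {
            struct bstack_ent_sketch *prev = NULL;
            size_t i;

            for (i = nr; i-- > 0; ) {
                    account_one(&entries[i], prev);
                    prev = &entries[i];
            }
    }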
v2: Rename flag Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1437233094-12844-5-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 3ba0e97..3a9d1b6 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -103,6 +103,9 @@ static int hist_iter__report_callback(struct hist_entry_iter *iter, if (!ui__has_annotation()) return 0; + hist__account_cycles(iter->sample->branch_stack, al, iter->sample, + rep->nonany_branch_mode); + if (sort__mode == SORT_MODE__BRANCH) { bi = he->branch_info; err = addr_map_symbol__inc_samples(&bi->from, evsel->idx); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 54fc003..a6e9ddd 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1415,6 +1415,39 @@ int hists__link(struct hists *leader, struct hists *other) return 0; } +void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, + struct perf_sample *sample, bool nonany_branch_mode) +{ + struct branch_info *bi; + + /* If we have branch cycles always annotate them. */ + if (bs && bs->nr && bs->entries[0].flags.cycles) { + int i; + + bi = sample__resolve_bstack(sample, al); + if (bi) { + struct addr_map_symbol *prev = NULL; + + /* + * Ignore errors, still want to process the + * other entries. + * + * For non standard branch modes always + * force no IPC (prev == NULL) + * + * Note that perf stores branches reversed from + * program order! + */ + for (i = bs->nr - 1; i >= 0; i--) { + addr_map_symbol__account_cycles(&bi[i].from, + nonany_branch_mode ? NULL : prev, + bi[i].flags.cycles); + prev = &bi[i].to; + } + free(bi); + } + } +} size_t perf_evlist__fprintf_nr_events(struct perf_evlist *evlist, FILE *fp) { diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 3881d98..e2f712f 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -350,6 +350,9 @@ static inline int script_browse(const char *script_opt __maybe_unused) unsigned int hists__sort_list_width(struct hists *hists); +void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, + struct perf_sample *sample, bool nonany_branch_mode); + struct option; int parse_filter_percentage(const struct option *opt __maybe_unused, const char *arg, int unset __maybe_unused); -- cgit v0.10.2 From 30e863bb6f708c0abd422fbb0e6b295f5ee6407b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 18 Jul 2015 08:24:50 -0700 Subject: perf annotate: Compute IPC and basic block cycles Compute the IPC and the basic block cycles for the annotate display. IPC is computed by counting the instructions, and then dividing the accounted cycles by that count. The actual IPC computation can only be done at annotate time, because we need to parse the objdump output first to know the number of instructions in the basic block. The cycles/IPC are also put into the perf function annotation so that the display code can show them. Again basic block overlaps are not handled, with the longest winning, but there are some heuristics to hide the IPC when the longest is not the most common. v2: Compute IPC correctly. 
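For illustration, the IPC arithmetic as a self-contained sketch; it mirrors count_and_fill() in the browser patch further down, including its overlap cut-offs, but the helper itself is made up:

    #include <stdint.h>

    /* IPC of a basic block: instruction count divided by the average
     * cycles observed for the block (cycles/num). */
    static double block_ipc(unsigned int n_insn, uint64_t cycles,
                            uint32_t num, uint16_t reset)
    {
            if (!n_insn || !num || !cycles)
                    return 0.0;

            /* Hide the IPC when too many overlapping blocks were
             * thrown away: the surviving longest block is then not
             * representative. */
            if (reset >= 0x7fff || reset >= num / 2)
                    return 0.0;

            return n_insn / ((double)cycles / (double)num);
    }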
Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1437233094-12844-6-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 5995a8b..6ec1795 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -53,6 +53,7 @@ struct annotate_browser { int max_jump_sources; int nr_jumps; bool searching_backwards; + bool have_cycles; u8 addr_width; u8 jumps_width; u8 target_width; @@ -390,7 +391,7 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser, max_percent = bpos->samples[i].percent; } - if (max_percent < 0.01) { + if (max_percent < 0.01 && pos->ipc == 0) { RB_CLEAR_NODE(&bpos->rb_node); continue; } @@ -869,6 +870,75 @@ int hist_entry__tui_annotate(struct hist_entry *he, struct perf_evsel *evsel, return map_symbol__tui_annotate(&he->ms, evsel, hbt); } + +static unsigned count_insn(struct annotate_browser *browser, u64 start, u64 end) +{ + unsigned n_insn = 0; + u64 offset; + + for (offset = start; offset <= end; offset++) { + if (browser->offsets[offset]) + n_insn++; + } + return n_insn; +} + +static void count_and_fill(struct annotate_browser *browser, u64 start, u64 end, + struct cyc_hist *ch) +{ + unsigned n_insn; + u64 offset; + + n_insn = count_insn(browser, start, end); + if (n_insn && ch->num && ch->cycles) { + float ipc = n_insn / ((double)ch->cycles / (double)ch->num); + + /* Hide data when there are too many overlaps. */ + if (ch->reset >= 0x7fff || ch->reset >= ch->num / 2) + return; + + for (offset = start; offset <= end; offset++) { + struct disasm_line *dl = browser->offsets[offset]; + + if (dl) + dl->ipc = ipc; + } + } +} + +/* + * This should probably be in util/annotate.c to share with the tty + * annotate, but right now we need the per byte offsets arrays, + * which are only here. 
+ */ +static void annotate__compute_ipc(struct annotate_browser *browser, size_t size, + struct symbol *sym) +{ + u64 offset; + struct annotation *notes = symbol__annotation(sym); + + if (!notes->src || !notes->src->cycles_hist) + return; + + pthread_mutex_lock(¬es->lock); + for (offset = 0; offset < size; ++offset) { + struct cyc_hist *ch; + + ch = ¬es->src->cycles_hist[offset]; + if (ch && ch->cycles) { + struct disasm_line *dl; + + if (ch->have_start) + count_and_fill(browser, ch->start, offset, ch); + dl = browser->offsets[offset]; + if (dl && ch->num_aggr) + dl->cycles = ch->cycles_aggr / ch->num_aggr; + browser->have_cycles = true; + } + } + pthread_mutex_unlock(¬es->lock); +} + static void annotate_browser__mark_jump_targets(struct annotate_browser *browser, size_t size) { @@ -991,6 +1061,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, } annotate_browser__mark_jump_targets(&browser, size); + annotate__compute_ipc(&browser, size, sym); browser.addr_width = browser.target_width = browser.min_addr_width = hex_width(size); browser.max_addr_width = hex_width(sym->end); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index a06518d..e999609 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -59,6 +59,8 @@ struct disasm_line { char *name; struct ins *ins; int line_nr; + float ipc; + u64 cycles; struct ins_operands ops; }; -- cgit v0.10.2 From f8f4aaead579c947fb8fc051c9d242037025caf3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 18 Jul 2015 08:24:51 -0700 Subject: perf annotate: Finally display IPC and cycle accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two new columns to the annotate display and display the average cycles and the compute IPC if available. When the LBR was not in any branch mode the IPC computation is automatically disabled. We still display the cycle information. Example output (with made up numbers): The second column is the IPC and third average cycles. │ __attribute__((noinline)) f2() │ { 5.15 0.07 │ push %rbp 0.01 0.07 │ mov %rsp,%rbp │ c = a / b; 9.87 0.07 │ mov a,%eax 0.07 │ mov b,%ecx 0.07 │ cltd 4.92 0.07 123│ idiv %ecx 70.79 0.07 │ mov %eax,__TMC_END__ │ } 9.25 0.07 │ pop %rbp 0.01 0.07 123│ ← retq v2: Fix display problems. 
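For illustration, the column bookkeeping this patch introduces, extracted as a standalone sketch of annotate_browser__pcnt_width() from the hunk below: the per-event percentage columns stay 7 characters wide, and the IPC and cycles columns are only added once cycle data has actually been seen.

    #define IPC_WIDTH    6
    #define CYCLES_WIDTH 6

    static int pcnt_width(int nr_events, int have_cycles)
    {
            int w = 7 * nr_events;

            if (have_cycles)
                    w += IPC_WIDTH + CYCLES_WIDTH;
            return w;
    }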
Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1437233094-12844-7-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 6ec1795..b5fc847 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -16,6 +16,9 @@ struct disasm_line_samples { u64 nr; }; +#define IPC_WIDTH 6 +#define CYCLES_WIDTH 6 + struct browser_disasm_line { struct rb_node rb_node; u32 idx; @@ -97,6 +100,15 @@ static int annotate_browser__set_jumps_percent_color(struct annotate_browser *br return ui_browser__set_color(&browser->b, color); } +static int annotate_browser__pcnt_width(struct annotate_browser *ab) +{ + int w = 7 * ab->nr_events; + + if (ab->have_cycles) + w += IPC_WIDTH + CYCLES_WIDTH; + return w; +} + static void annotate_browser__write(struct ui_browser *browser, void *entry, int row) { struct annotate_browser *ab = container_of(browser, struct annotate_browser, b); @@ -107,7 +119,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int (!current_entry || (browser->use_navkeypressed && !browser->navkeypressed))); int width = browser->width, printed; - int i, pcnt_width = 7 * ab->nr_events; + int i, pcnt_width = annotate_browser__pcnt_width(ab); double percent_max = 0.0; char bf[256]; @@ -117,19 +129,34 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int } if (dl->offset != -1 && percent_max != 0.0) { - for (i = 0; i < ab->nr_events; i++) { - ui_browser__set_percent_color(browser, - bdl->samples[i].percent, - current_entry); - if (annotate_browser__opts.show_total_period) - slsmg_printf("%6" PRIu64 " ", - bdl->samples[i].nr); - else - slsmg_printf("%6.2f ", bdl->samples[i].percent); + if (percent_max != 0.0) { + for (i = 0; i < ab->nr_events; i++) { + ui_browser__set_percent_color(browser, + bdl->samples[i].percent, + current_entry); + if (annotate_browser__opts.show_total_period) + slsmg_printf("%6" PRIu64 " ", + bdl->samples[i].nr); + else + slsmg_printf("%6.2f ", bdl->samples[i].percent); + } + } else { + slsmg_write_nstring(" ", 7 * ab->nr_events); } } else { ui_browser__set_percent_color(browser, 0, current_entry); - slsmg_write_nstring(" ", pcnt_width); + slsmg_write_nstring(" ", 7 * ab->nr_events); + } + if (ab->have_cycles) { + if (dl->ipc) + slsmg_printf("%*.2f ", IPC_WIDTH - 1, dl->ipc); + else + slsmg_write_nstring(" ", IPC_WIDTH); + if (dl->cycles) + slsmg_printf("%*" PRIu64 " ", + CYCLES_WIDTH - 1, dl->cycles); + else + slsmg_write_nstring(" ", CYCLES_WIDTH); } SLsmg_write_char(' '); @@ -232,7 +259,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) unsigned int from, to; struct map_symbol *ms = ab->b.priv; struct symbol *sym = ms->sym; - u8 pcnt_width = 7; + u8 pcnt_width = annotate_browser__pcnt_width(ab); /* PLT symbols contain external offsets */ if (strstr(sym->name, "@plt")) @@ -256,8 +283,6 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) to = (u64)btarget->idx; } - pcnt_width *= ab->nr_events; - ui_browser__set_color(browser, HE_COLORSET_CODE); __ui_browser__line_arrow(browser, pcnt_width + 2 + ab->addr_width, from, to); @@ -267,9 +292,7 @@ static unsigned int annotate_browser__refresh(struct ui_browser *browser) { struct annotate_browser *ab = container_of(browser, struct annotate_browser, b); int ret = ui_browser__list_head_refresh(browser); - int pcnt_width; - - pcnt_width = 7 * 
ab->nr_events; + int pcnt_width = annotate_browser__pcnt_width(ab); if (annotate_browser__opts.jump_arrows) annotate_browser__draw_current_jump(browser); -- cgit v0.10.2 From a18b027efe1a2a502d98a8d0ea0391a72bf3f696 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 18 Jul 2015 08:24:52 -0700 Subject: perf top: Add branch annotation code to top Now that we can process branch data in annotate it makes sense to support enabling branch recording from top too. Most of the code needed for this is already in shared code with report. But we need to add: - The option parsing code (using shared code from the previous patch) - Document the options - Set up the IPC/cycles accounting state in the top session - Call the accounting code in the hist iter callback Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1437233094-12844-8-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 776aec4..f6a23eb 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -208,6 +208,27 @@ Default is to monitor all CPUS. This option sets the time out limit. The default value is 500 ms. +-b:: +--branch-any:: + Enable taken branch stack sampling. Any type of taken branch may be sampled. + This is a shortcut for --branch-filter any. See --branch-filter for more infos. + +-j:: +--branch-filter:: + Enable taken branch stack sampling. Each sample captures a series of consecutive + taken branches. The number of branches captured with each sample depends on the + underlying hardware, the type of branches of interest, and the executed code. + It is possible to select the types of branches captured by enabling filters. + For a full list of modifiers please see the perf record manpage. + + The option requires at least one branch type among any, any_call, any_ret, ind_call, cond. + The privilege levels may be omitted, in which case, the privilege levels of the associated + event are applied to the branch filter. Both kernel (k) and hypervisor (hv) privilege + levels are subject to permissions. When sampling on multiple events, branch stack sampling + is enabled for all the sampling events. The sampled branch type is the same for all events. + The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k + Note that this feature may not be available on all processors. 
+ INTERACTIVE PROMPTING KEYS -------------------------- diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index ecf3197..bfe24f1 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -40,6 +40,7 @@ #include "util/xyarray.h" #include "util/sort.h" #include "util/intlist.h" +#include "util/parse-branch-options.h" #include "arch/common.h" #include "util/debug.h" @@ -695,6 +696,8 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter, perf_top__record_precise_ip(top, he, evsel->idx, ip); } + hist__account_cycles(iter->sample->branch_stack, al, iter->sample, + !(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY)); return 0; } @@ -1171,6 +1174,12 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "don't try to adjust column width, use these fixed values"), OPT_UINTEGER(0, "proc-map-timeout", &opts->proc_map_timeout, "per thread proc mmap processing timeout in ms"), + OPT_CALLBACK_NOOPT('b', "branch-any", &opts->branch_stack, + "branch any", "sample any taken branches", + parse_branch_stack), + OPT_CALLBACK('j', "branch-filter", &opts->branch_stack, + "branch filter mask", "branch stack filter modes", + parse_branch_stack), OPT_END() }; const char * const top_usage[] = { -- cgit v0.10.2 From 40997d6cf9fc40c85dba479e162a89e7530eb360 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 18 Jul 2015 08:24:53 -0700 Subject: perf report: Display cycles in branch sort mode Display the cycles by default in branch sort mode. To make enough room for the new column I removed dso_to. It is usually redundant with dso_from. Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1437233094-12844-9-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 5b7a50c..5177088 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -9,7 +9,7 @@ regex_t parent_regex; const char default_parent_pattern[] = "^sys_|^do_page_fault"; const char *parent_pattern = default_parent_pattern; const char default_sort_order[] = "comm,dso,symbol"; -const char default_branch_sort_order[] = "comm,dso_from,symbol_from,dso_to,symbol_to"; +const char default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles"; const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked"; const char default_top_sort_order[] = "dso,symbol"; const char default_diff_sort_order[] = "dso,symbol"; -- cgit v0.10.2 From 74d4582f430a797564f92fbff0bd3a21945528b7 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sat, 18 Jul 2015 11:30:11 +0300 Subject: perf tools xtensa: Add DWARF register names Signed-off-by: Max Filippov Cc: Chris Zankel Cc: Marc Gauthier Cc: Paul Mackerras Cc: Peter Zijlstra Cc: linux-xtensa@linux-xtensa.org Link: http://lkml.kernel.org/r/1437208216-15729-9-git-send-email-jcmvbkbc@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/xtensa/Build b/tools/perf/arch/xtensa/Build new file mode 100644 index 0000000..54afe4a --- /dev/null +++ b/tools/perf/arch/xtensa/Build @@ -0,0 +1 @@ +libperf-y += util/ diff --git a/tools/perf/arch/xtensa/Makefile b/tools/perf/arch/xtensa/Makefile new file mode 100644 index 0000000..7fbca17 --- /dev/null +++ b/tools/perf/arch/xtensa/Makefile @@ -0,0 +1,3 @@ +ifndef NO_DWARF +PERF_HAVE_DWARF_REGS := 1 +endif diff --git a/tools/perf/arch/xtensa/util/Build b/tools/perf/arch/xtensa/util/Build new file mode 100644 index 
0000000..954e287 --- /dev/null +++ b/tools/perf/arch/xtensa/util/Build @@ -0,0 +1 @@ +libperf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/xtensa/util/dwarf-regs.c b/tools/perf/arch/xtensa/util/dwarf-regs.c new file mode 100644 index 0000000..4dba76b --- /dev/null +++ b/tools/perf/arch/xtensa/util/dwarf-regs.c @@ -0,0 +1,25 @@ +/* + * Mapping of DWARF debug register numbers into register names. + * + * Copyright (c) 2015 Cadence Design Systems Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#define XTENSA_MAX_REGS 16 + +const char *xtensa_regs_table[XTENSA_MAX_REGS] = { + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", + "a8", "a9", "a10", "a11", "a12", "a13", "a14", "a15", +}; + +const char *get_arch_regstr(unsigned int n) +{ + return n < XTENSA_MAX_REGS ? xtensa_regs_table[n] : NULL; +} -- cgit v0.10.2 From f70cfa07e3675a115265e32d6357272275358cdb Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:46 +0300 Subject: perf auxtrace: Fix period type 'i' not working PERF_ITRACE_PERIOD_INSTRUCTIONS is zero so it got overwritten by the default period type. Fix by checking if the period type was set rather than if the value was zero when applying the default. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-12-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 83d9dd9..a25b360 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -942,6 +942,7 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, struct itrace_synth_opts *synth_opts = opt->value; const char *p; char *endptr; + bool period_type_set = false; synth_opts->set = true; @@ -970,10 +971,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, case 'i': synth_opts->period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS; + period_type_set = true; break; case 't': synth_opts->period_type = PERF_ITRACE_PERIOD_TICKS; + period_type_set = true; break; case 'm': synth_opts->period *= 1000; @@ -986,6 +989,7 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, goto out_err; synth_opts->period_type = PERF_ITRACE_PERIOD_NANOSECS; + period_type_set = true; break; case '\0': goto out; @@ -1039,7 +1043,7 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, } out: if (synth_opts->instructions) { - if (!synth_opts->period_type) + if (!period_type_set) synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE; if (!synth_opts->period) -- cgit v0.10.2 From 8bd1b2d2578ca2688969352ed1f8a0a8f10dbb63 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:47 +0300 Subject: perf tools: Fix perf-with-kcore handling of arguments containing spaces Fix the perf-with-kcore script so that it doesn't split arguments that contain spaces. 
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-13-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/perf-with-kcore.sh b/tools/perf/perf-with-kcore.sh index c7ff90a..7e47a7c 100644 --- a/tools/perf/perf-with-kcore.sh +++ b/tools/perf/perf-with-kcore.sh @@ -50,7 +50,7 @@ copy_kcore() fi rm -f perf.data.junk - ("$PERF" record -o perf.data.junk $PERF_OPTIONS -- sleep 60) >/dev/null 2>/dev/null & + ("$PERF" record -o perf.data.junk "${PERF_OPTIONS[@]}" -- sleep 60) >/dev/null 2>/dev/null & PERF_PID=$! # Need to make sure that perf has started @@ -160,18 +160,18 @@ record() echo "*** WARNING *** /proc/sys/kernel/kptr_restrict prevents access to kernel addresses" >&2 fi - if echo "$PERF_OPTIONS" | grep -q ' -a \|^-a \| -a$\|^-a$\| --all-cpus \|^--all-cpus \| --all-cpus$\|^--all-cpus$' ; then + if echo "${PERF_OPTIONS[@]}" | grep -q ' -a \|^-a \| -a$\|^-a$\| --all-cpus \|^--all-cpus \| --all-cpus$\|^--all-cpus$' ; then echo "*** WARNING *** system-wide tracing without root access will not be able to read all necessary information from /proc" >&2 fi - if echo "$PERF_OPTIONS" | grep -q 'intel_pt\|intel_bts\| -I\|^-I' ; then + if echo "${PERF_OPTIONS[@]}" | grep -q 'intel_pt\|intel_bts\| -I\|^-I' ; then if [ "$(cat /proc/sys/kernel/perf_event_paranoid)" -gt -1 ] ; then echo "*** WARNING *** /proc/sys/kernel/perf_event_paranoid restricts buffer size and tracepoint (sched_switch) use" >&2 fi - if echo "$PERF_OPTIONS" | grep -q ' --per-thread \|^--per-thread \| --per-thread$\|^--per-thread$' ; then + if echo "${PERF_OPTIONS[@]}" | grep -q ' --per-thread \|^--per-thread \| --per-thread$\|^--per-thread$' ; then true - elif echo "$PERF_OPTIONS" | grep -q ' -t \|^-t \| -t$\|^-t$' ; then + elif echo "${PERF_OPTIONS[@]}" | grep -q ' -t \|^-t \| -t$\|^-t$' ; then true elif [ ! -r /sys/kernel/debug -o ! 
-x /sys/kernel/debug ] ; then echo "*** WARNING *** /sys/kernel/debug permissions prevent tracepoint (sched_switch) use" >&2 @@ -193,8 +193,8 @@ record() mkdir "$PERF_DATA_DIR" - echo "$PERF record -o $PERF_DATA_DIR/perf.data $PERF_OPTIONS -- $*" - "$PERF" record -o "$PERF_DATA_DIR/perf.data" $PERF_OPTIONS -- $* || true + echo "$PERF record -o $PERF_DATA_DIR/perf.data ${PERF_OPTIONS[@]} -- $@" + "$PERF" record -o "$PERF_DATA_DIR/perf.data" "${PERF_OPTIONS[@]}" -- "$@" || true if rmdir "$PERF_DATA_DIR" > /dev/null 2>/dev/null ; then exit 1 @@ -209,8 +209,8 @@ subcommand() { find_perf check_buildid_cache_permissions - echo "$PERF $PERF_SUB_COMMAND -i $PERF_DATA_DIR/perf.data --kallsyms=$PERF_DATA_DIR/kcore_dir/kallsyms $*" - "$PERF" $PERF_SUB_COMMAND -i "$PERF_DATA_DIR/perf.data" "--kallsyms=$PERF_DATA_DIR/kcore_dir/kallsyms" $* + echo "$PERF $PERF_SUB_COMMAND -i $PERF_DATA_DIR/perf.data --kallsyms=$PERF_DATA_DIR/kcore_dir/kallsyms $@" + "$PERF" $PERF_SUB_COMMAND -i "$PERF_DATA_DIR/perf.data" "--kallsyms=$PERF_DATA_DIR/kcore_dir/kallsyms" "$@" } if [ "$1" = "fix_buildid_cache_permissions" ] ; then @@ -234,7 +234,7 @@ fi case "$PERF_SUB_COMMAND" in "record") while [ "$1" != "--" ] ; do - PERF_OPTIONS+="$1 " + PERF_OPTIONS+=("$1") shift || break done if [ "$1" != "--" ] ; then @@ -242,16 +242,16 @@ case "$PERF_SUB_COMMAND" in usage fi shift - record $* + record "$@" ;; "script") - subcommand $* + subcommand "$@" ;; "report") - subcommand $* + subcommand "$@" ;; "inject") - subcommand $* + subcommand "$@" ;; *) usage -- cgit v0.10.2 From 09ff607176ab2bf7e038150100fdf9290a6fbe47 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:49 +0300 Subject: perf tools: Add perf_pmu__format_bits() Add perf_pmu__format_bits() to get the format bits for a PMU config term. Intel PT will use this to validate terms and to record format bits to enable later interpreting the config from the attribute stored in the perf.data file. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-15-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index b615cdf..c548ec8 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -542,7 +542,7 @@ struct perf_pmu *perf_pmu__find(const char *name) } static struct perf_pmu_format * -pmu_find_format(struct list_head *formats, char *name) +pmu_find_format(struct list_head *formats, const char *name) { struct perf_pmu_format *format; @@ -553,6 +553,21 @@ pmu_find_format(struct list_head *formats, char *name) return NULL; } +__u64 perf_pmu__format_bits(struct list_head *formats, const char *name) +{ + struct perf_pmu_format *format = pmu_find_format(formats, name); + __u64 bits = 0; + int fbit; + + if (!format) + return 0; + + for_each_set_bit(fbit, format->bits, PERF_PMU_FORMAT_BITS) + bits |= 1ULL << fbit; + + return bits; +} + /* * Sets value based on the format definition (format parameter) * and unformated value (value parameter). 
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 7b9c8cf..5d7e844 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -54,6 +54,7 @@ int perf_pmu__config_terms(struct list_head *formats, struct perf_event_attr *attr, struct list_head *head_terms, bool zero, struct parse_events_error *error); +__u64 perf_pmu__format_bits(struct list_head *formats, const char *name); int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms, struct perf_pmu_info *info); struct list_head *perf_pmu__alias(struct perf_pmu *pmu, -- cgit v0.10.2 From 0efe6b67690b6546daa0d2f34a17eb3ca46c9dea Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:50 +0300 Subject: perf tools: Validate config term maximum value Currently the value of a PMU config term is silently truncated if it is too big. This is an impediment to validating the value for other criteria later on i.e. the user provides an invalid value that gets truncated to a valid one. The maximum value validation is only done for the parser where the error is passed back to the user. In other cases the silent truncation continues so as not to affect tools that perhaps rely on it. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-16-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index c548ec8..d4b0e64 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -589,6 +589,18 @@ static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, } } +static __u64 pmu_format_max_value(const unsigned long *format) +{ + int w; + + w = bitmap_weight(format, PERF_PMU_FORMAT_BITS); + if (!w) + return 0; + if (w < 64) + return (1ULL << w) - 1; + return -1; +} + /* * Term is a string term, and might be a param-term. Try to look up it's value * in the remaining terms. @@ -662,7 +674,7 @@ static int pmu_config_term(struct list_head *formats, { struct perf_pmu_format *format; __u64 *vp; - __u64 val; + __u64 val, max_val; /* * If this is a parameter we've already used for parameterized-eval, @@ -728,6 +740,22 @@ static int pmu_config_term(struct list_head *formats, } else return -EINVAL; + max_val = pmu_format_max_value(format->bits); + if (val > max_val) { + if (err) { + err->idx = term->err_val; + if (asprintf(&err->str, + "value too big for format, maximum is %llu", + (unsigned long long)max_val) < 0) + err->str = strdup("value too big for format"); + return -EINVAL; + } + /* + * Assume we don't care if !err, in which case the value will be + * silently truncated. + */ + } + pmu_format_value(format->bits, val, vp, zero); return 0; } -- cgit v0.10.2 From 141b2d3161f19a774b3ceaa8faed5e63484a4684 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:51 +0300 Subject: perf tools: Extend the event parser maximum error index Extend the event parser maximum error index from 10 to 13. That allows PMU config terms of up to 10 characters to display un-truncated in the error message. 
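For illustration, a made-up sketch of what such an error-index limit does: it keeps the '^' caret near the left margin by cutting the front of the event string (the real logic lives in parse_events_print_error() and may differ in detail):

    #include <stdio.h>

    static void print_error_caret(const char *str, int err_idx)
    {
            const int max_err_idx = 13;
            int cut = 0;

            if (err_idx > max_err_idx)
                    cut = err_idx - max_err_idx;

            /* Prefix the cut string with "..." and reposition the
             * caret accordingly. */
            fprintf(stderr, "%s%s\n", cut ? "..." : "", str + cut);
            fprintf(stderr, "%*s^\n", (cut ? 3 : 0) + err_idx - cut, "");
    }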
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-17-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index a6cb9af..828936d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1168,7 +1168,7 @@ static void parse_events_print_error(struct parse_events_error *err, * Maximum error index indent, we will cut * the event string if it's bigger. */ - int max_err_idx = 10; + int max_err_idx = 13; /* * Let's be specific with the message when -- cgit v0.10.2 From 1b76c13e4b36f978ea45af079f38ad423a229630 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:51 +0000 Subject: bpf tools: Introduce 'bpf' library and add bpf feature check This is the first patch of libbpf. The goal of libbpf is to create a standard way of accessing eBPF object files. This patch creates 'Makefile' and 'Build' for it, allowing 'make' to build libbpf.a and libbpf.so and 'make install' to put them into the proper directories. Most of the Makefile is borrowed from traceevent. Before building, the Makefile checks for the existence of libelf and refuses to build if it is not found. Instead of throwing an error when libelf is missing, the error is raised in a phony target "elfdep". This design ensures that 'make clean' still works even if libelf is not found. Because libbpf requires the 'kern_version' field to be set in 'union bpf_attr' ("bpfdep" is used for that dependency), the kernel BPF API is also checked, by introducing a new feature check 'bpf' into tools/build/feature, which checks the existence and version of linux/bpf.h. When building libbpf, it searches for that file in include/uapi/linux of the kernel source tree it resides in (controlled by FEATURE_CHECK_CFLAGS-bpf). Since it searches the kernel source tree, installing the newest kernel headers is not required, unless we are trying to port these files to an old kernel. To avoid checking that file when building perf, the newly introduced 'bpf' feature check is not added to FEATURE_TESTS and FEATURE_DISPLAY by default in tools/build/Makefile.feature, but only to libbpf's own feature lists.
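For illustration, a sketch of why 'kern_version' forces the feature check: the kernel rejects kprobe-type programs unless the load attribute carries the running kernel's version code, and only a sufficiently new linux/bpf.h has that field. The loader below is made up (libbpf does not load programs yet at this point in the series) and assumes a kernel that provides the bpf(2) syscall:

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>
    #include <linux/version.h>

    static int prog_load_sketch(const struct bpf_insn *insns, int insn_cnt)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.prog_type = BPF_PROG_TYPE_KPROBE;
            attr.insns = (unsigned long)insns;
            attr.insn_cnt = insn_cnt;
            attr.license = (unsigned long)"GPL";
            /* Without this matching the running kernel, kprobe
             * programs are refused. */
            attr.kern_version = LINUX_VERSION_CODE;

            return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    }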
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Bcc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-4-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 463ed8f..1c0d69f 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -33,7 +33,8 @@ FILES= \ test-compile-32.bin \ test-compile-x32.bin \ test-zlib.bin \ - test-lzma.bin + test-lzma.bin \ + test-bpf.bin CC := $(CROSS_COMPILE)gcc -MD PKG_CONFIG := $(CROSS_COMPILE)pkg-config @@ -156,6 +157,9 @@ test-zlib.bin: test-lzma.bin: $(BUILD) -llzma +test-bpf.bin: + $(BUILD) + -include *.d ############################### diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c new file mode 100644 index 0000000..062bac8 --- /dev/null +++ b/tools/build/feature/test-bpf.c @@ -0,0 +1,18 @@ +#include + +int main(void) +{ + union bpf_attr attr; + + attr.prog_type = BPF_PROG_TYPE_KPROBE; + attr.insn_cnt = 0; + attr.insns = 0; + attr.license = 0; + attr.log_buf = 0; + attr.log_size = 0; + attr.log_level = 0; + attr.kern_version = 0; + + attr = attr; + return 0; +} diff --git a/tools/lib/bpf/.gitignore b/tools/lib/bpf/.gitignore new file mode 100644 index 0000000..812aeed --- /dev/null +++ b/tools/lib/bpf/.gitignore @@ -0,0 +1,2 @@ +libbpf_version.h +FEATURE-DUMP diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build new file mode 100644 index 0000000..a316484 --- /dev/null +++ b/tools/lib/bpf/Build @@ -0,0 +1 @@ +libbpf-y := libbpf.o diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile new file mode 100644 index 0000000..f68d23a --- /dev/null +++ b/tools/lib/bpf/Makefile @@ -0,0 +1,195 @@ +# Most of this file is copied from tools/lib/traceevent/Makefile + +BPF_VERSION = 0 +BPF_PATCHLEVEL = 0 +BPF_EXTRAVERSION = 1 + +MAKEFLAGS += --no-print-directory + + +# Makefiles suck: This macro sets a default value of $(2) for the +# variable named by $(1), unless the variable has been set by +# environment or command line. This is necessary for CC and AR +# because make sets default values, so the simpler ?= approach +# won't work as expected. +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix. +$(call allow-override,CC,$(CROSS_COMPILE)gcc) +$(call allow-override,AR,$(CROSS_COMPILE)ar) + +INSTALL = install + +# Use DESTDIR for installing into a different root directory. +# This is useful for building a package. The program will be +# installed in this directory as if it was the root directory. +# Then the build tool can move it later. 
+DESTDIR ?= +DESTDIR_SQ = '$(subst ','\'',$(DESTDIR))' + +LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1) +ifeq ($(LP64), 1) + libdir_relative = lib64 +else + libdir_relative = lib +endif + +prefix ?= /usr/local +libdir = $(prefix)/$(libdir_relative) +man_dir = $(prefix)/share/man +man_dir_SQ = '$(subst ','\'',$(man_dir))' + +export man_dir man_dir_SQ INSTALL +export DESTDIR DESTDIR_SQ + +include ../../scripts/Makefile.include + +# copy a bit from Linux kbuild + +ifeq ("$(origin V)", "command line") + VERBOSE = $(V) +endif +ifndef VERBOSE + VERBOSE = 0 +endif + +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(shell pwd))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +#$(info Determined 'srctree' to be $(srctree)) +endif + +FEATURE_DISPLAY = libelf libelf-getphdrnum libelf-mmap bpf +FEATURE_TESTS = libelf bpf + +INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/arch/$(ARCH)/include/uapi -I$(srctree)/include/uapi +FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES) + +include $(srctree)/tools/build/Makefile.feature + +export prefix libdir src obj + +# Shell quotes +libdir_SQ = $(subst ','\'',$(libdir)) +libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) +plugin_dir_SQ = $(subst ','\'',$(plugin_dir)) + +LIB_FILE = libbpf.a libbpf.so + +VERSION = $(BPF_VERSION) +PATCHLEVEL = $(BPF_PATCHLEVEL) +EXTRAVERSION = $(BPF_EXTRAVERSION) + +OBJ = $@ +N = + +LIBBPF_VERSION = $(BPF_VERSION).$(BPF_PATCHLEVEL).$(BPF_EXTRAVERSION) + +# Set compile option CFLAGS +ifdef EXTRA_CFLAGS + CFLAGS := $(EXTRA_CFLAGS) +else + CFLAGS := -g -Wall +endif + +ifeq ($(feature-libelf-mmap), 1) + override CFLAGS += -DHAVE_LIBELF_MMAP_SUPPORT +endif + +ifeq ($(feature-libelf-getphdrnum), 1) + override CFLAGS += -DHAVE_ELF_GETPHDRNUM_SUPPORT +endif + +# Append required CFLAGS +override CFLAGS += $(EXTRA_WARNINGS) +override CFLAGS += -Werror -Wall +override CFLAGS += -fPIC +override CFLAGS += $(INCLUDES) + +ifeq ($(VERBOSE),1) + Q = +else + Q = @ +endif + +# Disable command line variables (CFLAGS) overide from top +# level Makefile (perf), otherwise build Makefile will get +# the same command line setup. +MAKEOVERRIDES= + +export srctree OUTPUT CC LD CFLAGS V +build := -f $(srctree)/tools/build/Makefile.build dir=. obj + +BPF_IN := $(OUTPUT)libbpf-in.o +LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) + +CMD_TARGETS = $(LIB_FILE) + +TARGETS = $(CMD_TARGETS) + +all: $(VERSION_FILES) all_cmd + +all_cmd: $(CMD_TARGETS) + +$(BPF_IN): force elfdep bpfdep + $(Q)$(MAKE) $(build)=libbpf + +$(OUTPUT)libbpf.so: $(BPF_IN) + $(QUIET_LINK)$(CC) --shared $^ -o $@ + +$(OUTPUT)libbpf.a: $(BPF_IN) + $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^ + +define update_dir + (echo $1 > $@.tmp; \ + if [ -r $@ ] && cmp -s $@ $@.tmp; then \ + rm -f $@.tmp; \ + else \ + echo ' UPDATE $@'; \ + mv -f $@.tmp $@; \ + fi); +endef + +define do_install + if [ ! 
-d '$(DESTDIR_SQ)$2' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \ + fi; \ + $(INSTALL) $1 '$(DESTDIR_SQ)$2' +endef + +install_lib: all_cmd + $(call QUIET_INSTALL, $(LIB_FILE)) \ + $(call do_install,$(LIB_FILE),$(libdir_SQ)) + +install: install_lib + +### Cleaning rules + +config-clean: + $(call QUIET_CLEAN, config) + $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null + +clean: + $(call QUIET_CLEAN, libbpf) $(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d \ + $(RM) LIBBPF-CFLAGS + $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP + + + +PHONY += force elfdep bpfdep +force: + +elfdep: + @if [ "$(feature-libelf)" != "1" ]; then echo "No libelf found"; exit -1 ; fi + +bpfdep: + @if [ "$(feature-bpf)" != "1" ]; then echo "BPF API too old"; exit -1 ; fi + +# Declare the contents of the .PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. +.PHONY: $(PHONY) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c new file mode 100644 index 0000000..c08d6bc --- /dev/null +++ b/tools/lib/bpf/libbpf.c @@ -0,0 +1,14 @@ +/* + * Common eBPF ELF object loading operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + */ + +#include +#include +#include +#include + +#include "libbpf.h" diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h new file mode 100644 index 0000000..a6f46d9 --- /dev/null +++ b/tools/lib/bpf/libbpf.h @@ -0,0 +1,11 @@ +/* + * Common eBPF ELF object loading operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + */ +#ifndef __BPF_LIBBPF_H +#define __BPF_LIBBPF_H + +#endif -- cgit v0.10.2 From b3f59d66e22b8be4ccae67c8eaffa2cbb9e54eb1 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:52 +0000 Subject: bpf tools: Allow caller to set printing function By libbpf_set_print(), users of libbpf are allowed to register their own debug, info and warning printing functions. Libbpf will use those functions to print messages. If not provided, the default info and warning printing functions are fprintf(stderr, ...); the default debug printing is NULL. This API is designed to be used by perf, enabling it to register its own logging functions so that all logs are uniform, instead of having separate logging level controls. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-5-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c08d6bc..6f0c13a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7,8 +7,48 @@ */ #include +#include +#include +#include #include #include #include #include "libbpf.h" + +#define __printf(a, b) __attribute__((format(printf, a, b))) + +__printf(1, 2) +static int __base_pr(const char *format, ...) +{ + va_list args; + int err; + + va_start(args, format); + err = vfprintf(stderr, format, args); + va_end(args); + return err; +} + +static __printf(1, 2) libbpf_print_fn_t __pr_warning = __base_pr; +static __printf(1, 2) libbpf_print_fn_t __pr_info = __base_pr; +static __printf(1, 2) libbpf_print_fn_t __pr_debug; + +#define __pr(func, fmt, ...)
\ +do { \ + if ((func)) \ + (func)("libbpf: " fmt, ##__VA_ARGS__); \ +} while (0) + +#define pr_warning(fmt, ...) __pr(__pr_warning, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) __pr(__pr_info, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) __pr(__pr_debug, fmt, ##__VA_ARGS__) + +void libbpf_set_print(libbpf_print_fn_t warn, + libbpf_print_fn_t info, + libbpf_print_fn_t debug) +{ + __pr_warning = warn; + __pr_info = info; + __pr_debug = debug; +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index a6f46d9..8d1eeba 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -8,4 +8,16 @@ #ifndef __BPF_LIBBPF_H #define __BPF_LIBBPF_H +/* + * In include/linux/compiler-gcc.h, __printf is defined. However + * it should be better if libbpf.h doesn't depend on Linux header file. + * So instead of __printf, here we use gcc attribute directly. + */ +typedef int (*libbpf_print_fn_t)(const char *, ...) + __attribute__((format(printf, 1, 2))); + +void libbpf_set_print(libbpf_print_fn_t warn, + libbpf_print_fn_t info, + libbpf_print_fn_t debug); + #endif -- cgit v0.10.2 From 1a5e3fb1e9a4a3e3695cd57c33c169a90d16fd3b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:53 +0000 Subject: bpf tools: Open eBPF object file and do basic validation This patch defines the basic interface of libbpf. 'struct bpf_object' will be the handle for each object file. Its internal structure is hidden from the user. eBPF object files are compiled by LLVM in ELF format. In this patch, libelf is used to open those files, read the EHDR and do basic validation according to e_type and e_machine. All ELF-related state is grouped together and resides in the efile field of 'struct bpf_object'. bpf_object__elf_finish() is introduced to clear it. After all eBPF programs in an object file are loaded, the related ELF information is no longer needed, so the object file is closed and that memory freed. The zfree() and zclose() functions are introduced to ensure pointers are set to NULL and file descriptors to negative values after resources are released. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-6-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6f0c13a..9e44608 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11,8 +11,12 @@ #include #include #include +#include +#include #include #include +#include +#include #include "libbpf.h" @@ -52,3 +56,157 @@ void libbpf_set_print(libbpf_print_fn_t warn, __pr_info = info; __pr_debug = debug; } + +/* Copied from tools/perf/util/util.h */ +#ifndef zfree +# define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) +#endif + +#ifndef zclose +# define zclose(fd) ({ \ + int ___err = 0; \ + if ((fd) >= 0) \ + ___err = close((fd)); \ + fd = -1; \ + ___err; }) +#endif + +#ifdef HAVE_LIBELF_MMAP_SUPPORT +# define LIBBPF_ELF_C_READ_MMAP ELF_C_READ_MMAP +#else +# define LIBBPF_ELF_C_READ_MMAP ELF_C_READ +#endif + +struct bpf_object { + /* + * Information when doing elf related work. Only valid if fd + * is valid.
+ */ + struct { + int fd; + Elf *elf; + GElf_Ehdr ehdr; + } efile; + char path[]; +}; +#define obj_elf_valid(o) ((o)->efile.elf) + +static struct bpf_object *bpf_object__new(const char *path) +{ + struct bpf_object *obj; + + obj = calloc(1, sizeof(struct bpf_object) + strlen(path) + 1); + if (!obj) { + pr_warning("alloc memory failed for %s\n", path); + return NULL; + } + + strcpy(obj->path, path); + obj->efile.fd = -1; + return obj; +} + +static void bpf_object__elf_finish(struct bpf_object *obj) +{ + if (!obj_elf_valid(obj)) + return; + + if (obj->efile.elf) { + elf_end(obj->efile.elf); + obj->efile.elf = NULL; + } + zclose(obj->efile.fd); +} + +static int bpf_object__elf_init(struct bpf_object *obj) +{ + int err = 0; + GElf_Ehdr *ep; + + if (obj_elf_valid(obj)) { + pr_warning("elf init: internal error\n"); + return -EEXIST; + } + + obj->efile.fd = open(obj->path, O_RDONLY); + if (obj->efile.fd < 0) { + pr_warning("failed to open %s: %s\n", obj->path, + strerror(errno)); + return -errno; + } + + obj->efile.elf = elf_begin(obj->efile.fd, + LIBBPF_ELF_C_READ_MMAP, + NULL); + if (!obj->efile.elf) { + pr_warning("failed to open %s as ELF file\n", + obj->path); + err = -EINVAL; + goto errout; + } + + if (!gelf_getehdr(obj->efile.elf, &obj->efile.ehdr)) { + pr_warning("failed to get EHDR from %s\n", + obj->path); + err = -EINVAL; + goto errout; + } + ep = &obj->efile.ehdr; + + if ((ep->e_type != ET_REL) || (ep->e_machine != 0)) { + pr_warning("%s is not an eBPF object file\n", + obj->path); + err = -EINVAL; + goto errout; + } + + return 0; +errout: + bpf_object__elf_finish(obj); + return err; +} + +static struct bpf_object * +__bpf_object__open(const char *path) +{ + struct bpf_object *obj; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warning("failed to init libelf for %s\n", path); + return NULL; + } + + obj = bpf_object__new(path); + if (!obj) + return NULL; + + if (bpf_object__elf_init(obj)) + goto out; + + bpf_object__elf_finish(obj); + return obj; +out: + bpf_object__close(obj); + return NULL; +} + +struct bpf_object *bpf_object__open(const char *path) +{ + /* param validation */ + if (!path) + return NULL; + + pr_debug("loading %s\n", path); + + return __bpf_object__open(path); +} + +void bpf_object__close(struct bpf_object *obj) +{ + if (!obj) + return; + + bpf_object__elf_finish(obj); + + free(obj); +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 8d1eeba..ec3301c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -8,6 +8,8 @@ #ifndef __BPF_LIBBPF_H #define __BPF_LIBBPF_H +#include + /* * In include/linux/compiler-gcc.h, __printf is defined. However * it should be better if libbpf.h doesn't depend on Linux header file. @@ -20,4 +22,10 @@ void libbpf_set_print(libbpf_print_fn_t warn, libbpf_print_fn_t info, libbpf_print_fn_t debug); +/* Hide internal to user */ +struct bpf_object; + +struct bpf_object *bpf_object__open(const char *path); +void bpf_object__close(struct bpf_object *object); + #endif -- cgit v0.10.2 From 6c956392b0486c170e382476ab9dd97a696fd169 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:54 +0000 Subject: bpf tools: Read eBPF object from buffer To support dynamic compiling, this patch allows the caller to pass an in-memory buffer to libbpf via bpf_object__open_buffer(). libbpf calls elf_memory() to open it as an ELF object file. Because __bpf_object__open() collects all required data and won't need that buffer anymore, libbpf uses that buffer directly instead of cloning a new one.
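For illustration, a made-up usage sketch of the buffer API; only bpf_object__open_buffer() and bpf_object__close() from this series are assumed:

    #include <stdlib.h>
    #include "libbpf.h"

    /* "elf_buf"/"elf_len" stand for an ELF image produced elsewhere,
     * for example by an in-process LLVM compile. */
    int load_from_memory(void *elf_buf, size_t elf_len)
    {
            struct bpf_object *obj;

            obj = bpf_object__open_buffer(elf_buf, elf_len);
            if (!obj)
                    return -1;

            /* The buffer may be freed or reused from here on. */
            free(elf_buf);

            bpf_object__close(obj);
            return 0;
    }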
Callers of libbpf can free that buffer or use it for other things after bpf_object__open_buffer() returns. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-7-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9e44608..36dfbc1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -84,6 +84,8 @@ struct bpf_object { */ struct { int fd; + void *obj_buf; + size_t obj_buf_sz; Elf *elf; GElf_Ehdr ehdr; } efile; @@ -91,7 +93,9 @@ struct bpf_object { }; #define obj_elf_valid(o) ((o)->efile.elf) -static struct bpf_object *bpf_object__new(const char *path) +static struct bpf_object *bpf_object__new(const char *path, + void *obj_buf, + size_t obj_buf_sz) { struct bpf_object *obj; @@ -103,6 +107,16 @@ static struct bpf_object *bpf_object__new(const char *path) strcpy(obj->path, path); obj->efile.fd = -1; + + /* + * The caller of this function should also call + * bpf_object__elf_finish() after data collection to return + * obj_buf to the user. Otherwise we would have to duplicate + * the buffer to avoid the user freeing it before elf finish. + */ + obj->efile.obj_buf = obj_buf; + obj->efile.obj_buf_sz = obj_buf_sz; + return obj; } @@ -116,6 +130,8 @@ static void bpf_object__elf_finish(struct bpf_object *obj) obj->efile.elf = NULL; } zclose(obj->efile.fd); + obj->efile.obj_buf = NULL; + obj->efile.obj_buf_sz = 0; } static int bpf_object__elf_init(struct bpf_object *obj) @@ -128,16 +144,26 @@ static int bpf_object__elf_init(struct bpf_object *obj) return -EEXIST; } - obj->efile.fd = open(obj->path, O_RDONLY); - if (obj->efile.fd < 0) { - pr_warning("failed to open %s: %s\n", obj->path, - strerror(errno)); - return -errno; + if (obj->efile.obj_buf_sz > 0) { + /* + * obj_buf should have been validated by + * bpf_object__open_buffer().
+ */ + obj->efile.elf = elf_memory(obj->efile.obj_buf, + obj->efile.obj_buf_sz); + } else { + obj->efile.fd = open(obj->path, O_RDONLY); + if (obj->efile.fd < 0) { + pr_warning("failed to open %s: %s\n", obj->path, + strerror(errno)); + return -errno; + } + + obj->efile.elf = elf_begin(obj->efile.fd, + LIBBPF_ELF_C_READ_MMAP, + NULL); } - obj->efile.elf = elf_begin(obj->efile.fd, - LIBBPF_ELF_C_READ_MMAP, - NULL); if (!obj->efile.elf) { pr_warning("failed to open %s as ELF file\n", obj->path); @@ -167,7 +193,7 @@ errout: } static struct bpf_object * -__bpf_object__open(const char *path) +__bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz) { struct bpf_object *obj; @@ -176,7 +202,7 @@ __bpf_object__open(const char *path) return NULL; } - obj = bpf_object__new(path); + obj = bpf_object__new(path, obj_buf, obj_buf_sz); if (!obj) return NULL; @@ -198,7 +224,19 @@ struct bpf_object *bpf_object__open(const char *path) pr_debug("loading %s\n", path); - return __bpf_object__open(path); + return __bpf_object__open(path, NULL, 0); +} + +struct bpf_object *bpf_object__open_buffer(void *obj_buf, + size_t obj_buf_sz) +{ + /* param validation */ + if (!obj_buf || obj_buf_sz <= 0) + return NULL; + + pr_debug("loading object from buffer\n"); + + return __bpf_object__open("[buffer]", obj_buf, obj_buf_sz); } void bpf_object__close(struct bpf_object *obj) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index ec3301c..dc966dd 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -26,6 +26,8 @@ void libbpf_set_print(libbpf_print_fn_t warn, struct bpf_object; struct bpf_object *bpf_object__open(const char *path); +struct bpf_object *bpf_object__open_buffer(void *obj_buf, + size_t obj_buf_sz); void bpf_object__close(struct bpf_object *object); #endif -- cgit v0.10.2 From cc4228d57c4c35ae0a29efa1e7687a817dc038d9 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:55 +0000 Subject: bpf tools: Check endianness and make libbpf fail early Check endianness according to EHDR. Code is taken from tools/perf/util/symbol-elf.c. Libbpf doesn't magically convert missmatched endianness. Even if we swap eBPF instructions to correct byte order, we are unable to deal with endianness in code logical generated by LLVM. Therefore, libbpf should simply reject missmatched ELF object, and let LLVM to create good code. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 36dfbc1..15b3e82 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -192,6 +192,34 @@ errout: return err; } +static int +bpf_object__check_endianness(struct bpf_object *obj) +{ + static unsigned int const endian = 1; + + switch (obj->efile.ehdr.e_ident[EI_DATA]) { + case ELFDATA2LSB: + /* We are big endian, BPF obj is little endian. */ + if (*(unsigned char const *)&endian != 1) + goto mismatch; + break; + + case ELFDATA2MSB: + /* We are little endian, BPF obj is big endian. 
*/ + if (*(unsigned char const *)&endian != 0) + goto mismatch; + break; + default: + return -EINVAL; + } + + return 0; + +mismatch: + pr_warning("Error: endianness mismatch.\n"); + return -EINVAL; +} + static struct bpf_object * __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz) { @@ -208,6 +236,8 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz) if (bpf_object__elf_init(obj)) goto out; + if (bpf_object__check_endianness(obj)) + goto out; bpf_object__elf_finish(obj); return obj; -- cgit v0.10.2 From 296036653ae8b1367ec9d06d65377c2e2371b153 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:56 +0000 Subject: bpf tools: Iterate over ELF sections to collect information bpf_obj_elf_collect() is introduced to iterate over each elf sections to collection information in eBPF object files. This function will futher enhanced to collect license, kernel version, programs, configs and map information. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-9-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 15b3e82..d8d6eb5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -220,6 +220,57 @@ mismatch: return -EINVAL; } +static int bpf_object__elf_collect(struct bpf_object *obj) +{ + Elf *elf = obj->efile.elf; + GElf_Ehdr *ep = &obj->efile.ehdr; + Elf_Scn *scn = NULL; + int idx = 0, err = 0; + + /* Elf is corrupted/truncated, avoid calling elf_strptr. */ + if (!elf_rawdata(elf_getscn(elf, ep->e_shstrndx), NULL)) { + pr_warning("failed to get e_shstrndx from %s\n", + obj->path); + return -EINVAL; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + char *name; + GElf_Shdr sh; + Elf_Data *data; + + idx++; + if (gelf_getshdr(scn, &sh) != &sh) { + pr_warning("failed to get section header from %s\n", + obj->path); + err = -EINVAL; + goto out; + } + + name = elf_strptr(elf, ep->e_shstrndx, sh.sh_name); + if (!name) { + pr_warning("failed to get section name from %s\n", + obj->path); + err = -EINVAL; + goto out; + } + + data = elf_getdata(scn, 0); + if (!data) { + pr_warning("failed to get section data from %s(%s)\n", + name, obj->path); + err = -EINVAL; + goto out; + } + pr_debug("section %s, size %ld, link %d, flags %lx, type=%d\n", + name, (unsigned long)data->d_size, + (int)sh.sh_link, (unsigned long)sh.sh_flags, + (int)sh.sh_type); + } +out: + return err; +} + static struct bpf_object * __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz) { @@ -238,6 +289,8 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz) goto out; if (bpf_object__check_endianness(obj)) goto out; + if (bpf_object__elf_collect(obj)) + goto out; bpf_object__elf_finish(obj); return obj; -- cgit v0.10.2 From cb1e5e961991ee9b2cbfd3bf06ef490ea578cd2f Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:57 +0000 Subject: bpf tools: Collect version and license from ELF sections Expand bpf_obj_elf_collect() to collect license and kernel version information in eBPF object file. eBPF object file should have a section named 'license', which contains a string. It should also have a section named 'version', contains a u32 LINUX_VERSION_CODE. 
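For reference, a BPF C source following this convention defines the two sections roughly like this (a sketch in the style of samples/bpf; the SEC() macro is an assumption of that convention, not something this patch provides):

	#include <linux/version.h>

	#define SEC(name) __attribute__((section(name), used))

	char _license[] SEC("license") = "GPL";
	unsigned int _version SEC("version") = LINUX_VERSION_CODE;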
bpf_object__validate() is introduced to validate the object file after it has been loaded. Currently it only checks for the existence of the 'version' section. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-10-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d8d6eb5..95c8d8e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -78,6 +79,8 @@ void libbpf_set_print(libbpf_print_fn_t warn, #endif struct bpf_object { + char license[64]; + u32 kern_version; /* * Information when doing elf related work. Only valid if fd * is valid. @@ -220,6 +223,33 @@ mismatch: return -EINVAL; } +static int +bpf_object__init_license(struct bpf_object *obj, + void *data, size_t size) +{ + memcpy(obj->license, data, + min(size, sizeof(obj->license) - 1)); + pr_debug("license of %s is %s\n", obj->path, obj->license); + return 0; +} + +static int +bpf_object__init_kversion(struct bpf_object *obj, + void *data, size_t size) +{ + u32 kver; + + if (size != sizeof(kver)) { + pr_warning("invalid kver section in %s\n", obj->path); + return -EINVAL; + } + memcpy(&kver, data, sizeof(kver)); + obj->kern_version = kver; + pr_debug("kernel version of %s is %x\n", obj->path, + obj->kern_version); + return 0; +} + static int bpf_object__elf_collect(struct bpf_object *obj) { Elf *elf = obj->efile.elf; @@ -266,11 +296,32 @@ static int bpf_object__elf_collect(struct bpf_object *obj) name, (unsigned long)data->d_size, (int)sh.sh_link, (unsigned long)sh.sh_flags, (int)sh.sh_type); + + if (strcmp(name, "license") == 0) + err = bpf_object__init_license(obj, + data->d_buf, + data->d_size); + else if (strcmp(name, "version") == 0) + err = bpf_object__init_kversion(obj, + data->d_buf, + data->d_size); + if (err) + goto out; } out: return err; } +static int bpf_object__validate(struct bpf_object *obj) +{ + if (obj->kern_version == 0) { + pr_warning("%s doesn't provide kernel version\n", + obj->path); + return -EINVAL; + } + return 0; +} + static struct bpf_object * __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz) { @@ -291,6 +342,8 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz) goto out; if (bpf_object__elf_collect(obj)) goto out; + if (bpf_object__validate(obj)) + goto out; bpf_object__elf_finish(obj); return obj; -- cgit v0.10.2 From 0b3d1efade1b7e2ccaf79c40a7481c93cfb5090a Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:58 +0000 Subject: bpf tools: Collect map definitions from 'maps' section If maps are used by eBPF programs, the corresponding object file(s) should contain a section named 'maps', which contains the map definitions. This patch copies the data of the whole section. Map data parsing is deferred until just before map loading.
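A sketch of how the copied buffer is meant to be interpreted later (this is the arithmetic a later patch in this series uses at map creation time):

	/* The 'maps' section is a flat array of fixed-size records. */
	size_t nr_maps = obj->maps_buf_sz / sizeof(struct bpf_map_def);
	struct bpf_map_def *def = (struct bpf_map_def *)obj->maps_buf;

Until then, obj->maps_buf holds nothing but the raw section bytes.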
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-11-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 95c8d8e..87f5054a9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -81,6 +81,9 @@ void libbpf_set_print(libbpf_print_fn_t warn, struct bpf_object { char license[64]; u32 kern_version; + void *maps_buf; + size_t maps_buf_sz; + /* * Information when doing elf related work. Only valid if fd * is valid. @@ -250,6 +253,28 @@ bpf_object__init_kversion(struct bpf_object *obj, return 0; } +static int +bpf_object__init_maps(struct bpf_object *obj, void *data, + size_t size) +{ + if (size == 0) { + pr_debug("%s doesn't need map definition\n", + obj->path); + return 0; + } + + obj->maps_buf = malloc(size); + if (!obj->maps_buf) { + pr_warning("malloc maps failed: %s\n", obj->path); + return -ENOMEM; + } + + obj->maps_buf_sz = size; + memcpy(obj->maps_buf, data, size); + pr_debug("maps in %s: %ld bytes\n", obj->path, (long)size); + return 0; +} + static int bpf_object__elf_collect(struct bpf_object *obj) { Elf *elf = obj->efile.elf; @@ -305,6 +330,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj) err = bpf_object__init_kversion(obj, data->d_buf, data->d_size); + else if (strcmp(name, "maps") == 0) + err = bpf_object__init_maps(obj, data->d_buf, + data->d_size); if (err) goto out; } @@ -382,5 +410,6 @@ void bpf_object__close(struct bpf_object *obj) bpf_object__elf_finish(obj); + zfree(&obj->maps_buf); free(obj); } -- cgit v0.10.2 From bec7d68cb561e94f8a44c2b73c468b534c05f20d Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:13:59 +0000 Subject: bpf tools: Collect symbol table from SHT_SYMTAB section This patch collects symbols section. This section is useful when linking BPF maps. What 'bpf_map_xxx()' functions actually require are map's file descriptors (and the internal verifier converts fds into pointers to 'struct bpf_map'), which we don't know when compiling. Therefore, we should make compiler generate a 'ldr_64 r1, ' instruction, and fill the 'imm' field with the actual file descriptor when loading in libbpf. BPF programs should be written in this way: struct bpf_map_def SEC("maps") my_map = { .type = BPF_MAP_TYPE_HASH, .key_size = sizeof(unsigned long), .value_size = sizeof(unsigned long), .max_entries = 1000000, }; SEC("my_func=sys_write") int my_func(void *ctx) { ... bpf_map_update_elem(&my_map, &key, &value, BPF_ANY); ... } Compiler should convert '&my_map' into a 'ldr_64, r1, ' instruction, where imm should be the address of 'my_map'. According to the address, libbpf knows which map it actually referenced, and then fills the imm field with the 'fd' of that map created by it. However, since we never really 'link' the object file, the imm field is only a record in relocation section. Therefore libbpf should do the relocation: 1. In relocation section (type == SHT_REL), positions of each such 'ldr_64' instruction are recorded with a reference of an entry in symbol table (SHT_SYMTAB); 2. From records in symbol table we can find the indics of map variables. Libbpf first record SHT_SYMTAB and positions of each instruction which required bu such operation. Then create file descriptor. 
Finally, after map creation completes, it replaces the imm field. This is the first patch of the BPF map related work. It records SHT_SYMTAB into the object's efile field for further use. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-12-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 87f5054a9..9b016c0 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -94,6 +94,7 @@ struct bpf_object { size_t obj_buf_sz; Elf *elf; GElf_Ehdr ehdr; + Elf_Data *symbols; } efile; char path[]; }; @@ -135,6 +136,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj) elf_end(obj->efile.elf); obj->efile.elf = NULL; } + obj->efile.symbols = NULL; zclose(obj->efile.fd); obj->efile.obj_buf = NULL; obj->efile.obj_buf_sz = 0; @@ -333,6 +335,14 @@ static int bpf_object__elf_collect(struct bpf_object *obj) else if (strcmp(name, "maps") == 0) err = bpf_object__init_maps(obj, data->d_buf, data->d_size); + else if (sh.sh_type == SHT_SYMTAB) { + if (obj->efile.symbols) { + pr_warning("bpf: multiple SYMTAB in %s\n", + obj->path); + err = -EEXIST; + } else + obj->efile.symbols = data; + } if (err) goto out; } -- cgit v0.10.2 From a5b8bd47dcc574cd3a71357b3a0f586969e4b887 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:00 +0000 Subject: bpf tools: Collect eBPF programs from their own sections This patch collects all programs in an object file into an array of 'struct bpf_program' for further processing. That structure represents each eBPF program. 'bpf_prog' would be a better name, but it is already used by linux/filter.h. Although it is a kernel space name, I still prefer to call it 'bpf_program' to prevent possible confusion. bpf_object__add_program() creates a new 'struct bpf_program' object. It first initializes a variable on the stack using bpf_program__init(), then, on success, enlarges the obj->programs array and copies the new object in. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-13-git-send-email-wangnan0@huawei.com [ Made bpf_object__add_program() propagate the error (-EINVAL or -ENOMEM) ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9b016c0..102156f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -78,12 +78,27 @@ void libbpf_set_print(libbpf_print_fn_t warn, # define LIBBPF_ELF_C_READ_MMAP ELF_C_READ #endif +/* + * bpf_prog would be a better name but it is already used in + * linux/filter.h. + */ +struct bpf_program { + /* Index in elf obj file, for relocation use. */ + int idx; + char *section_name; + struct bpf_insn *insns; + size_t insns_cnt; +}; + struct bpf_object { char license[64]; u32 kern_version; void *maps_buf; size_t maps_buf_sz; + struct bpf_program *programs; + size_t nr_programs; + /* * Information when doing elf related work.
@@ -100,6 +115,85 @@ struct bpf_object { }; #define obj_elf_valid(o) ((o)->efile.elf) +static void bpf_program__exit(struct bpf_program *prog) +{ + if (!prog) + return; + + zfree(&prog->section_name); + zfree(&prog->insns); + prog->insns_cnt = 0; + prog->idx = -1; +} + +static int +bpf_program__init(void *data, size_t size, char *name, int idx, + struct bpf_program *prog) +{ + if (size < sizeof(struct bpf_insn)) { + pr_warning("corrupted section '%s'\n", name); + return -EINVAL; + } + + bzero(prog, sizeof(*prog)); + + prog->section_name = strdup(name); + if (!prog->section_name) { + pr_warning("failed to alloc name for prog %s\n", + name); + goto errout; + } + + prog->insns = malloc(size); + if (!prog->insns) { + pr_warning("failed to alloc insns for %s\n", name); + goto errout; + } + prog->insns_cnt = size / sizeof(struct bpf_insn); + memcpy(prog->insns, data, + prog->insns_cnt * sizeof(struct bpf_insn)); + prog->idx = idx; + + return 0; +errout: + bpf_program__exit(prog); + return -ENOMEM; +} + +static int +bpf_object__add_program(struct bpf_object *obj, void *data, size_t size, + char *name, int idx) +{ + struct bpf_program prog, *progs; + int nr_progs, err; + + err = bpf_program__init(data, size, name, idx, &prog); + if (err) + return err; + + progs = obj->programs; + nr_progs = obj->nr_programs; + + progs = realloc(progs, sizeof(progs[0]) * (nr_progs + 1)); + if (!progs) { + /* + * In this case the original obj->programs + * is still valid, so don't need special treat for + * bpf_close_object(). + */ + pr_warning("failed to alloc a new program '%s'\n", + name); + bpf_program__exit(&prog); + return -ENOMEM; + } + + pr_debug("found program %s\n", prog.section_name); + obj->programs = progs; + obj->nr_programs = nr_progs + 1; + progs[nr_progs] = prog; + return 0; +} + static struct bpf_object *bpf_object__new(const char *path, void *obj_buf, size_t obj_buf_sz) @@ -342,6 +436,17 @@ static int bpf_object__elf_collect(struct bpf_object *obj) err = -EEXIST; } else obj->efile.symbols = data; + } else if ((sh.sh_type == SHT_PROGBITS) && + (sh.sh_flags & SHF_EXECINSTR) && + (data->d_size > 0)) { + err = bpf_object__add_program(obj, data->d_buf, + data->d_size, name, idx); + if (err) { + char errmsg[128]; + strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warning("failed to alloc program %s (%s): %s", + name, obj->path, errmsg); + } } if (err) goto out; @@ -415,11 +520,20 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf, void bpf_object__close(struct bpf_object *obj) { + size_t i; + if (!obj) return; bpf_object__elf_finish(obj); zfree(&obj->maps_buf); + + if (obj->programs && obj->nr_programs) { + for (i = 0; i < obj->nr_programs; i++) + bpf_program__exit(&obj->programs[i]); + } + zfree(&obj->programs); + free(obj); } -- cgit v0.10.2 From b62f06e81bcf28d47fe736fe2beae40f15f496be Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:01 +0000 Subject: bpf tools: Collect relocation sections from SHT_REL sections This patch collects relocation sections into 'struct object'. Such sections are used for connecting maps to bpf programs. 'reloc' field in 'struct bpf_object' is introduced for storing such information. This patch simply store the data into 'reloc' field. Following patch will parse them to know the exact instructions which are needed to be relocated. Note that the collected data will be invalid after ELF object file is closed. This is the second patch related to map relocation. The first one is 'bpf tools: Collect symbol table from SHT_SYMTAB section'. 
The principle of map relocation is described in its commit message. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-14-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 102156f..e8088f8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -110,6 +110,11 @@ struct bpf_object { Elf *elf; GElf_Ehdr ehdr; Elf_Data *symbols; + struct { + GElf_Shdr shdr; + Elf_Data *data; + } *reloc; + int nr_reloc; } efile; char path[]; }; @@ -231,6 +236,9 @@ static void bpf_object__elf_finish(struct bpf_object *obj) obj->efile.elf = NULL; } obj->efile.symbols = NULL; + + zfree(&obj->efile.reloc); + obj->efile.nr_reloc = 0; zclose(obj->efile.fd); obj->efile.obj_buf = NULL; obj->efile.obj_buf_sz = 0; @@ -447,6 +455,24 @@ static int bpf_object__elf_collect(struct bpf_object *obj) pr_warning("failed to alloc program %s (%s): %s", name, obj->path, errmsg); } + } else if (sh.sh_type == SHT_REL) { + void *reloc = obj->efile.reloc; + int nr_reloc = obj->efile.nr_reloc + 1; + + reloc = realloc(reloc, + sizeof(*obj->efile.reloc) * nr_reloc); + if (!reloc) { + pr_warning("realloc failed\n"); + err = -ENOMEM; + } else { + int n = nr_reloc - 1; + + obj->efile.reloc = reloc; + obj->efile.nr_reloc = nr_reloc; + + obj->efile.reloc[n].shdr = sh; + obj->efile.reloc[n].data = data; + } } if (err) goto out; -- cgit v0.10.2 From 340909152ae5a588b9a77a85df35d61ec3523833 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:02 +0000 Subject: bpf tools: Record map accessing instructions for each program This patch records the indices of instructions which are needed to be relocated. That information is saved in the 'reloc_desc' field in 'struct bpf_program'. In the loading phase (this patch takes effect in the opening phase), the collected instructions will be replaced by map loading instructions. Since we are going to close the ELF file and clear all data at the end of the 'opening' phase, the ELF information will no longer be valid in the 'loading' phase. We have to locate the instructions before maps are loaded, instead of directly modifying the instruction. 'struct bpf_map_def' is introduced in this patch to let us know how many maps are defined in the object. This is the third part of map relocation. The principle of map relocation is described in commit message of 'bpf tools: Collect symbol table from SHT_SYMTAB section'. 
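A sketch of the index arithmetic this boils down to (matching the collection code in the diff below; the numeric example is only illustrative):

	/* Which instruction needs patching, and which map it refers to: */
	insn_idx = rel.r_offset / sizeof(struct bpf_insn);
	map_idx = sym.st_value / sizeof(struct bpf_map_def);

For instance, since 'struct bpf_map_def' is 16 bytes on common ABIs (four unsigned ints), a symbol with st_value 32 selects the third map defined in the 'maps' section (map_idx 2).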
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-15-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e8088f8..ac69ae3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -88,6 +89,12 @@ struct bpf_program { char *section_name; struct bpf_insn *insns; size_t insns_cnt; + + struct { + int insn_idx; + int map_idx; + } *reloc_desc; + int nr_reloc; }; struct bpf_object { @@ -127,6 +134,9 @@ static void bpf_program__exit(struct bpf_program *prog) zfree(&prog->section_name); zfree(&prog->insns); + zfree(&prog->reloc_desc); + + prog->nr_reloc = 0; prog->insns_cnt = 0; prog->idx = -1; } @@ -481,6 +491,118 @@ out: return err; } +static struct bpf_program * +bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx) +{ + struct bpf_program *prog; + size_t i; + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + if (prog->idx == idx) + return prog; + } + return NULL; +} + +static int +bpf_program__collect_reloc(struct bpf_program *prog, + size_t nr_maps, GElf_Shdr *shdr, + Elf_Data *data, Elf_Data *symbols) +{ + int i, nrels; + + pr_debug("collecting relocating info for: '%s'\n", + prog->section_name); + nrels = shdr->sh_size / shdr->sh_entsize; + + prog->reloc_desc = malloc(sizeof(*prog->reloc_desc) * nrels); + if (!prog->reloc_desc) { + pr_warning("failed to alloc memory in relocation\n"); + return -ENOMEM; + } + prog->nr_reloc = nrels; + + for (i = 0; i < nrels; i++) { + GElf_Sym sym; + GElf_Rel rel; + unsigned int insn_idx; + struct bpf_insn *insns = prog->insns; + size_t map_idx; + + if (!gelf_getrel(data, i, &rel)) { + pr_warning("relocation: failed to get %d reloc\n", i); + return -EINVAL; + } + + insn_idx = rel.r_offset / sizeof(struct bpf_insn); + pr_debug("relocation: insn_idx=%u\n", insn_idx); + + if (!gelf_getsym(symbols, + GELF_R_SYM(rel.r_info), + &sym)) { + pr_warning("relocation: symbol %"PRIx64" not found\n", + GELF_R_SYM(rel.r_info)); + return -EINVAL; + } + + if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { + pr_warning("bpf: relocation: invalid relo for insns[%d].code 0x%x\n", + insn_idx, insns[insn_idx].code); + return -EINVAL; + } + + map_idx = sym.st_value / sizeof(struct bpf_map_def); + if (map_idx >= nr_maps) { + pr_warning("bpf relocation: map_idx %d large than %d\n", + (int)map_idx, (int)nr_maps - 1); + return -EINVAL; + } + + prog->reloc_desc[i].insn_idx = insn_idx; + prog->reloc_desc[i].map_idx = map_idx; + } + return 0; +} + +static int bpf_object__collect_reloc(struct bpf_object *obj) +{ + int i, err; + + if (!obj_elf_valid(obj)) { + pr_warning("Internal error: elf object is closed\n"); + return -EINVAL; + } + + for (i = 0; i < obj->efile.nr_reloc; i++) { + GElf_Shdr *shdr = &obj->efile.reloc[i].shdr; + Elf_Data *data = obj->efile.reloc[i].data; + int idx = shdr->sh_info; + struct bpf_program *prog; + size_t nr_maps = obj->maps_buf_sz / + sizeof(struct bpf_map_def); + + if (shdr->sh_type != SHT_REL) { + pr_warning("internal error at %d\n", __LINE__); + return -EINVAL; + } + + prog = bpf_object__find_prog_by_idx(obj, idx); + if (!prog) { + pr_warning("relocation failed: no %d section\n", + idx); + return -ENOENT; + } + 
+ err = bpf_program__collect_reloc(prog, nr_maps, + shdr, data, + obj->efile.symbols); + if (err) + return -EINVAL; + } + return 0; +} + static int bpf_object__validate(struct bpf_object *obj) { if (obj->kern_version == 0) { @@ -511,6 +633,8 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz) goto out; if (bpf_object__elf_collect(obj)) goto out; + if (bpf_object__collect_reloc(obj)) + goto out; if (bpf_object__validate(obj)) goto out; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index dc966dd..6e75acd 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -30,4 +30,17 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf, size_t obj_buf_sz); void bpf_object__close(struct bpf_object *object); +/* + * We don't need __attribute__((packed)) now since it is + * unnecessary for 'bpf_map_def' because they are all aligned. + * In addition, using it will trigger -Wpacked warning message, + * and will be treated as an error due to -Werror. + */ +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; +}; + #endif -- cgit v0.10.2 From e3ed2fef22b694cf07b06abaa6481a9e6f868e1f Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:03 +0000 Subject: bpf tools: Add bpf.c/h for common bpf operations This patch introduces bpf.c and bpf.h, which hold common functions issuing bpf syscall. The goal of these two files is to hide syscall completely from user. Note that bpf.c and bpf.h deal with kernel interface only. Things like structure of 'map' section in the ELF object is not cared by of bpf.[ch]. We first introduce bpf_create_map(). Note that, since functions in bpf.[ch] are wrapper of sys_bpf, they don't use OO style naming. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-16-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index a316484..d874975 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o +libbpf-y := libbpf.o bpf.o diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c new file mode 100644 index 0000000..208de7c3 --- /dev/null +++ b/tools/lib/bpf/bpf.c @@ -0,0 +1,51 @@ +/* + * common eBPF ELF operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + */ + +#include +#include +#include +#include +#include +#include "bpf.h" + +/* + * When building perf, unistd.h is override. Define __NR_bpf is + * required to be defined. + */ +#ifndef __NR_bpf +# if defined(__i386__) +# define __NR_bpf 357 +# elif defined(__x86_64__) +# define __NR_bpf 321 +# elif defined(__aarch64__) +# define __NR_bpf 280 +# else +# error __NR_bpf not defined. libbpf does not support your arch. 
+# endif +#endif + +static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ + return syscall(__NR_bpf, cmd, attr, size); +} + +int bpf_create_map(enum bpf_map_type map_type, int key_size, + int value_size, int max_entries) +{ + union bpf_attr attr; + + memset(&attr, '\0', sizeof(attr)); + + attr.map_type = map_type; + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + + return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h new file mode 100644 index 0000000..28f7942 --- /dev/null +++ b/tools/lib/bpf/bpf.h @@ -0,0 +1,16 @@ +/* + * common eBPF ELF operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + */ +#ifndef __BPF_BPF_H +#define __BPF_BPF_H + +#include + +int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, + int max_entries); + +#endif -- cgit v0.10.2 From 52d3352e79815307521bef6121dba00b6c3477c6 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:04 +0000 Subject: bpf tools: Create eBPF maps defined in an object file This patch creates maps based on 'map' section in object file using bpf_create_map(), and stores the fds into an array in 'struct bpf_object'. Previous patches parse ELF object file and collects required data, but doesn't play with the kernel. They belong to the 'opening' phase. This patch is the first patch in 'loading' phase. The 'loaded' field is introduced in 'struct bpf_object' to avoid loading an object twice, because the loading phase clears resources collected during the opening which becomes useless after loading. In this patch, maps_buf is cleared. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-17-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ac69ae3..54b48de 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -21,6 +21,7 @@ #include #include "libbpf.h" +#include "bpf.h" #define __printf(a, b) __attribute__((format(printf, a, b))) @@ -105,6 +106,13 @@ struct bpf_object { struct bpf_program *programs; size_t nr_programs; + int *map_fds; + /* + * This field is required because maps_buf will be freed and + * maps_buf_sz will be set to 0 after loaded. + */ + size_t nr_map_fds; + bool loaded; /* * Information when doing elf related work. 
Only valid if fd @@ -233,6 +241,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->efile.obj_buf = obj_buf; obj->efile.obj_buf_sz = obj_buf_sz; + obj->loaded = false; return obj; } @@ -565,6 +574,62 @@ bpf_program__collect_reloc(struct bpf_program *prog, return 0; } +static int +bpf_object__create_maps(struct bpf_object *obj) +{ + unsigned int i; + size_t nr_maps; + int *pfd; + + nr_maps = obj->maps_buf_sz / sizeof(struct bpf_map_def); + if (!obj->maps_buf || !nr_maps) { + pr_debug("don't need create maps for %s\n", + obj->path); + return 0; + } + + obj->map_fds = malloc(sizeof(int) * nr_maps); + if (!obj->map_fds) { + pr_warning("realloc perf_bpf_map_fds failed\n"); + return -ENOMEM; + } + obj->nr_map_fds = nr_maps; + + /* fill all fd with -1 */ + memset(obj->map_fds, -1, sizeof(int) * nr_maps); + + pfd = obj->map_fds; + for (i = 0; i < nr_maps; i++) { + struct bpf_map_def def; + + def = *(struct bpf_map_def *)(obj->maps_buf + + i * sizeof(struct bpf_map_def)); + + *pfd = bpf_create_map(def.type, + def.key_size, + def.value_size, + def.max_entries); + if (*pfd < 0) { + size_t j; + int err = *pfd; + + pr_warning("failed to create map: %s\n", + strerror(errno)); + for (j = 0; j < i; j++) + zclose(obj->map_fds[j]); + obj->nr_map_fds = 0; + zfree(&obj->map_fds); + return err; + } + pr_debug("create map: fd=%d\n", *pfd); + pfd++; + } + + zfree(&obj->maps_buf); + obj->maps_buf_sz = 0; + return 0; +} + static int bpf_object__collect_reloc(struct bpf_object *obj) { int i, err; @@ -668,6 +733,42 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf, return __bpf_object__open("[buffer]", obj_buf, obj_buf_sz); } +int bpf_object__unload(struct bpf_object *obj) +{ + size_t i; + + if (!obj) + return -EINVAL; + + for (i = 0; i < obj->nr_map_fds; i++) + zclose(obj->map_fds[i]); + zfree(&obj->map_fds); + obj->nr_map_fds = 0; + + return 0; +} + +int bpf_object__load(struct bpf_object *obj) +{ + if (!obj) + return -EINVAL; + + if (obj->loaded) { + pr_warning("object should not be loaded twice\n"); + return -EINVAL; + } + + obj->loaded = true; + if (bpf_object__create_maps(obj)) + goto out; + + return 0; +out: + bpf_object__unload(obj); + pr_warning("failed to load object '%s'\n", obj->path); + return -EINVAL; +} + void bpf_object__close(struct bpf_object *obj) { size_t i; @@ -676,6 +777,7 @@ void bpf_object__close(struct bpf_object *obj) return; bpf_object__elf_finish(obj); + bpf_object__unload(obj); zfree(&obj->maps_buf); diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6e75acd..3e69600 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -30,6 +30,10 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf, size_t obj_buf_sz); void bpf_object__close(struct bpf_object *object); +/* Load/unload object into/from kernel */ +int bpf_object__load(struct bpf_object *obj); +int bpf_object__unload(struct bpf_object *obj); + /* * We don't need __attribute__((packed)) now since it is * unnecessary for 'bpf_map_def' because they are all aligned. -- cgit v0.10.2 From 8a47a6c522c0593a977069b0b1e5a0725ca0e32e Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:05 +0000 Subject: bpf tools: Relocate eBPF programs If an eBPF program accesses a map, LLVM generates a load instruction which loads an absolute address into a register, like this: ld_64 r1, ... call 2 That ld_64 instruction will be recorded in relocation section. 
To enable the usage of that map, relocation must be done by replacing the immediate value with the real map file descriptor so it can be found by the eBPF map functions. This patch does the relocation work based on the information collected by the patches: 'bpf tools: Collect symbol table from SHT_SYMTAB section', 'bpf tools: Collect relocation sections from SHT_REL sections' and 'bpf tools: Record map accessing instructions for each program'. For each instruction which needs relocation, it injects the corresponding file descriptor into the imm field. As part of the protocol, src_reg is set to BPF_PSEUDO_MAP_FD to notify the kernel that this is a map loading instruction. This is the final part of the map relocation work. The principle of map relocation is described in the commit message of 'bpf tools: Collect symbol table from SHT_SYMTAB section'. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-18-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 54b48de..94f9660 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -630,6 +630,56 @@ bpf_object__create_maps(struct bpf_object *obj) return 0; } +static int +bpf_program__relocate(struct bpf_program *prog, int *map_fds) +{ + int i; + + if (!prog || !prog->reloc_desc) + return 0; + + for (i = 0; i < prog->nr_reloc; i++) { + int insn_idx, map_idx; + struct bpf_insn *insns = prog->insns; + + insn_idx = prog->reloc_desc[i].insn_idx; + map_idx = prog->reloc_desc[i].map_idx; + + if (insn_idx >= (int)prog->insns_cnt) { + pr_warning("relocation out of range: '%s'\n", + prog->section_name); + return -ERANGE; + } + insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; + insns[insn_idx].imm = map_fds[map_idx]; + } + + zfree(&prog->reloc_desc); + prog->nr_reloc = 0; + return 0; +} + + +static int +bpf_object__relocate(struct bpf_object *obj) +{ + struct bpf_program *prog; + size_t i; + int err; + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + + err = bpf_program__relocate(prog, obj->map_fds); + if (err) { + pr_warning("failed to relocate '%s'\n", + prog->section_name); + return err; + } + } + return 0; +} + static int bpf_object__collect_reloc(struct bpf_object *obj) { int i, err; @@ -761,6 +811,8 @@ int bpf_object__load(struct bpf_object *obj) obj->loaded = true; if (bpf_object__create_maps(obj)) goto out; + if (bpf_object__relocate(obj)) + goto out; return 0; out: -- cgit v0.10.2 From 7bf98369a73a9a84c56b3f2b247ea8f0eaf8668b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:06 +0000 Subject: bpf tools: Introduce bpf_load_program() to bpf.c bpf_load_program() can be used to load a bpf program into the kernel. To make loading faster, first try to load without a log buffer. Try again with the log buffer if the first try fails.
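A sketch of the calling pattern this enables ('insns' and 'insns_cnt' are assumed to exist; BPF_LOG_BUF_SIZE is introduced in bpf.h below, LINUX_VERSION_CODE comes from linux/version.h):

	static char log_buf[BPF_LOG_BUF_SIZE];
	char license[] = "GPL";
	int fd;

	fd = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns, insns_cnt,
			      license, LINUX_VERSION_CODE,
			      log_buf, sizeof(log_buf));
	if (fd < 0)
		/* log_buf was filled by the second, logged attempt */
		fprintf(stderr, "verifier rejected program:\n%s\n", log_buf);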
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-19-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 208de7c3..a633105 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -29,6 +29,11 @@ # endif #endif +static __u64 ptr_to_u64(void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size) { @@ -49,3 +54,32 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } + +int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns, + size_t insns_cnt, char *license, + u32 kern_version, char *log_buf, size_t log_buf_sz) +{ + int fd; + union bpf_attr attr; + + bzero(&attr, sizeof(attr)); + attr.prog_type = type; + attr.insn_cnt = (__u32)insns_cnt; + attr.insns = ptr_to_u64(insns); + attr.license = ptr_to_u64(license); + attr.log_buf = ptr_to_u64(NULL); + attr.log_size = 0; + attr.log_level = 0; + attr.kern_version = kern_version; + + fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + if (fd >= 0 || !log_buf || !log_buf_sz) + return fd; + + /* Try again with log */ + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_buf_sz; + attr.log_level = 1; + log_buf[0] = 0; + return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 28f7942..854b736 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -13,4 +13,11 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries); +/* Recommended log buffer size */ +#define BPF_LOG_BUF_SIZE 65536 +int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns, + size_t insns_cnt, char *license, + u32 kern_version, char *log_buf, + size_t log_buf_sz); + #endif -- cgit v0.10.2 From 55cffde2e1a41109cb49f8e94de954c8240242b5 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:07 +0000 Subject: bpf tools: Load eBPF programs in object files into kernel This patch utilizes the previously introduced bpf_load_program() to load the programs in the ELF file into the kernel. The result is stored in the 'fd' field of 'struct bpf_program'. During loading, it allocates a log buffer and frees it before returning. Note that that buffer is not passed to bpf_load_program() if the first loading attempt is successful. A statically allocated log buffer is deliberately not used, to avoid potential multi-thread problems. Instructions collected during opening are cleared after loading. load_program() is created for loading a 'struct bpf_insn' array into the kernel; bpf_program__load() calls it. By this design we have a single function that loads instructions into the kernel. It will be used by further patches, which create different instances from a program and load them into the kernel.
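To show where this is heading, a sketch of the whole object life cycle as exposed by libbpf.h at this point (file name assumed, error handling abbreviated):

	struct bpf_object *obj = bpf_object__open("./test_bpf.o");

	if (!obj)
		return -1;
	/* Creates maps, relocates map references, loads all programs. */
	if (bpf_object__load(obj)) {
		bpf_object__close(obj);
		return -1;
	}
	/* ... use the loaded programs ... */
	bpf_object__close(obj);	/* also unloads */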
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-20-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 94f9660..38447b7 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -96,6 +96,8 @@ struct bpf_program { int map_idx; } *reloc_desc; int nr_reloc; + + int fd; }; struct bpf_object { @@ -135,11 +137,20 @@ struct bpf_object { }; #define obj_elf_valid(o) ((o)->efile.elf) +static void bpf_program__unload(struct bpf_program *prog) +{ + if (!prog) + return; + + zclose(prog->fd); +} + static void bpf_program__exit(struct bpf_program *prog) { if (!prog) return; + bpf_program__unload(prog); zfree(&prog->section_name); zfree(&prog->insns); zfree(&prog->reloc_desc); @@ -176,6 +187,7 @@ bpf_program__init(void *data, size_t size, char *name, int idx, memcpy(prog->insns, data, prog->insns_cnt * sizeof(struct bpf_insn)); prog->idx = idx; + prog->fd = -1; return 0; errout: @@ -718,6 +730,79 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) return 0; } +static int +load_program(struct bpf_insn *insns, int insns_cnt, + char *license, u32 kern_version, int *pfd) +{ + int ret; + char *log_buf; + + if (!insns || !insns_cnt) + return -EINVAL; + + log_buf = malloc(BPF_LOG_BUF_SIZE); + if (!log_buf) + pr_warning("Alloc log buffer for bpf loader error, continue without log\n"); + + ret = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns, + insns_cnt, license, kern_version, + log_buf, BPF_LOG_BUF_SIZE); + + if (ret >= 0) { + *pfd = ret; + ret = 0; + goto out; + } + + ret = -EINVAL; + pr_warning("load bpf program failed: %s\n", strerror(errno)); + + if (log_buf) { + pr_warning("-- BEGIN DUMP LOG ---\n"); + pr_warning("\n%s\n", log_buf); + pr_warning("-- END LOG --\n"); + } + +out: + free(log_buf); + return ret; +} + +static int +bpf_program__load(struct bpf_program *prog, + char *license, u32 kern_version) +{ + int err, fd; + + err = load_program(prog->insns, prog->insns_cnt, + license, kern_version, &fd); + if (!err) + prog->fd = fd; + + if (err) + pr_warning("failed to load program '%s'\n", + prog->section_name); + zfree(&prog->insns); + prog->insns_cnt = 0; + return err; +} + +static int +bpf_object__load_progs(struct bpf_object *obj) +{ + size_t i; + int err; + + for (i = 0; i < obj->nr_programs; i++) { + err = bpf_program__load(&obj->programs[i], + obj->license, + obj->kern_version); + if (err) + return err; + } + return 0; +} + static int bpf_object__validate(struct bpf_object *obj) { if (obj->kern_version == 0) { @@ -795,6 +880,9 @@ int bpf_object__unload(struct bpf_object *obj) zfree(&obj->map_fds); obj->nr_map_fds = 0; + for (i = 0; i < obj->nr_programs; i++) + bpf_program__unload(&obj->programs[i]); + return 0; } @@ -813,6 +901,8 @@ int bpf_object__load(struct bpf_object *obj) goto out; if (bpf_object__relocate(obj)) goto out; + if (bpf_object__load_progs(obj)) + goto out; return 0; out: -- cgit v0.10.2 From aa9b1ac33c7979d0d91eff8b70cffc4916f5555c Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:08 +0000 Subject: bpf tools: Introduce accessors for struct bpf_program This patch introduces accessors for user of libbpf to retrieve section name and fd of a opened/loaded eBPF program. 'struct bpf_prog_handler' is used for that purpose. 
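As a sketch of the intended usage (assuming 'obj' is an object opened and loaded as in the previous patches):

	struct bpf_program *prog;

	bpf_object__for_each_program(prog, obj) {
		const char *title = bpf_program__title(prog, false);
		int fd = bpf_program__fd(prog);

		printf("program '%s': fd=%d\n", title, fd);
	}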
Accessors of programs section name and file descriptor are provided. Set/get private data are also impelmented. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Link: http://lkml.kernel.org/r/1435716878-189507-21-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 38447b7..ae1c5cb 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -98,6 +98,10 @@ struct bpf_program { int nr_reloc; int fd; + + struct bpf_object *obj; + void *priv; + bpf_program_clear_priv_t clear_priv; }; struct bpf_object { @@ -150,6 +154,12 @@ static void bpf_program__exit(struct bpf_program *prog) if (!prog) return; + if (prog->clear_priv) + prog->clear_priv(prog, prog->priv); + + prog->priv = NULL; + prog->clear_priv = NULL; + bpf_program__unload(prog); zfree(&prog->section_name); zfree(&prog->insns); @@ -225,6 +235,7 @@ bpf_object__add_program(struct bpf_object *obj, void *data, size_t size, pr_debug("found program %s\n", prog.section_name); obj->programs = progs; obj->nr_programs = nr_progs + 1; + prog.obj = obj; progs[nr_progs] = prog; return 0; } @@ -931,3 +942,64 @@ void bpf_object__close(struct bpf_object *obj) free(obj); } + +struct bpf_program * +bpf_program__next(struct bpf_program *prev, struct bpf_object *obj) +{ + size_t idx; + + if (!obj->programs) + return NULL; + /* First handler */ + if (prev == NULL) + return &obj->programs[0]; + + if (prev->obj != obj) { + pr_warning("error: program handler doesn't match object\n"); + return NULL; + } + + idx = (prev - obj->programs) + 1; + if (idx >= obj->nr_programs) + return NULL; + return &obj->programs[idx]; +} + +int bpf_program__set_private(struct bpf_program *prog, + void *priv, + bpf_program_clear_priv_t clear_priv) +{ + if (prog->priv && prog->clear_priv) + prog->clear_priv(prog, prog->priv); + + prog->priv = priv; + prog->clear_priv = clear_priv; + return 0; +} + +int bpf_program__get_private(struct bpf_program *prog, void **ppriv) +{ + *ppriv = prog->priv; + return 0; +} + +const char *bpf_program__title(struct bpf_program *prog, bool dup) +{ + const char *title; + + title = prog->section_name; + if (dup) { + title = strdup(title); + if (!title) { + pr_warning("failed to strdup program title\n"); + return NULL; + } + } + + return title; +} + +int bpf_program__fd(struct bpf_program *prog) +{ + return prog->fd; +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 3e69600..657e497 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -9,6 +9,7 @@ #define __BPF_LIBBPF_H #include +#include /* * In include/linux/compiler-gcc.h, __printf is defined. However @@ -34,6 +35,29 @@ void bpf_object__close(struct bpf_object *object); int bpf_object__load(struct bpf_object *obj); int bpf_object__unload(struct bpf_object *obj); +/* Accessors of bpf_program. 
*/ +struct bpf_program; +struct bpf_program *bpf_program__next(struct bpf_program *prog, + struct bpf_object *obj); + +#define bpf_object__for_each_program(pos, obj) \ + for ((pos) = bpf_program__next(NULL, (obj)); \ + (pos) != NULL; \ + (pos) = bpf_program__next((pos), (obj))) + +typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, + void *); + +int bpf_program__set_private(struct bpf_program *prog, void *priv, + bpf_program_clear_priv_t clear_priv); + +int bpf_program__get_private(struct bpf_program *prog, + void **ppriv); + +const char *bpf_program__title(struct bpf_program *prog, bool dup); + +int bpf_program__fd(struct bpf_program *prog); + /* * We don't need __attribute__((packed)) now since it is * unnecessary for 'bpf_map_def' because they are all aligned. -- cgit v0.10.2 From 9a208effd1832e50e1f7ea002f400f8b9ca8b1ed Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 1 Jul 2015 02:14:10 +0000 Subject: bpf tools: Link all bpf objects onto a list To allow enumeration of all bpf_objects, keep them in a list (hidden to caller). bpf_object__for_each_safe() is introduced to do this iteration. It is safe even user close the object during iteration. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1435716878-189507-23-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ae1c5cb..4fa4bc4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -104,6 +105,8 @@ struct bpf_program { bpf_program_clear_priv_t clear_priv; }; +static LIST_HEAD(bpf_objects_list); + struct bpf_object { char license[64]; u32 kern_version; @@ -137,6 +140,12 @@ struct bpf_object { } *reloc; int nr_reloc; } efile; + /* + * All loaded bpf_object is linked in a list, which is + * hidden to caller. bpf_objects__ handlers deal with + * all objects. + */ + struct list_head list; char path[]; }; #define obj_elf_valid(o) ((o)->efile.elf) @@ -265,6 +274,9 @@ static struct bpf_object *bpf_object__new(const char *path, obj->efile.obj_buf_sz = obj_buf_sz; obj->loaded = false; + + INIT_LIST_HEAD(&obj->list); + list_add(&obj->list, &bpf_objects_list); return obj; } @@ -940,9 +952,29 @@ void bpf_object__close(struct bpf_object *obj) } zfree(&obj->programs); + list_del(&obj->list); free(obj); } +struct bpf_object * +bpf_object__next(struct bpf_object *prev) +{ + struct bpf_object *next; + + if (!prev) + next = list_first_entry(&bpf_objects_list, + struct bpf_object, + list); + else + next = list_next_entry(prev, list); + + /* Empty list is noticed here so don't need checking on entry. 
*/ + if (&next->list == &bpf_objects_list) + return NULL; + + return next; +} + struct bpf_program * bpf_program__next(struct bpf_program *prev, struct bpf_object *obj) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 657e497..ea8adc2 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -35,6 +35,13 @@ void bpf_object__close(struct bpf_object *object); int bpf_object__load(struct bpf_object *obj); int bpf_object__unload(struct bpf_object *obj); +struct bpf_object *bpf_object__next(struct bpf_object *prev); +#define bpf_object__for_each_safe(pos, tmp) \ + for ((pos) = bpf_object__next(NULL), \ + (tmp) = bpf_object__next(pos); \ + (pos) != NULL; \ + (pos) = (tmp), (tmp) = bpf_object__next(tmp)) + /* Accessors of bpf_program. */ struct bpf_program; struct bpf_program *bpf_program__next(struct bpf_program *prog, -- cgit v0.10.2 From aa61fd05ca79666c973d5b15e0f91ecdc7dcfa21 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 21 Jul 2015 11:13:34 +0000 Subject: perf tools: Introduce llvm config options This patch introduces an [llvm] config section with 5 options. Following patches will use them to configure LLVM dynamic compiling. 'llvm-utils.[ch]' is introduced in this patch for holding all llvm/clang related code. Example: [llvm] # Path to clang. If omitted, it is searched for in $PATH. clang-path = "/path/to/clang" # Cmdline template. The following line shows its default value. # Environment variables are used to pass options. # # *NOTE*: -D__KERNEL__ MUST appear before $CLANG_OPTIONS, # so users have a chance to use -U__KERNEL__ in $CLANG_OPTIONS # to cancel it. clang-bpf-cmd-template = "$CLANG_EXEC -D__KERNEL__ $CLANG_OPTIONS \ $KERNEL_INC_OPTIONS -Wno-unused-value \ -Wno-pointer-sign -working-directory \ $WORKING_DIR -c $CLANG_SOURCE -target \ bpf -O2 -o -" # Options passed to clang; they are injected into the cmdline via # $CLANG_OPTIONS. clang-opt = "-Wno-unused-value -Wno-pointer-sign" # kbuild directory. If not set, use /lib/modules/`uname -r`/build. # If deliberately set to "", skip the kernel header auto-detector. kbuild-dir = "/path/to/kernel/build" # Options passed to 'make' when detecting kernel header options. kbuild-opts = "ARCH=x86_64" Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1437477214-149684-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/Build b/tools/perf/util/Build index a1e5168..2ee81d7 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -14,6 +14,7 @@ libperf-y += find_next_bit.o libperf-y += help.o libperf-y += kallsyms.o libperf-y += levenshtein.o +libperf-y += llvm-utils.o libperf-y += parse-options.o libperf-y += parse-events.o libperf-y += path.o diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index e18f653..2e452ac 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -12,6 +12,7 @@ #include "cache.h" #include "exec_cmd.h" #include "util/hist.h" /* perf_hist_config */ +#include "util/llvm-utils.h" /* perf_llvm_config */ #define MAXNAME (256) @@ -408,6 +409,9 @@ int perf_default_config(const char *var, const char *value, if (!prefixcmp(var, "call-graph.")) return perf_callchain_config(var, value); + if (!prefixcmp(var, "llvm.")) + return perf_llvm_config(var, value); + /* Add other config variables here.
*/ return 0; } diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c new file mode 100644 index 0000000..472e8cd --- /dev/null +++ b/tools/perf/util/llvm-utils.c @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2015, Wang Nan + * Copyright (C) 2015, Huawei Inc. + */ + +#include +#include "util.h" +#include "debug.h" +#include "llvm-utils.h" +#include "cache.h" + +#define CLANG_BPF_CMD_DEFAULT_TEMPLATE \ + "$CLANG_EXEC -D__KERNEL__ $CLANG_OPTIONS " \ + "$KERNEL_INC_OPTIONS -Wno-unused-value " \ + "-Wno-pointer-sign -working-directory " \ + "$WORKING_DIR -c \"$CLANG_SOURCE\" -target bpf -O2 -o -" + +struct llvm_param llvm_param = { + .clang_path = "clang", + .clang_bpf_cmd_template = CLANG_BPF_CMD_DEFAULT_TEMPLATE, + .clang_opt = NULL, + .kbuild_dir = NULL, + .kbuild_opts = NULL, +}; + +int perf_llvm_config(const char *var, const char *value) +{ + if (prefixcmp(var, "llvm.")) + return 0; + var += sizeof("llvm.") - 1; + + if (!strcmp(var, "clang-path")) + llvm_param.clang_path = strdup(value); + else if (!strcmp(var, "clang-bpf-cmd-template")) + llvm_param.clang_bpf_cmd_template = strdup(value); + else if (!strcmp(var, "clang-opt")) + llvm_param.clang_opt = strdup(value); + else if (!strcmp(var, "kbuild-dir")) + llvm_param.kbuild_dir = strdup(value); + else if (!strcmp(var, "kbuild-opts")) + llvm_param.kbuild_opts = strdup(value); + else + return -1; + return 0; +} diff --git a/tools/perf/util/llvm-utils.h b/tools/perf/util/llvm-utils.h new file mode 100644 index 0000000..504b799 --- /dev/null +++ b/tools/perf/util/llvm-utils.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2015, Wang Nan + * Copyright (C) 2015, Huawei Inc. + */ +#ifndef __LLVM_UTILS_H +#define __LLVM_UTILS_H + +#include "debug.h" + +struct llvm_param { + /* Path of clang executable */ + const char *clang_path; + /* + * Template of clang bpf compiling. 5 env variables + * can be used: + * $CLANG_EXEC: Path to clang. + * $CLANG_OPTIONS: Extra options to clang. + * $KERNEL_INC_OPTIONS: Kernel include directories. + * $WORKING_DIR: Kernel source directory. + * $CLANG_SOURCE: Source file to be compiled. + */ + const char *clang_bpf_cmd_template; + /* Will be filled in $CLANG_OPTIONS */ + const char *clang_opt; + /* Where to find kbuild system */ + const char *kbuild_dir; + /* + * Arguments passed to make, like 'ARCH=arm' if doing cross + * compiling. Should not be used for dynamic compiling. + */ + const char *kbuild_opts; }; + +extern struct llvm_param llvm_param; +extern int perf_llvm_config(const char *var, const char *value); +#endif -- cgit v0.10.2 From 4cea3a9cb30a962fa759fcb081fb83351113d9c4 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 11 Jun 2015 10:31:09 +0000 Subject: perf tools: Call clang to compile C source to object code This is the core patch for supporting eBPF on-the-fly compiling; it does the following work: 1. Search for the clang compiler using search_program(). 2. Run the command template defined in the 'clang-bpf-cmd-template' option of the [llvm] config section using read_from_pipe(). The paths to clang and to the source code are injected into the shell command through environment variables by force_set_env(). Committer notice: When building with DEBUG=1 we get a compiler error that gets fixed with the same approach described in commit b236512280fb ("perf kmem: Fix compiler warning about may be accessing uninitialized variable"): The last argument to strtok_r doesn't need to be initialized, it's just a placeholder to make this routine reentrant, but gcc doesn't know about that and complains, breaking the build; fix it by setting it to NULL.
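For readers new to this area, the heart of read_from_pipe() below is a standard popen() capture loop. Here is a minimal, self-contained sketch of that pattern; capture_cmd() is a hypothetical name, and the real function below additionally distinguishes ferror() from the command's WEXITSTATUS:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/*
	 * Run a shell command and slurp its stdout into a heap buffer,
	 * growing the buffer as needed and NUL-terminating the result.
	 */
	static char *capture_cmd(const char *cmd, size_t *p_sz)
	{
		FILE *f = popen(cmd, "r");
		char *buf = NULL, *nbuf;
		size_t len = 0, n;
		char chunk[4096];

		if (!f)
			return NULL;

		while ((n = fread(chunk, 1, sizeof(chunk), f)) > 0) {
			nbuf = realloc(buf, len + n + 1);
			if (!nbuf)
				goto err;
			buf = nbuf;
			memcpy(buf + len, chunk, n);
			len += n;
		}

		/* pclose() returns the wait status; nonzero means failure */
		if (pclose(f) != 0) {
			free(buf);
			return NULL;
		}
		if (buf)
			buf[len] = '\0';
		if (p_sz)
			*p_sz = len;
		return buf;
	err:
		pclose(f);
		free(buf);
		return NULL;
	}

The extra byte reserved on every realloc() is what lets the terminating '\0' be appended unconditionally, the same trick the real read_from_pipe() documents in its comment.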
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/n/1436445342-1402-14-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index 472e8cd..5ae1124 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -43,3 +43,229 @@ int perf_llvm_config(const char *var, const char *value) return -1; return 0; } + +static int +search_program(const char *def, const char *name, + char *output) +{ + char *env, *path, *tmp = NULL; + char buf[PATH_MAX]; + int ret; + + output[0] = '\0'; + if (def && def[0] != '\0') { + if (def[0] == '/') { + if (access(def, F_OK) == 0) { + strlcpy(output, def, PATH_MAX); + return 0; + } + } else if (def[0] != '\0') + name = def; + } + + env = getenv("PATH"); + if (!env) + return -1; + env = strdup(env); + if (!env) + return -1; + + ret = -ENOENT; + path = strtok_r(env, ":", &tmp); + while (path) { + scnprintf(buf, sizeof(buf), "%s/%s", path, name); + if (access(buf, F_OK) == 0) { + strlcpy(output, buf, PATH_MAX); + ret = 0; + break; + } + path = strtok_r(NULL, ":", &tmp); + } + + free(env); + return ret; +} + +#define READ_SIZE 4096 +static int +read_from_pipe(const char *cmd, void **p_buf, size_t *p_read_sz) +{ + int err = 0; + void *buf = NULL; + FILE *file = NULL; + size_t read_sz = 0, buf_sz = 0; + + file = popen(cmd, "r"); + if (!file) { + pr_err("ERROR: unable to popen cmd: %s\n", + strerror(errno)); + return -EINVAL; + } + + while (!feof(file) && !ferror(file)) { + /* + * Make buf_sz always have obe byte extra space so we + * can put '\0' there. + */ + if (buf_sz - read_sz < READ_SIZE + 1) { + void *new_buf; + + buf_sz = read_sz + READ_SIZE + 1; + new_buf = realloc(buf, buf_sz); + + if (!new_buf) { + pr_err("ERROR: failed to realloc memory\n"); + err = -ENOMEM; + goto errout; + } + + buf = new_buf; + } + read_sz += fread(buf + read_sz, 1, READ_SIZE, file); + } + + if (buf_sz - read_sz < 1) { + pr_err("ERROR: internal error\n"); + err = -EINVAL; + goto errout; + } + + if (ferror(file)) { + pr_err("ERROR: error occurred when reading from pipe: %s\n", + strerror(errno)); + err = -EIO; + goto errout; + } + + err = WEXITSTATUS(pclose(file)); + file = NULL; + if (err) { + err = -EINVAL; + goto errout; + } + + /* + * If buf is string, give it terminal '\0' to make our life + * easier. If buf is not string, that '\0' is out of space + * indicated by read_sz so caller won't even notice it. + */ + ((char *)buf)[read_sz] = '\0'; + + if (!p_buf) + free(buf); + else + *p_buf = buf; + + if (p_read_sz) + *p_read_sz = read_sz; + return 0; + +errout: + if (file) + pclose(file); + free(buf); + if (p_buf) + *p_buf = NULL; + if (p_read_sz) + *p_read_sz = 0; + return err; +} + +static inline void +force_set_env(const char *var, const char *value) +{ + if (value) { + setenv(var, value, 1); + pr_debug("set env: %s=%s\n", var, value); + } else { + unsetenv(var); + pr_debug("unset env: %s\n", var); + } +} + +static void +version_notice(void) +{ + pr_err( +" \tLLVM 3.7 or newer is required. 
It can be found at http://llvm.org\n" +" \tYou may want to try git trunk:\n" +" \t\tgit clone http://llvm.org/git/llvm.git\n" +" \t\t and\n" +" \t\tgit clone http://llvm.org/git/clang.git\n\n" +" \tOr fetch the latest clang/llvm 3.7 from pre-built llvm packages for\n" +" \tdebian/ubuntu:\n" +" \t\thttp://llvm.org/apt\n\n" +" \tIf you are using an old version of clang, change 'clang-bpf-cmd-template'\n" +" \toption in [llvm] section of ~/.perfconfig to:\n\n" +" \t \"$CLANG_EXEC $CLANG_OPTIONS $KERNEL_INC_OPTIONS \\\n" +" \t -working-directory $WORKING_DIR -c $CLANG_SOURCE \\\n" +" \t -emit-llvm -o - | /path/to/llc -march=bpf -filetype=obj -o -\"\n" +" \t(Replace /path/to/llc with the path to your llc)\n\n" +); +} + +int llvm__compile_bpf(const char *path, void **p_obj_buf, + size_t *p_obj_buf_sz) +{ + int err; + char clang_path[PATH_MAX]; + const char *clang_opt = llvm_param.clang_opt; + const char *template = llvm_param.clang_bpf_cmd_template; + void *obj_buf = NULL; + size_t obj_buf_sz; + + if (!template) + template = CLANG_BPF_CMD_DEFAULT_TEMPLATE; + + err = search_program(llvm_param.clang_path, + "clang", clang_path); + if (err) { + pr_err( +"ERROR:\tunable to find clang.\n" +"Hint:\tTry to install the latest clang/llvm to support BPF. Check your $PATH\n" +" \tand 'clang-path' option in [llvm] section of ~/.perfconfig.\n"); + version_notice(); + return -ENOENT; + } + + force_set_env("CLANG_EXEC", clang_path); + force_set_env("CLANG_OPTIONS", clang_opt); + force_set_env("KERNEL_INC_OPTIONS", NULL); + force_set_env("WORKING_DIR", "."); + + /* + * Since we may reset clang's working dir, the path of the source + * file should be converted into an absolute path, except when we + * want stdin to be the source file (testing). + */ + force_set_env("CLANG_SOURCE", + (path[0] == '-') ? path : + make_nonrelative_path(path)); + + pr_debug("llvm compiling command template: %s\n", template); + err = read_from_pipe(template, &obj_buf, &obj_buf_sz); + if (err) { + pr_err("ERROR:\tunable to compile %s\n", path); + pr_err("Hint:\tCheck error message shown above.\n"); + pr_err("Hint:\tYou can also pre-compile it into .o using:\n"); + pr_err(" \t\tclang -target bpf -O2 -c %s\n", path); + pr_err(" \twith proper -I and -D options.\n"); + goto errout; + } + + if (!p_obj_buf) + free(obj_buf); + else + *p_obj_buf = obj_buf; + + if (p_obj_buf_sz) + *p_obj_buf_sz = obj_buf_sz; + return 0; +errout: + free(obj_buf); + if (p_obj_buf) + *p_obj_buf = NULL; + if (p_obj_buf_sz) + *p_obj_buf_sz = 0; + return err; +} diff --git a/tools/perf/util/llvm-utils.h b/tools/perf/util/llvm-utils.h index 504b799..d23adbc 100644 --- a/tools/perf/util/llvm-utils.h +++ b/tools/perf/util/llvm-utils.h @@ -33,4 +33,7 @@ struct llvm_param { extern struct llvm_param llvm_param; extern int perf_llvm_config(const char *var, const char *value); + +extern int llvm__compile_bpf(const char *path, void **p_obj_buf, + size_t *p_obj_buf_sz); #endif -- cgit v0.10.2 From d325d7887b960627dc686d70d07682e18383f9c4 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 14 Jul 2015 06:40:02 +0000 Subject: perf tools: Auto detecting kernel build directory This patch detects the kernel build directory by checking for the existence of include/generated/autoconf.h. The clang working directory is changed to the kbuild directory if it is found, to help users use relative include paths. A following patch will detect the kernel include options, which contain relative include paths, so this working directory change is needed. Users are allowed to set 'kbuild-dir = ""' manually to disable this detection.
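The detection itself boils down to one access(2) check on the conventional module build link. A condensed sketch of what detect_kbuild_dir() in the diff does (find_kbuild_dir() is a hypothetical name, and fixed-size buffers replace the asprintf() calls for brevity):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/utsname.h>

	/*
	 * Fill 'out' with /lib/modules/$(uname -r)/build and return 0 if
	 * that directory looks like a kbuild tree, i.e. it contains the
	 * generated autoconf.h marker; return -1 otherwise.
	 */
	static int find_kbuild_dir(char *out, size_t sz)
	{
		char marker[4096];
		struct utsname uts;

		if (uname(&uts))
			return -1;

		snprintf(out, sz, "/lib/modules/%s/build", uts.release);
		snprintf(marker, sizeof(marker),
			 "%s/include/generated/autoconf.h", out);

		return access(marker, R_OK) == 0 ? 0 : -1;
	}

autoconf.h is a good marker because kbuild regenerates it from the kernel .config, so its presence implies a configured, buildable tree rather than bare sources.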
Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/n/tip-owyfwfbemrjn0tlj6tgk2nf5@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index 5ae1124..6bfcb2d 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -4,6 +4,7 @@ */ #include +#include #include "util.h" #include "debug.h" #include "llvm-utils.h" @@ -204,6 +205,74 @@ version_notice(void) ); } +static int detect_kbuild_dir(char **kbuild_dir) +{ + const char *test_dir = llvm_param.kbuild_dir; + const char *prefix_dir = ""; + const char *suffix_dir = ""; + + char *autoconf_path; + struct utsname utsname; + + int err; + + if (!test_dir) { + err = uname(&utsname); + if (err) { + pr_warning("uname failed: %s\n", strerror(errno)); + return -EINVAL; + } + + test_dir = utsname.release; + prefix_dir = "/lib/modules/"; + suffix_dir = "/build"; + } + + err = asprintf(&autoconf_path, "%s%s%s/include/generated/autoconf.h", + prefix_dir, test_dir, suffix_dir); + if (err < 0) + return -ENOMEM; + + if (access(autoconf_path, R_OK) == 0) { + free(autoconf_path); + + err = asprintf(kbuild_dir, "%s%s%s", prefix_dir, test_dir, + suffix_dir); + if (err < 0) + return -ENOMEM; + return 0; + } + free(autoconf_path); + return -ENOENT; +} + +static inline void +get_kbuild_opts(char **kbuild_dir) +{ + int err; + + if (!kbuild_dir) + return; + + *kbuild_dir = NULL; + + if (llvm_param.kbuild_dir && !llvm_param.kbuild_dir[0]) { + pr_debug("[llvm.kbuild-dir] is set to \"\" deliberately.\n"); + pr_debug("Skip kbuild options detection.\n"); + return; + } + + err = detect_kbuild_dir(kbuild_dir); + if (err) { + pr_warning( +"WARNING:\tunable to get correct kernel building directory.\n" +"Hint:\tSet correct kbuild directory using 'kbuild-dir' option in [llvm]\n" +" \tsection of ~/.perfconfig or set it to \"\" to suppress kbuild\n" +" \tdetection.\n\n"); + return; + } +} + int llvm__compile_bpf(const char *path, void **p_obj_buf, size_t *p_obj_buf_sz) { @@ -211,6 +280,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf, char clang_path[PATH_MAX]; const char *clang_opt = llvm_param.clang_opt; const char *template = llvm_param.clang_bpf_cmd_template; + char *kbuild_dir = NULL; void *obj_buf = NULL; size_t obj_buf_sz; @@ -228,10 +298,16 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf, return -ENOENT; } + /* + * This is an optional work. Even it fail we can continue our + * work. Needn't to check error return. + */ + get_kbuild_opts(&kbuild_dir); + force_set_env("CLANG_EXEC", clang_path); force_set_env("CLANG_OPTIONS", clang_opt); force_set_env("KERNEL_INC_OPTIONS", NULL); - force_set_env("WORKING_DIR", "."); + force_set_env("WORKING_DIR", kbuild_dir ? 
: "."); /* * Since we may reset clang's working dir, path of source file @@ -253,6 +329,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf, goto errout; } + free(kbuild_dir); if (!p_obj_buf) free(obj_buf); else @@ -262,6 +339,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf, *p_obj_buf_sz = obj_buf_sz; return 0; errout: + free(kbuild_dir); free(obj_buf); if (p_obj_buf) *p_obj_buf = NULL; -- cgit v0.10.2 From 0c6d18bfd551622b438e216e4863155f47907b0d Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 11 Jun 2015 11:25:49 +0000 Subject: perf tools: Auto detecting kernel include options To help user find correct kernel include options, this patch extracts them from kbuild system by an embedded script kinc_fetch_script, which creates a temporary directory, generates Makefile and an empty dummy.o then use the Makefile to fetch $(NOSTDINC_FLAGS), $(LINUXINCLUDE) and $(EXTRA_CFLAGS) options. The result is passed to compiler script using 'KERNEL_INC_OPTIONS' environment variable. Because options from kbuild contains relative path like 'Iinclude/generated/uapi', the work directory must be changed. This is done by previous patch. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1436445342-1402-16-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index 6bfcb2d..5887bb8 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -246,15 +246,42 @@ static int detect_kbuild_dir(char **kbuild_dir) return -ENOENT; } +static const char *kinc_fetch_script = +"#!/usr/bin/env sh\n" +"if ! test -d \"$KBUILD_DIR\"\n" +"then\n" +" exit -1\n" +"fi\n" +"if ! 
test -f \"$KBUILD_DIR/include/generated/autoconf.h\"\n" +"then\n" +" exit -1\n" +"fi\n" +"TMPDIR=`mktemp -d`\n" +"if test -z \"$TMPDIR\"\n" +"then\n" +" exit -1\n" +"fi\n" +"cat << EOF > $TMPDIR/Makefile\n" +"obj-y := dummy.o\n" +"\\$(obj)/%.o: \\$(src)/%.c\n" +"\t@echo -n \"\\$(NOSTDINC_FLAGS) \\$(LINUXINCLUDE) \\$(EXTRA_CFLAGS)\"\n" +"EOF\n" +"touch $TMPDIR/dummy.c\n" +"make -s -C $KBUILD_DIR M=$TMPDIR $KBUILD_OPTS dummy.o 2>/dev/null\n" +"RET=$?\n" +"rm -rf $TMPDIR\n" +"exit $RET\n"; + static inline void -get_kbuild_opts(char **kbuild_dir) +get_kbuild_opts(char **kbuild_dir, char **kbuild_include_opts) { int err; - if (!kbuild_dir) + if (!kbuild_dir || !kbuild_include_opts) return; *kbuild_dir = NULL; + *kbuild_include_opts = NULL; if (llvm_param.kbuild_dir && !llvm_param.kbuild_dir[0]) { pr_debug("[llvm.kbuild-dir] is set to \"\" deliberately.\n"); @@ -271,6 +298,27 @@ get_kbuild_opts(char **kbuild_dir) " \tdetection.\n\n"); return; } + + pr_debug("Kernel build dir is set to %s\n", *kbuild_dir); + force_set_env("KBUILD_DIR", *kbuild_dir); + force_set_env("KBUILD_OPTS", llvm_param.kbuild_opts); + err = read_from_pipe(kinc_fetch_script, + (void **)kbuild_include_opts, + NULL); + if (err) { + pr_warning( +"WARNING:\tunable to get kernel include directories from '%s'\n" +"Hint:\tTry set clang include options using 'clang-bpf-cmd-template'\n" +" \toption in [llvm] section of ~/.perfconfig and set 'kbuild-dir'\n" +" \toption in [llvm] to \"\" to suppress this detection.\n\n", + *kbuild_dir); + + free(*kbuild_dir); + *kbuild_dir = NULL; + return; + } + + pr_debug("include option is set to %s\n", *kbuild_include_opts); } int llvm__compile_bpf(const char *path, void **p_obj_buf, @@ -280,7 +328,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf, char clang_path[PATH_MAX]; const char *clang_opt = llvm_param.clang_opt; const char *template = llvm_param.clang_bpf_cmd_template; - char *kbuild_dir = NULL; + char *kbuild_dir = NULL, *kbuild_include_opts = NULL; void *obj_buf = NULL; size_t obj_buf_sz; @@ -302,11 +350,11 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf, * This is an optional work. Even it fail we can continue our * work. Needn't to check error return. */ - get_kbuild_opts(&kbuild_dir); + get_kbuild_opts(&kbuild_dir, &kbuild_include_opts); force_set_env("CLANG_EXEC", clang_path); force_set_env("CLANG_OPTIONS", clang_opt); - force_set_env("KERNEL_INC_OPTIONS", NULL); + force_set_env("KERNEL_INC_OPTIONS", kbuild_include_opts); force_set_env("WORKING_DIR", kbuild_dir ? : "."); /* @@ -330,6 +378,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf, } free(kbuild_dir); + free(kbuild_include_opts); if (!p_obj_buf) free(obj_buf); else @@ -340,6 +389,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf, return 0; errout: free(kbuild_dir); + free(kbuild_include_opts); free(obj_buf); if (p_obj_buf) *p_obj_buf = NULL; -- cgit v0.10.2 From 9bc898c7019383b6aa2ae6cb2928c4ca926449f0 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 8 Jul 2015 10:04:02 +0000 Subject: perf tests: Add LLVM test for eBPF on-the-fly compiling Previous patches introduce llvm__compile_bpf() to compile source file to eBPF object. This patch adds testcase to test it. It also tests libbpf by opening generated object after applying next patch which introduces HAVE_LIBBPF_SUPPORT option. 
Since llvm__compile_bpf() prints long messages which users who don't explicitly test llvm doesn't care, this patch set verbose to -1 to suppress all debug, warning and error message, and hint user use 'perf test -v' to see the full output. For the same reason, if clang is not found in PATH and there's no [llvm] section in .perfconfig, skip this test. Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/n/1436445342-1402-17-git-send-email-wangnan0@huawei.com [ Add tools/lib/bpf/ to tools/perf/MANIFEST, so that the tarball targets build ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index d01a0aa..f31f15a 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -18,6 +18,7 @@ tools/arch/x86/include/asm/atomic.h tools/arch/x86/include/asm/rmwcc.h tools/lib/traceevent tools/lib/api +tools/lib/bpf tools/lib/hweight.c tools/lib/rbtree.c tools/lib/symbol/kallsyms.c diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index d20d6e6..c1518bd 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -32,6 +32,7 @@ perf-y += sample-parsing.o perf-y += parse-no-sample-id-all.o perf-y += kmod-path.o perf-y += thread-map.o +perf-y += llvm.o perf-$(CONFIG_X86) += perf-time-to-tsc.o diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index c1dde73..136cd93 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -175,6 +175,10 @@ static struct test { .func = test__thread_map, }, { + .desc = "Test LLVM searching and compiling", + .func = test__llvm, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c new file mode 100644 index 0000000..a337356 --- /dev/null +++ b/tools/perf/tests/llvm.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include +#include "tests.h" +#include "debug.h" + +static int perf_config_cb(const char *var, const char *val, + void *arg __maybe_unused) +{ + return perf_default_config(var, val, arg); +} + +/* + * Randomly give it a "version" section since we don't really load it + * into kernel + */ +static const char test_bpf_prog[] = + "__attribute__((section(\"do_fork\"), used)) " + "int fork(void *ctx) {return 0;} " + "char _license[] __attribute__((section(\"license\"), used)) = \"GPL\";" + "int _version __attribute__((section(\"version\"), used)) = 0x40100;"; + +#ifdef HAVE_LIBBPF_SUPPORT +static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz) +{ + struct bpf_object *obj; + + obj = bpf_object__open_buffer(obj_buf, obj_buf_sz); + if (!obj) + return -1; + bpf_object__close(obj); + return 0; +} +#else +static int test__bpf_parsing(void *obj_buf __maybe_unused, + size_t obj_buf_sz __maybe_unused) +{ + fprintf(stderr, " (skip bpf parsing)"); + return 0; +} +#endif + +int test__llvm(void) +{ + char *tmpl_new, *clang_opt_new; + void *obj_buf; + size_t obj_buf_sz; + int err, old_verbose; + + perf_config(perf_config_cb, NULL); + + /* + * Skip this test if user's .perfconfig doesn't set [llvm] section + * and clang is not found in $PATH, and this is not perf test -v + */ + if (verbose == 0 && !llvm_param.user_set_param && llvm__search_clang()) { + fprintf(stderr, " (no clang, try 'perf test -v LLVM')"); + return TEST_SKIP; + } + + old_verbose = verbose; + /* + * llvm is verbosity when error. 
Suppress all error output if + * not 'perf test -v'. + */ + if (verbose == 0) + verbose = -1; + + if (!llvm_param.clang_bpf_cmd_template) + return -1; + + if (!llvm_param.clang_opt) + llvm_param.clang_opt = strdup(""); + + err = asprintf(&tmpl_new, "echo '%s' | %s", test_bpf_prog, + llvm_param.clang_bpf_cmd_template); + if (err < 0) + return -1; + err = asprintf(&clang_opt_new, "-xc %s", llvm_param.clang_opt); + if (err < 0) + return -1; + + llvm_param.clang_bpf_cmd_template = tmpl_new; + llvm_param.clang_opt = clang_opt_new; + err = llvm__compile_bpf("-", &obj_buf, &obj_buf_sz); + + verbose = old_verbose; + if (err) { + if (!verbose) + fprintf(stderr, " (use -v to see error message)"); + return -1; + } + + err = test__bpf_parsing(obj_buf, obj_buf_sz); + free(obj_buf); + return err; +} diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index ebb47d9..bf113a2 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -62,6 +62,7 @@ int test__fdarray__filter(void); int test__fdarray__add(void); int test__kmod_path__parse(void); int test__thread_map(void); +int test__llvm(void); #if defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index 5887bb8..4f6a478 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -22,6 +22,7 @@ struct llvm_param llvm_param = { .clang_opt = NULL, .kbuild_dir = NULL, .kbuild_opts = NULL, + .user_set_param = false, }; int perf_llvm_config(const char *var, const char *value) @@ -42,6 +43,7 @@ int perf_llvm_config(const char *var, const char *value) llvm_param.kbuild_opts = strdup(value); else return -1; + llvm_param.user_set_param = true; return 0; } @@ -397,3 +399,10 @@ errout: *p_obj_buf_sz = 0; return err; } + +int llvm__search_clang(void) +{ + char clang_path[PATH_MAX]; + + return search_program(llvm_param.clang_path, "clang", clang_path); +} diff --git a/tools/perf/util/llvm-utils.h b/tools/perf/util/llvm-utils.h index d23adbc..5b3cf1c 100644 --- a/tools/perf/util/llvm-utils.h +++ b/tools/perf/util/llvm-utils.h @@ -29,6 +29,13 @@ struct llvm_param { * compiling. Should not be used for dynamic compiling. */ const char *kbuild_opts; + /* + * Default is false. If one of the above fields is set by user + * explicitly then user_set_llvm is set to true. This is used + * for perf test. If user doesn't set anything in .perfconfig + * and clang is not found, don't trigger llvm test. + */ + bool user_set_param; }; extern struct llvm_param llvm_param; @@ -36,4 +43,7 @@ extern int perf_llvm_config(const char *var, const char *value); extern int llvm__compile_bpf(const char *path, void **p_obj_buf, size_t *p_obj_buf_sz); + +/* This function is for test__llvm() use only */ +extern int llvm__search_clang(void); #endif -- cgit v0.10.2 From d809560b36a7ed31fbaf3719fdf79ddcbd30950b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 7 Aug 2015 12:51:03 +0200 Subject: perf stat: Move perf_counts struct and functions into separate object Moving 'struct perf_counts' and associated functions into separate object, so we could remove stat.c object dependency from python build. It makes the python code to build properly, because it fails to load due to missing stat-shadow.c object dependency if some patches from Kan Liang are applied. So apply this one, then Kan's. 
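perf_counts is a thin wrapper around a two-dimensional (cpu x thread) xyarray, and nothing about its use changes with the move. A hedged usage sketch, assuming evsel, ncpus, nthreads, cpu and thread come from the surrounding tool code:

	/*
	 * Allocate the per-(cpu, thread) matrix, read back one slot,
	 * then free it.  In-tree callers derive ncpus/nthreads from the
	 * evsel's cpu and thread maps.
	 */
	if (perf_evsel__alloc_counts(evsel, ncpus, nthreads) < 0)
		return -ENOMEM;

	struct perf_counts_values *v = perf_counts(evsel->counts, cpu, thread);
	fprintf(stderr, "val=%llu ena=%llu run=%llu\n",
		(unsigned long long)v->val,
		(unsigned long long)v->ena,
		(unsigned long long)v->run);

	perf_evsel__free_counts(evsel);

The val/ena/run triple mirrors the kernel's read format: enabled and running times let callers scale val when an event was multiplexed.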
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20150807105103.GB8624@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a054ddc..7aa039b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -58,6 +58,7 @@ #include "util/cpumap.h" #include "util/thread.h" #include "util/thread_map.h" +#include "util/counts.h" #include #include diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 2ee81d7..1ce0adc 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -68,6 +68,7 @@ libperf-y += target.o libperf-y += rblist.o libperf-y += intlist.o libperf-y += vdso.o +libperf-y += counts.o libperf-y += stat.o libperf-y += stat-shadow.o libperf-y += record.o diff --git a/tools/perf/util/counts.c b/tools/perf/util/counts.c new file mode 100644 index 0000000..e3fde31 --- /dev/null +++ b/tools/perf/util/counts.c @@ -0,0 +1,52 @@ +#include +#include "evsel.h" +#include "counts.h" + +struct perf_counts *perf_counts__new(int ncpus, int nthreads) +{ + struct perf_counts *counts = zalloc(sizeof(*counts)); + + if (counts) { + struct xyarray *values; + + values = xyarray__new(ncpus, nthreads, sizeof(struct perf_counts_values)); + if (!values) { + free(counts); + return NULL; + } + + counts->values = values; + } + + return counts; +} + +void perf_counts__delete(struct perf_counts *counts) +{ + if (counts) { + xyarray__delete(counts->values); + free(counts); + } +} + +static void perf_counts__reset(struct perf_counts *counts) +{ + xyarray__reset(counts->values); +} + +void perf_evsel__reset_counts(struct perf_evsel *evsel) +{ + perf_counts__reset(evsel->counts); +} + +int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus, int nthreads) +{ + evsel->counts = perf_counts__new(ncpus, nthreads); + return evsel->counts != NULL ? 
0 : -ENOMEM; +} + +void perf_evsel__free_counts(struct perf_evsel *evsel) +{ + perf_counts__delete(evsel->counts); + evsel->counts = NULL; +} diff --git a/tools/perf/util/counts.h b/tools/perf/util/counts.h new file mode 100644 index 0000000..34d8baa --- /dev/null +++ b/tools/perf/util/counts.h @@ -0,0 +1,37 @@ +#ifndef __PERF_COUNTS_H +#define __PERF_COUNTS_H + +#include "xyarray.h" + +struct perf_counts_values { + union { + struct { + u64 val; + u64 ena; + u64 run; + }; + u64 values[3]; + }; +}; + +struct perf_counts { + s8 scaled; + struct perf_counts_values aggr; + struct xyarray *values; +}; + + +static inline struct perf_counts_values* +perf_counts(struct perf_counts *counts, int cpu, int thread) +{ + return xyarray__entry(counts->values, cpu, thread); +} + +struct perf_counts *perf_counts__new(int ncpus, int nthreads); +void perf_counts__delete(struct perf_counts *counts); + +void perf_evsel__reset_counts(struct perf_evsel *evsel); +int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus, int nthreads); +void perf_evsel__free_counts(struct perf_evsel *evsel); + +#endif /* __PERF_COUNTS_H */ diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 6a12908..b948f69 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -9,7 +9,7 @@ #include "xyarray.h" #include "symbol.h" #include "cpumap.h" -#include "stat.h" +#include "counts.h" struct perf_evsel; diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 0766d98..51be28b 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -16,7 +16,7 @@ util/util.c util/xyarray.c util/cgroup.c util/rblist.c -util/stat.c +util/counts.c util/strlist.c util/trace-event.c ../lib/rbtree.c diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index c5c709c..415c359 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -97,55 +97,6 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel) } } -struct perf_counts *perf_counts__new(int ncpus, int nthreads) -{ - struct perf_counts *counts = zalloc(sizeof(*counts)); - - if (counts) { - struct xyarray *values; - - values = xyarray__new(ncpus, nthreads, sizeof(struct perf_counts_values)); - if (!values) { - free(counts); - return NULL; - } - - counts->values = values; - } - - return counts; -} - -void perf_counts__delete(struct perf_counts *counts) -{ - if (counts) { - xyarray__delete(counts->values); - free(counts); - } -} - -static void perf_counts__reset(struct perf_counts *counts) -{ - xyarray__reset(counts->values); -} - -void perf_evsel__reset_counts(struct perf_evsel *evsel) -{ - perf_counts__reset(evsel->counts); -} - -int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus, int nthreads) -{ - evsel->counts = perf_counts__new(ncpus, nthreads); - return evsel->counts != NULL ? 
0 : -ENOMEM; -} - -void perf_evsel__free_counts(struct perf_evsel *evsel) -{ - perf_counts__delete(evsel->counts); - evsel->counts = NULL; -} - void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) { int i; diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 0b897b0..62448c8 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -33,23 +33,6 @@ enum aggr_mode { AGGR_THREAD, }; -struct perf_counts_values { - union { - struct { - u64 val; - u64 ena; - u64 run; - }; - u64 values[3]; - }; -}; - -struct perf_counts { - s8 scaled; - struct perf_counts_values aggr; - struct xyarray *values; -}; - struct perf_stat_config { enum aggr_mode aggr_mode; bool scale; @@ -57,12 +40,6 @@ struct perf_stat_config { unsigned int interval; }; -static inline struct perf_counts_values* -perf_counts(struct perf_counts *counts, int cpu, int thread) -{ - return xyarray__entry(counts->values, cpu, thread); -} - void update_stats(struct stats *stats, u64 val); double avg_stats(struct stats *stats); double stddev_stats(struct stats *stats); @@ -96,13 +73,6 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count, void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel, double avg, int cpu, enum aggr_mode aggr); -struct perf_counts *perf_counts__new(int ncpus, int nthreads); -void perf_counts__delete(struct perf_counts *counts); - -void perf_evsel__reset_counts(struct perf_evsel *evsel); -int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus, int nthreads); -void perf_evsel__free_counts(struct perf_evsel *evsel); - void perf_evsel__reset_stat_priv(struct perf_evsel *evsel); int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel); void perf_evsel__free_stat_priv(struct perf_evsel *evsel); -- cgit v0.10.2 From 076a30c411ba2b91a18d44a5a01977035cdd7716 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 6 Aug 2015 15:44:52 -0400 Subject: perf callchain: Move option parsing code to util.c Move callchain option parse related code to util.c, to avoid dragging more object files into the python binding. Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1438890294-33409-1-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 931cca8..773fe13 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -25,96 +25,9 @@ __thread struct callchain_cursor callchain_cursor; -#ifdef HAVE_DWARF_UNWIND_SUPPORT -static int get_stack_size(const char *str, unsigned long *_size) -{ - char *endptr; - unsigned long size; - unsigned long max_size = round_down(USHRT_MAX, sizeof(u64)); - - size = strtoul(str, &endptr, 0); - - do { - if (*endptr) - break; - - size = round_up(size, sizeof(u64)); - if (!size || size > max_size) - break; - - *_size = size; - return 0; - - } while (0); - - pr_err("callchain: Incorrect stack dump size (max %ld): %s\n", - max_size, str); - return -1; -} -#endif /* HAVE_DWARF_UNWIND_SUPPORT */ - int parse_callchain_record_opt(const char *arg, struct callchain_param *param) { - char *tok, *name, *saveptr = NULL; - char *buf; - int ret = -1; - - /* We need buffer that we know we can write to. */ - buf = malloc(strlen(arg) + 1); - if (!buf) - return -ENOMEM; - - strcpy(buf, arg); - - tok = strtok_r((char *)buf, ",", &saveptr); - name = tok ? 
: (char *)buf; - - do { - /* Framepointer style */ - if (!strncmp(name, "fp", sizeof("fp"))) { - if (!strtok_r(NULL, ",", &saveptr)) { - param->record_mode = CALLCHAIN_FP; - ret = 0; - } else - pr_err("callchain: No more arguments " - "needed for --call-graph fp\n"); - break; - -#ifdef HAVE_DWARF_UNWIND_SUPPORT - /* Dwarf style */ - } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) { - const unsigned long default_stack_dump_size = 8192; - - ret = 0; - param->record_mode = CALLCHAIN_DWARF; - param->dump_size = default_stack_dump_size; - - tok = strtok_r(NULL, ",", &saveptr); - if (tok) { - unsigned long size = 0; - - ret = get_stack_size(tok, &size); - param->dump_size = size; - } -#endif /* HAVE_DWARF_UNWIND_SUPPORT */ - } else if (!strncmp(name, "lbr", sizeof("lbr"))) { - if (!strtok_r(NULL, ",", &saveptr)) { - param->record_mode = CALLCHAIN_LBR; - ret = 0; - } else - pr_err("callchain: No more arguments " - "needed for --call-graph lbr\n"); - break; - } else { - pr_err("callchain: Unknown --call-graph option " - "value: %s\n", arg); - break; - } - - } while (0); - - free(buf); - return ret; + return parse_callchain_record(arg, param); } static int parse_callchain_mode(const char *value) diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 68a32c2..acee2b3 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -177,6 +177,7 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node * bool hide_unresolved); extern const char record_callchain_help[]; +extern int parse_callchain_record(const char *arg, struct callchain_param *param); int parse_callchain_record_opt(const char *arg, struct callchain_param *param); int parse_callchain_report_opt(const char *arg); int perf_callchain_config(const char *var, const char *value); diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index edc2d63..f7adf12 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -566,6 +566,96 @@ unsigned long parse_tag_value(const char *str, struct parse_tag *tags) return (unsigned long) -1; } +int get_stack_size(const char *str, unsigned long *_size) +{ + char *endptr; + unsigned long size; + unsigned long max_size = round_down(USHRT_MAX, sizeof(u64)); + + size = strtoul(str, &endptr, 0); + + do { + if (*endptr) + break; + + size = round_up(size, sizeof(u64)); + if (!size || size > max_size) + break; + + *_size = size; + return 0; + + } while (0); + + pr_err("callchain: Incorrect stack dump size (max %ld): %s\n", + max_size, str); + return -1; +} + +int parse_callchain_record(const char *arg, struct callchain_param *param) +{ + char *tok, *name, *saveptr = NULL; + char *buf; + int ret = -1; + + /* We need buffer that we know we can write to. */ + buf = malloc(strlen(arg) + 1); + if (!buf) + return -ENOMEM; + + strcpy(buf, arg); + + tok = strtok_r((char *)buf, ",", &saveptr); + name = tok ? 
: (char *)buf; + + do { + /* Framepointer style */ + if (!strncmp(name, "fp", sizeof("fp"))) { + if (!strtok_r(NULL, ",", &saveptr)) { + param->record_mode = CALLCHAIN_FP; + ret = 0; + } else + pr_err("callchain: No more arguments " + "needed for --call-graph fp\n"); + break; + +#ifdef HAVE_DWARF_UNWIND_SUPPORT + /* Dwarf style */ + } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) { + const unsigned long default_stack_dump_size = 8192; + + ret = 0; + param->record_mode = CALLCHAIN_DWARF; + param->dump_size = default_stack_dump_size; + + tok = strtok_r(NULL, ",", &saveptr); + if (tok) { + unsigned long size = 0; + + ret = get_stack_size(tok, &size); + param->dump_size = size; + } +#endif /* HAVE_DWARF_UNWIND_SUPPORT */ + } else if (!strncmp(name, "lbr", sizeof("lbr"))) { + if (!strtok_r(NULL, ",", &saveptr)) { + param->record_mode = CALLCHAIN_LBR; + ret = 0; + } else + pr_err("callchain: No more arguments " + "needed for --call-graph lbr\n"); + break; + } else { + pr_err("callchain: Unknown --call-graph option " + "value: %s\n", arg); + break; + } + + } while (0); + + free(buf); + return ret; +} + int filename__read_str(const char *filename, char **buf, size_t *sizep) { size_t size = 0, alloc_size = 0; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 20d625a..8148703 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -351,4 +351,6 @@ static inline char *asprintf_expr_not_in_ints(const char *var, size_t nints, int return asprintf_expr_inout_ints(var, false, nints, ints); } +int get_stack_size(const char *str, unsigned long *_size); + #endif /* GIT_COMPAT_UTIL_H */ -- cgit v0.10.2 From a9710ba091b0dcdace90f791706e9192313ffb7c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 7 Aug 2015 15:24:05 -0700 Subject: perf tools: Support full source file paths for srcline For perf report/script srcline currently only the base file name of the source file is printed. This is a good default because it usually fits on the screen. But in some cases we want to know the full file name, for example to aggregate hits per file. In the later case we need more than the base file name to resolve file naming collisions: for example the kernel source has ~70 files named "core.c" It's also useful as input to post processing tools which want to point to the right file. Add a flag to allow full file name output. Add an option to perf report/script to enable this option. Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1438986245-15191-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 960da20..1a782ef 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -354,6 +354,8 @@ OPTIONS To disable decoding entirely, use --no-itrace. +--full-source-path:: + Show the full path for source files for srcline output. include::callchain-overhead-calculation.txt[] diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index e2fec5f..8e9be1f 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -260,6 +260,9 @@ OPTIONS To disable decoding entirely, use --no-itrace. +--full-source-path:: + Show the full path for source files for srcline output. 
+ SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 3a9d1b6..f301e86 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -738,6 +738,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts", "Instruction Tracing options", itrace_parse_synth_opts), + OPT_BOOLEAN(0, "full-source-path", &srcline_full_filename, + "Show full source file name path for source lines"), OPT_END() }; struct perf_data_file file = { diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 7912feb..7b376d2 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1653,6 +1653,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts", "Instruction Tracing options", itrace_parse_synth_opts), + OPT_BOOLEAN(0, "full-source-path", &srcline_full_filename, + "Show full source file name path for source lines"), OPT_END() }; const char * const script_subcommands[] = { "record", "report", NULL }; diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index c93fb0c..fc08248 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -10,6 +10,8 @@ #include "symbol.h" +bool srcline_full_filename; + #ifdef HAVE_LIBBFD_SUPPORT /* @@ -277,7 +279,9 @@ char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym, if (!addr2line(dso_name, addr, &file, &line, dso)) goto out; - if (asprintf(&srcline, "%s:%u", basename(file), line) < 0) { + if (asprintf(&srcline, "%s:%u", + srcline_full_filename ? file : basename(file), + line) < 0) { free(file); goto out; } diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 8148703..88a8915 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -318,6 +318,7 @@ static inline int path__join3(char *bf, size_t size, struct dso; struct symbol; +extern bool srcline_full_filename; char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym, bool show_sym); void free_srcline(char *srcline); -- cgit v0.10.2 From ab35a7d0ee59a36c9c567defe43c1adb72e9240c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 8 Aug 2015 19:12:10 +0200 Subject: perf tools: Unset perf_event_attr::freq when period term is set We need to unset 'perf_event_attr::freq' bit (default 1) when 'period' term is specified within event definition like: -e 'cpu/cpu-cycles,call-graph=fp,time,period=100000' otherwise it will handle the period value as frequency (and fail if it crossed the maximum allowed frequency value). 
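The ABI detail behind the one-line fix below: in perf_event_attr the period and the frequency share storage, and the attr.freq bit selects the interpretation, so a leftover freq=1 makes the kernel read a period value as a frequency. Abridged from include/uapi/linux/perf_event.h:

	struct perf_event_attr {
		/* ... */
		union {
			__u64 sample_period;	/* used when freq == 0 */
			__u64 sample_freq;	/* used when freq == 1 */
		};
		/* ... */
		__u64 freq : 1;		/* freq, not period, sampling */
		/* ... */
	};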
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: Kan Liang Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20150808171210.GC17040@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f572f46..a59710f 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -598,6 +598,7 @@ static void apply_config_terms(struct perf_evsel *evsel) switch (term->type) { case PERF_EVSEL__CONFIG_TERM_PERIOD: attr->sample_period = term->val.period; + attr->freq = 0; break; case PERF_EVSEL__CONFIG_TERM_TIME: if (term->val.time) -- cgit v0.10.2 From 5cef897652f224442c013d9e9425536b38385351 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 10 Aug 2015 15:45:55 -0300 Subject: perf hists: hist_entry__cmp() may use he_tmp.hists, initialize it The iter_add_next_cumulative_entry() function calls hist_entry__cmp(), which may want to access the hists where this hist_entry is stored, initialize it to let that happen and avoid segfaults. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-iqg98sfn4fvwcxp0pdvqauie@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index a6e9ddd..2fe6ea3 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -761,6 +761,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter, struct hist_entry **he_cache = iter->priv; struct hist_entry *he; struct hist_entry he_tmp = { + .hists = evsel__hists(evsel), .cpu = al->cpu, .thread = al->thread, .comm = thread__comm(al->thread), -- cgit v0.10.2 From e8e6d37e73e6b950c891c780745460b87f4755b6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 10 Aug 2015 16:53:54 -0300 Subject: perf hists: Update the column width for the "srcline" sort key When we introduce a new sort key, we need to update the hists__calc_col_len() function accordingly, otherwise the width will be limited to strlen(header). We can't update it when obtaining a line value for a column (for instance, in sort__srcline_cmp()), because we reset it all when doing a resort (see hists__output_recalc_col_len()), so we need to, from what is in the hist_entry fields, set each of the column widths. Cc: Namhyung Kim Cc: Andi Kleen Cc: Jiri Olsa Fixes: 409a8be61560 ("perf tools: Add sort by src line/number") Link: http://lkml.kernel.org/n/tip-jgbe0yx8v1gs89cslr93pvz2@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 2fe6ea3..608c0a7 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -151,6 +151,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) hists__new_col_len(hists, HISTC_LOCAL_WEIGHT, 12); hists__new_col_len(hists, HISTC_GLOBAL_WEIGHT, 12); + if (h->srcline) + hists__new_col_len(hists, HISTC_SRCLINE, strlen(h->srcline)); + if (h->transaction) hists__new_col_len(hists, HISTC_TRANSACTION, hist_entry__transaction_len()); -- cgit v0.10.2 From 31191a85fb875cf123cea56bbfd34f4b941f3c79 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 7 Aug 2015 15:54:24 -0700 Subject: perf report: Add support for srcfile sort key In some cases it's useful to characterize samples by file. This is useful to get a higher level categorization, for example to map cost to subsystems. Add a srcfile sort key to perf report. It builds on top of the existing srcline support. 
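The sort key's value is derived from the existing srcline machinery: take get_srcline()'s "file:line" output and truncate it at the colon. In miniature (a sketch mirroring the get_srcfile() helper added in the diff below):

	#include <string.h>

	/* Turn "kernel/fork.c:1423" into "kernel/fork.c" in place. */
	static char *srcline_to_srcfile(char *srcline)
	{
		char *colon = strchr(srcline, ':');

		if (colon && srcline[0])
			*colon = '\0';
		return srcline;
	}

This also explains the he->srcfile[0] check in hist_entry__delete() below: entries that could not be resolved share the static no_srcfile sentinel, which must not be passed to free().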
Commiter notes: E.g.: # perf record -F 10000 usleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.016 MB perf.data (13 samples) ] [root@zoo ~]# perf report -s srcfile --stdio # Total Lost Samples: 0 # # Samples: 13 of event 'cycles' # Event count (approx.): 869878 # # Overhead Source File # ........ ........... 60.99% . 20.62% paravirt.h 14.23% rmap.c 4.04% signal.c 0.11% msr.h # The first line is collecting all the files for which srcfiles couldn't somehow get resolved to: # perf report -s srcfile,dso --stdio # Total Lost Samples: 0 # # Samples: 13 of event 'cycles' # Event count (approx.): 869878 # # Overhead Source File Shared Object # ........ ........... ................ 40.97% . ld-2.20.so 20.62% paravirt.h [kernel.vmlinux] 20.02% . libc-2.20.so 14.23% rmap.c [kernel.vmlinux] 4.04% signal.c [kernel.vmlinux] 0.11% msr.h [kernel.vmlinux] # XXX: Investigate why that is not resolving on Fedora 21, Andi says he hasn't seen this on Fedora 22. Signed-off-by: Andi Kleen Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1438988064-21834-1-git-send-email-andi@firstfloor.org [ Added column length update, from 0e65bdb3f90f ('perf hists: Update the column width for the "srcline" sort key') ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 1a782ef..7b07d19 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -81,6 +81,8 @@ OPTIONS - cpu: cpu number the task ran at the time of sample - srcline: filename and line number executed at the time of sample. The DWARF debugging info must be provided. + - srcfile: file name of the source file of the same. Requires dwarf + information. - weight: Event specific weight, e.g. memory latency or transaction abort cost. This is the global weight. - local_weight: Local weight version of the weight above. 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 608c0a7..6bccfae 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -154,6 +154,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) if (h->srcline) hists__new_col_len(hists, HISTC_SRCLINE, strlen(h->srcline)); + if (h->srcfile) + hists__new_col_len(hists, HISTC_SRCFILE, strlen(h->srcfile)); + if (h->transaction) hists__new_col_len(hists, HISTC_TRANSACTION, hist_entry__transaction_len()); @@ -949,6 +952,8 @@ void hist_entry__delete(struct hist_entry *he) zfree(&he->stat_acc); free_srcline(he->srcline); + if (he->srcfile && he->srcfile[0]) + free(he->srcfile); free_callchain(he->callchain); free(he); } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index e2f712f..bc528d5 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -30,6 +30,7 @@ enum hist_column { HISTC_PARENT, HISTC_CPU, HISTC_SRCLINE, + HISTC_SRCFILE, HISTC_MISPREDICT, HISTC_IN_TX, HISTC_ABORT, diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 5177088..c0c32b0 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -319,6 +319,57 @@ struct sort_entry sort_srcline = { .se_width_idx = HISTC_SRCLINE, }; +/* --sort srcfile */ + +static char no_srcfile[1]; + +static char *get_srcfile(struct hist_entry *e) +{ + char *sf, *p; + struct map *map = e->ms.map; + + sf = get_srcline(map->dso, map__rip_2objdump(map, e->ip), + e->ms.sym, true); + p = strchr(sf, ':'); + if (p && *sf) { + *p = 0; + return sf; + } + free(sf); + return no_srcfile; +} + +static int64_t +sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right) +{ + if (!left->srcfile) { + if (!left->ms.map) + left->srcfile = no_srcfile; + else + left->srcfile = get_srcfile(left); + } + if (!right->srcfile) { + if (!right->ms.map) + right->srcfile = no_srcfile; + else + right->srcfile = get_srcfile(right); + } + return strcmp(right->srcfile, left->srcfile); +} + +static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcfile); +} + +struct sort_entry sort_srcfile = { + .se_header = "Source File", + .se_cmp = sort__srcfile_cmp, + .se_snprintf = hist_entry__srcfile_snprintf, + .se_width_idx = HISTC_SRCFILE, +}; + /* --sort parent */ static int64_t @@ -1196,6 +1247,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_PARENT, "parent", sort_parent), DIM(SORT_CPU, "cpu", sort_cpu), DIM(SORT_SRCLINE, "srcline", sort_srcline), + DIM(SORT_SRCFILE, "srcfile", sort_srcfile), DIM(SORT_LOCAL_WEIGHT, "local_weight", sort_local_weight), DIM(SORT_GLOBAL_WEIGHT, "weight", sort_global_weight), DIM(SORT_TRANSACTION, "transaction", sort_transaction), diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index bc6c87a..3c2a399 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -114,6 +114,7 @@ struct hist_entry { }; }; char *srcline; + char *srcfile; struct symbol *parent; struct rb_root sorted_chain; struct branch_info *branch_info; @@ -172,6 +173,7 @@ enum sort_type { SORT_PARENT, SORT_CPU, SORT_SRCLINE, + SORT_SRCFILE, SORT_LOCAL_WEIGHT, SORT_GLOBAL_WEIGHT, SORT_TRANSACTION, -- cgit v0.10.2 From 09af2a553577a6e53e40011a910be0f27ce56f3e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 9 Aug 2015 15:45:23 +0900 Subject: perf record: Support per-event freq term Now perf can set per-event value of time and (sampling) period. 
But I guess most users like me just want to set frequency rather than period. So add the 'freq' term in the event parser. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: David Ahern Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1439102724-14079-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 0d852d1..afbe45e 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -49,6 +49,7 @@ OPTIONS These params can be used to overload default config values per event. Here is a list of the params. - 'period': Set event sampling period + - 'freq': Set event sampling frequency - 'time': Disable/enable time stamping. Acceptable values are 1 for enabling time stamping. 0 for disabling time stamping. The default is 1. diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a59710f..f664a22 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -600,6 +600,10 @@ static void apply_config_terms(struct perf_evsel *evsel) attr->sample_period = term->val.period; attr->freq = 0; break; + case PERF_EVSEL__CONFIG_TERM_FREQ: + attr->sample_freq = term->val.freq; + attr->freq = 1; + break; case PERF_EVSEL__CONFIG_TERM_TIME: if (term->val.time) perf_evsel__set_sample_bit(evsel, TIME); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index b948f69..fdf2674 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -39,6 +39,7 @@ struct cgroup_sel; */ enum { PERF_EVSEL__CONFIG_TERM_PERIOD, + PERF_EVSEL__CONFIG_TERM_FREQ, PERF_EVSEL__CONFIG_TERM_TIME, PERF_EVSEL__CONFIG_TERM_MAX, }; @@ -48,6 +49,7 @@ struct perf_evsel_config_term { int type; union { u64 period; + u64 freq; bool time; } val; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 828936d..dbf315d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -597,6 +597,9 @@ do { \ case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: CHECK_TYPE_VAL(NUM); break; + case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ: + CHECK_TYPE_VAL(NUM); + break; case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE: /* * TODO uncomment when the field is available @@ -659,6 +662,9 @@ do { \ case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: ADD_CONFIG_TERM(PERIOD, period, term->val.num); break; + case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ: + ADD_CONFIG_TERM(FREQ, freq, term->val.num); + break; case PARSE_EVENTS__TERM_TYPE_TIME: ADD_CONFIG_TERM(TIME, time, term->val.num); break; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index e6f9aacc..ce2d13a 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -62,6 +62,7 @@ enum { PARSE_EVENTS__TERM_TYPE_CONFIG2, PARSE_EVENTS__TERM_TYPE_NAME, PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD, + PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE, PARSE_EVENTS__TERM_TYPE_TIME, }; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index f542750..4306f5a 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -182,6 +182,7 @@ config1 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); } config2 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG2); } name { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NAME); } period { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD); } +freq { return term(yyscanner, 
PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ); } branch_type { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE); } time { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_TIME); } , { return ','; } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index d4b0e64..d85f11b 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -634,7 +634,7 @@ static char *formats_error_string(struct list_head *formats) { struct perf_pmu_format *format; char *err, *str; - static const char *static_terms = "config,config1,config2,name,period,branch_type,time\n"; + static const char *static_terms = "config,config1,config2,name,period,freq,branch_type,time\n"; unsigned i = 0; if (!asprintf(&str, "valid terms:")) -- cgit v0.10.2 From 4605bb55b91449a1a953a51f0334d3bc02351adb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 9 Aug 2015 15:45:24 +0900 Subject: perf evlist: Be more specific on -F/--freq Currently perf evlist -F shows the number as if it's always sampling frequency. But we now support per-event freq/period settings. So it'd better to show more detailed info whether it's freq or period. $ perf record -e 'cpu/config=1/,cpu/config=2,period=300000/' sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.017 MB perf.data ] $ perf evlist -F cpu/config=1/: sample_freq=4000 cpu/config=2,period=300000/: sample_period=300000 Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1439102724-14079-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f664a22..04fdddd 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2158,8 +2158,13 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, printed += perf_event_attr__fprintf(fp, &evsel->attr, __print_attr__fprintf, &first); } else if (details->freq) { - printed += comma_fprintf(fp, &first, " sample_freq=%" PRIu64, - (u64)evsel->attr.sample_freq); + const char *term = "sample_freq"; + + if (!evsel->attr.freq) + term = "sample_period"; + + printed += comma_fprintf(fp, &first, " %s=%" PRIu64, + term, (u64)evsel->attr.sample_freq); } out: fputc('\n', fp); -- cgit v0.10.2 From 19b3340cf58d14decf2898fc795cc2b1fa49e79e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2015 17:26:58 +0200 Subject: perf/x86: Fix MSR PMU driver Currently we only update the sysfs event files per available MSR, we didn't actually disallow creating unlisted events. Rework things such that the dectection, sysfs listing and event creation are better coordinated. Sadly it appears it's impossible to probe R/O MSRs under virt. This means we have to do the full model table to avoid listing all MSRs all the time. 
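The shape of the rework is a classic probe table: each candidate MSR carries a test callback, and only entries whose callback accepts the running CPU keep their sysfs attribute and pass event_init(). The pattern in miniature (hypothetical names; the real table is the msr[] array in the diff below):

	struct msr_probe {
		unsigned int msr;
		bool (*test)(int idx);	/* NULL or false: not available */
	};

	/*
	 * Compact the attribute list down to the probed entries, so
	 * unavailable events are neither listed in sysfs nor creatable.
	 */
	static int filter_available(struct msr_probe *tab,
				    const struct msr_probe **attrs, int n)
	{
		int i, j = 0;

		for (i = 0; i < n; i++)
			if (tab[i].test && tab[i].test(i))
				attrs[j++] = &tab[i];
		attrs[j] = NULL;	/* sysfs attr lists are NULL-terminated */
		return j;
	}

Probing rather than hard-listing matters here because, as the message notes, R/O MSRs cannot be probed under virtualization, which forces the per-model test_intel() switch seen in the diff.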
Tested-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c index af216e9..b0dd2e8 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -10,17 +10,63 @@ enum perf_msr_id { PERF_MSR_EVENT_MAX, }; +bool test_aperfmperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_APERFMPERF); +} + +bool test_intel(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_MSR_SMI) + return true; + break; + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) + return true; + break; + } + + return false; +} + struct perf_msr { - int id; u64 msr; -}; - -static struct perf_msr msr[] = { - { PERF_MSR_TSC, 0 }, - { PERF_MSR_APERF, MSR_IA32_APERF }, - { PERF_MSR_MPERF, MSR_IA32_MPERF }, - { PERF_MSR_PPERF, MSR_PPERF }, - { PERF_MSR_SMI, MSR_SMI_COUNT }, + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); }; PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); @@ -29,8 +75,16 @@ PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); +static struct perf_msr msr[] = { + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, +}; + static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { - &evattr_tsc.attr.attr, + NULL, }; static struct attribute_group events_attr_group = { @@ -74,6 +128,9 @@ static int msr_event_init(struct perf_event *event) event->attr.sample_period) /* no sampling */ return -EINVAL; + if (!msr[cfg].attr) + return -EINVAL; + event->hw.idx = -1; event->hw.event_base = msr[cfg].msr; event->hw.config = cfg; @@ -151,89 +208,32 @@ static struct pmu pmu_msr = { .capabilities = PERF_PMU_CAP_NO_INTERRUPT, }; -static int __init intel_msr_init(int idx) -{ - if (boot_cpu_data.x86 != 6) - return 0; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm 
SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - events_attrs[idx++] = &evattr_smi.attr.attr; - break; - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - events_attrs[idx++] = &evattr_pperf.attr.attr; - events_attrs[idx++] = &evattr_smi.attr.attr; - break; - - case 55: /* 22nm Atom "Silvermont" */ - case 76: /* 14nm Atom "Airmont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - events_attrs[idx++] = &evattr_smi.attr.attr; - break; - } - - events_attrs[idx] = NULL; - - return 0; -} - -static int __init amd_msr_init(int idx) -{ - return 0; -} - static int __init msr_init(void) { - int err; - int idx = 1; + int i, j = 0; - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) { - events_attrs[idx++] = &evattr_aperf.attr.attr; - events_attrs[idx++] = &evattr_mperf.attr.attr; - events_attrs[idx] = NULL; + if (!boot_cpu_has(X86_FEATURE_TSC)) { + pr_cont("no MSR PMU driver.\n"); + return 0; } - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - err = intel_msr_init(idx); - break; - - case X86_VENDOR_AMD: - err = amd_msr_init(idx); - break; + /* Probe the MSRs. */ + for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { + u64 val; - default: - err = -ENOTSUPP; + /* + * Virt sucks arse; you cannot tell if a R/O MSR is present :/ + */ + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; } - if (err != 0) { - pr_cont("no msr PMU driver.\n"); - return 0; + /* List remaining MSRs in the sysfs attrs. */ + for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; } + events_attrs[j] = NULL; perf_pmu_register(&pmu_msr, "msr", -1); -- cgit v0.10.2 From c2ad6b51efc5f27d70ce952decd2a15679b83600 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 28 Jul 2015 09:00:04 +0300 Subject: perf/ring-buffer: Clarify the use of page::private for high-order AUX allocations A question [1] was raised about the use of page::private in AUX buffer allocations, so let's add a clarification about its intended use. The private field and flag are used by perf's rb_alloc_aux() path to tell the pmu driver the size of each high-order allocation, so that the driver can program those appropriately into its hardware. This only matters for PMUs that don't support hardware scatter tables. Otherwise, every page in the buffer is just a page. This patch adds a comment about the private field to the AUX buffer allocation path. 
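For the driver side of that contract, a minimal sketch (assuming a PMU without hardware scatter tables; the helper name is made up for illustration):

	#include <linux/mm.h>

	/* Recover the order rb_alloc_aux() stored in a chunk's first page. */
	static int aux_page_order(struct page *page)
	{
		/* !PagePrivate(page) means it's just a normal page */
		return PagePrivate(page) ? (int)page_private(page) : 0;
	}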
[1] http://marc.info/?l=linux-kernel&m=143803696607968 Reported-by: Mathieu Poirier Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438063204-665-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c8aa3f7..182bc30 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order) if (page && order) { /* - * Communicate the allocation size to the driver + * Communicate the allocation size to the driver: + * if we managed to secure a high-order allocation, + * set its first page's private to this order; + * !PagePrivate(page) means it's just a normal page. */ split_page(page, order); SetPagePrivate(page); -- cgit v0.10.2 From 709bc871923c12b284424f9d47b99dc975ba8b29 Mon Sep 17 00:00:00 2001 From: Takao Indoh Date: Tue, 4 Aug 2015 18:36:55 +0900 Subject: perf/x86/intel/pt: Clean up files of Intel Processor Trace This patch just cleans up some files of Intel Processor Trace; it does not change its behavior. This patch removes unused definitions and replaces a constant value with a macro. Signed-off-by: Takao Indoh Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: H.Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438681015-5124-1-git-send-email-indou.takao@jp.fujitsu.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h index feb293e..336878a 100644 --- a/arch/x86/kernel/cpu/intel_pt.h +++ b/arch/x86/kernel/cpu/intel_pt.h @@ -25,32 +25,11 @@ */ #define TOPA_PMI_MARGIN 512 -/* - * Table of Physical Addresses bits - */ -enum topa_sz { - TOPA_4K = 0, - TOPA_8K, - TOPA_16K, - TOPA_32K, - TOPA_64K, - TOPA_128K, - TOPA_256K, - TOPA_512K, - TOPA_1MB, - TOPA_2MB, - TOPA_4MB, - TOPA_8MB, - TOPA_16MB, - TOPA_32MB, - TOPA_64MB, - TOPA_128MB, - TOPA_SZ_END, -}; +#define TOPA_SHIFT 12 -static inline unsigned int sizes(enum topa_sz tsz) +static inline unsigned int sizes(unsigned int tsz) { - return 1 << (tsz + 12); + return 1 << (tsz + TOPA_SHIFT); }; struct topa_entry { @@ -66,8 +45,8 @@ struct topa_entry { u64 rsvd4 : 16; }; -#define TOPA_SHIFT 12 -#define PT_CPUID_LEAVES 2 +#define PT_CPUID_LEAVES 2 +#define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */ enum pt_capabilities { PT_CAP_max_subleaf = 0, @@ -85,7 +64,7 @@ enum pt_capabilities { struct pt_pmu { struct pmu pmu; - u32 caps[4 * PT_CPUID_LEAVES]; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; }; /** diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index e20cfac..4216928 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -79,7 +79,7 @@ static struct pt_cap_desc { static u32 pt_cap_get(enum pt_capabilities cap) { struct pt_cap_desc *cd = &pt_caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg]; + u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; @@ -145,10 +145,10 @@ static int __init pt_pmu_hw_init(void) for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*4], - &pt_pmu.caps[CR_EBX + i*4], - &pt_pmu.caps[CR_ECX + i*4], -
&pt_pmu.caps[CR_EDX + i*4]); + &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); } ret = -ENOMEM; -- cgit v0.10.2 From 090389b6d95c046cc6caca2e7e00adc27829acf2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 10 Aug 2015 19:20:52 -0300 Subject: perf trace: Associate some more syscall args with the getname beautifier This time using 'trinity' to test these: fchmodat, futimesat, llistxattr, lremovexattr, lstat, mknodat, mq_unlink, stat and vmsplice. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-a1uqu249nwwh0ixrhm80k4a4@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index a25048c..7cdcbee 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1014,7 +1014,8 @@ static struct syscall_fmt { { .name = "fchmod", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "fchmodat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ + [1] = SCA_FILENAME, /* filename */ }, }, { .name = "fchown", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "fchownat", .errmsg = true, @@ -1045,7 +1046,8 @@ static struct syscall_fmt { { .name = "futex", .errmsg = true, .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, }, { .name = "futimesat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ + [1] = SCA_FILENAME, /* filename */ }, }, { .name = "getdents", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "getdents64", .errmsg = true, @@ -1078,13 +1080,18 @@ static struct syscall_fmt { .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, { .name = "listxattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, + { .name = "llistxattr", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, + { .name = "lremovexattr", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "lseek", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ [2] = SCA_STRARRAY, /* whence */ }, .arg_parm = { [2] = &strarray__whences, /* whence */ }, }, { .name = "lsetxattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, - { .name = "lstat", .errmsg = true, .alias = "newlstat", }, + { .name = "lstat", .errmsg = true, .alias = "newlstat", + .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "lsxattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "madvise", .errmsg = true, @@ -1098,7 +1105,8 @@ static struct syscall_fmt { { .name = "mknod", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "mknodat", .errmsg = true, - .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, + .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ + [1] = SCA_FILENAME, /* filename */ }, }, { .name = "mlock", .errmsg = true, .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, { .name = "mlockall", .errmsg = true, @@ -1111,6 +1119,8 @@ static struct syscall_fmt { { .name = "mprotect", .errmsg = true, .arg_scnprintf = { [0] = SCA_HEX, /* start */ [2] = SCA_MMAP_PROT, /* prot */ }, }, + { .name = "mq_unlink", 
.errmsg = true, + .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, }, { .name = "mremap", .hexret = true, .arg_scnprintf = { [0] = SCA_HEX, /* addr */ [3] = SCA_MREMAP_FLAGS, /* flags */ @@ -1201,7 +1211,8 @@ static struct syscall_fmt { .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */ [1] = SCA_SK_TYPE, /* type */ }, .arg_parm = { [0] = &strarray__socket_families, /* family */ }, }, - { .name = "stat", .errmsg = true, .alias = "newstat", }, + { .name = "stat", .errmsg = true, .alias = "newstat", + .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "statfs", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "swapoff", .errmsg = true, @@ -1227,6 +1238,8 @@ static struct syscall_fmt { [1] = SCA_FILENAME, /* filename */ }, }, { .name = "utimes", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, + { .name = "vmsplice", .errmsg = true, + .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "write", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "writev", .errmsg = true, -- cgit v0.10.2 From 28ebb87c73f9c17266d4f7570d65a1e695df7d9d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2015 10:38:38 -0300 Subject: perf trace: Add missing clockid entries We were missing: CLOCK_BOOTTIME, CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALARM, CLOCK_SGI_CYCLE and CLOCK_TAI. Add them. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-d67rwqtwm9jyenwes98kr0cr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 7cdcbee..5c494a8 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -599,7 +599,8 @@ static DEFINE_STRARRAY(sighow); static const char *clockid[] = { "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID", - "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", + "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME", + "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI" }; static DEFINE_STRARRAY(clockid); -- cgit v0.10.2 From 76b10655818c939e257377f83992975a5f55ffb3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 11 Aug 2015 06:36:55 -0700 Subject: perf sort: Check for SRCLINE_UNKNOWN case in "srcfile" processing Handle the SRCLINE_UNKNOWN case correctly when processing "srcfile". Committer note: We can't just free it, as it wasn't allocated via malloc; it's a guard variable. Reported-by: Namhyung Kim Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/20150811133655.GC4524@tassilo.jf.intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index c0c32b0..7e38716 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -330,6 +330,8 @@ static char *get_srcfile(struct hist_entry *e) sf = get_srcline(map->dso, map__rip_2objdump(map, e->ip), e->ms.sym, true); + if (!strcmp(sf, SRCLINE_UNKNOWN)) + return no_srcfile; p = strchr(sf, ':'); if (p && *sf) { *p = 0; -- cgit v0.10.2 From 8d8c66a248dfb9a7e517822b5ef87e357b1605bf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2015 10:57:02 -0300 Subject: perf trace: Use the FD beautifier for socket syscall fds But we really should have something like 'strace -yy' here...
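For reference, what an fd beautifier boils down to conceptually, as a user-space sketch (perf trace itself caches a per-thread fd table rather than hitting /proc on every event; the fd_name() helper below is illustrative only):

	#include <limits.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	/* Ask /proc what a file descriptor points to. */
	static ssize_t fd_name(pid_t pid, int fd, char *buf, size_t size)
	{
		char link[64];
		ssize_t n;

		snprintf(link, sizeof(link), "/proc/%d/fd/%d", pid, fd);
		n = readlink(link, buf, size - 1);
		if (n > 0)
			buf[n] = '\0';	/* sockets read back as "socket:[inode]" */
		return n;
	}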
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-eyrt1ypfq68u4ljagyk2nj1i@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 5c494a8..12fc6df 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1173,11 +1173,14 @@ static struct syscall_fmt { { .name = "readv", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "recvfrom", .errmsg = true, - .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, + .arg_scnprintf = { [0] = SCA_FD, /* fd */ + [3] = SCA_MSG_FLAGS, /* flags */ }, }, { .name = "recvmmsg", .errmsg = true, - .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, + .arg_scnprintf = { [0] = SCA_FD, /* fd */ + [3] = SCA_MSG_FLAGS, /* flags */ }, }, { .name = "recvmsg", .errmsg = true, - .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, }, + .arg_scnprintf = { [0] = SCA_FD, /* fd */ + [2] = SCA_MSG_FLAGS, /* flags */ }, }, { .name = "removexattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, { .name = "renameat", .errmsg = true, @@ -1193,11 +1196,14 @@ static struct syscall_fmt { .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, { .name = "select", .errmsg = true, .timeout = true, }, { .name = "sendmmsg", .errmsg = true, - .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, + .arg_scnprintf = { [0] = SCA_FD, /* fd */ + [3] = SCA_MSG_FLAGS, /* flags */ }, }, { .name = "sendmsg", .errmsg = true, - .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, }, + .arg_scnprintf = { [0] = SCA_FD, /* fd */ + [2] = SCA_MSG_FLAGS, /* flags */ }, }, { .name = "sendto", .errmsg = true, - .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, + .arg_scnprintf = { [0] = SCA_FD, /* fd */ + [3] = SCA_MSG_FLAGS, /* flags */ }, }, { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), }, { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, { .name = "setxattr", .errmsg = true, -- cgit v0.10.2 From b62bee1bdea6d78e444183b04c81ce982a371571 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2015 11:05:36 -0300 Subject: perf trace: Beautify keyctl's option arg 8.697 (0.103 ms): pool/2343 keyctl(option: GET_PERSISTENT, arg2: 1000, arg3: 4294967294, arg4: 140703061514067, arg5: 140703692383680) = 1023192809 8.763 (0.049 ms): pool/2343 keyctl(option: SEARCH, arg2: 1023192809, arg3: 140703745767772, arg4: 140703745767832, arg5: 4294967294) = 140224497 8.789 (0.016 ms): pool/2343 keyctl(option: SEARCH, arg2: 140224497, arg3: 140703745767814, arg4: 140703745767900) = 512300257 8.807 (0.011 ms): pool/2343 keyctl(option: READ, arg2: 512300257 ) = 13 8.822 (0.008 ms): pool/2343 keyctl(option: READ, arg2: 512300257, arg3: 140703061514000, arg4: 13 ) = 13 8.837 (0.007 ms): pool/2343 keyctl(option: READ, arg2: 140224497 ) = 4 8.852 (0.009 ms): pool/2343 keyctl(option: READ, arg2: 140224497, arg3: 140703061514000, arg4: 4 ) = 4 8.869 (0.010 ms): pool/2343 keyctl(option: SEARCH, arg2: 140224497, arg3: 140703745767772, arg4: 140703061514032) = -1 ENOKEY Required key not available 8.892 (0.017 ms): pool/2343 keyctl(option: DESCRIBE, arg2: 512300257 ) = 43 8.910 (0.012 ms): pool/2343 keyctl(option: DESCRIBE, arg2: 512300257, arg3: 140703061544384, arg4: 43) = 43 Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim 
Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-013ab219irsxngyumrf5gp8s@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 12fc6df..489cc11 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -569,6 +569,15 @@ static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1); static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", }; static DEFINE_STRARRAY(itimers); +static const char *keyctl_options[] = { + "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN", + "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ", + "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT", + "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT", + "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT", +}; +static DEFINE_STRARRAY(keyctl_options); + static const char *whences[] = { "SET", "CUR", "END", #ifdef SEEK_DATA "DATA", @@ -1071,6 +1080,7 @@ static struct syscall_fmt { #else [2] = SCA_HEX, /* arg */ }, }, #endif + { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), }, { .name = "kill", .errmsg = true, .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, { .name = "lchown", .errmsg = true, -- cgit v0.10.2 From 26270a008251ec36431623bd992252934bbe529a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2015 12:24:27 -0300 Subject: perf ui browser: Introduce ui_browser__write_nstring() To remove direct access to libslang functions, with the immediate goal of implementing horizontal scrolling at the ui_browser level, but also because we may at some point want to implement ui_browser with other UIs in addition to the current libslang implementation. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-437ineavoejzou727mr9bxpi@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index 6680fa5..5c730ab 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -46,6 +46,12 @@ void ui_browser__gotorc(struct ui_browser *browser, int y, int x) SLsmg_gotorc(browser->y + y, browser->x + x); } +void ui_browser__write_nstring(struct ui_browser *browser __maybe_unused, const char *msg, + unsigned int width) +{ + slsmg_write_nstring(msg, width); +} + static struct list_head * ui_browser__list_head_filter_entries(struct ui_browser *browser, struct list_head *pos) @@ -234,7 +240,7 @@ void __ui_browser__show_title(struct ui_browser *browser, const char *title) { SLsmg_gotorc(0, 0); ui_browser__set_color(browser, HE_COLORSET_ROOT); - slsmg_write_nstring(title, browser->width + 1); + ui_browser__write_nstring(browser, title, browser->width + 1); } void ui_browser__show_title(struct ui_browser *browser, const char *title) diff --git a/tools/perf/ui/browser.h b/tools/perf/ui/browser.h index 92ae721..ba276c7 100644 --- a/tools/perf/ui/browser.h +++ b/tools/perf/ui/browser.h @@ -37,6 +37,8 @@ void ui_browser__refresh_dimensions(struct ui_browser *browser); void ui_browser__reset_index(struct ui_browser *browser); void ui_browser__gotorc(struct ui_browser *browser, int y, int x); +void ui_browser__write_nstring(struct ui_browser *browser, const char *msg, + unsigned int width); void ui_browser__write_graph(struct ui_browser *browser, int graph); void __ui_browser__line_arrow(struct ui_browser *browser, unsigned int column, u64 start, u64 end); diff --git 
a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index b5fc847..c14f8f9 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -141,22 +141,22 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int slsmg_printf("%6.2f ", bdl->samples[i].percent); } } else { - slsmg_write_nstring(" ", 7 * ab->nr_events); + ui_browser__write_nstring(browser, " ", 7 * ab->nr_events); } } else { ui_browser__set_percent_color(browser, 0, current_entry); - slsmg_write_nstring(" ", 7 * ab->nr_events); + ui_browser__write_nstring(browser, " ", 7 * ab->nr_events); } if (ab->have_cycles) { if (dl->ipc) slsmg_printf("%*.2f ", IPC_WIDTH - 1, dl->ipc); else - slsmg_write_nstring(" ", IPC_WIDTH); + ui_browser__write_nstring(browser, " ", IPC_WIDTH); if (dl->cycles) slsmg_printf("%*" PRIu64 " ", CYCLES_WIDTH - 1, dl->cycles); else - slsmg_write_nstring(" ", CYCLES_WIDTH); + ui_browser__write_nstring(browser, " ", CYCLES_WIDTH); } SLsmg_write_char(' '); @@ -166,7 +166,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int width += 1; if (!*dl->line) - slsmg_write_nstring(" ", width - pcnt_width); + ui_browser__write_nstring(browser, " ", width - pcnt_width); else if (dl->offset == -1) { if (dl->line_nr && annotate_browser__opts.show_linenr) printed = scnprintf(bf, sizeof(bf), "%-*d ", @@ -174,8 +174,8 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int else printed = scnprintf(bf, sizeof(bf), "%*s ", ab->addr_width, " "); - slsmg_write_nstring(bf, printed); - slsmg_write_nstring(dl->line, width - printed - pcnt_width + 1); + ui_browser__write_nstring(browser, bf, printed); + ui_browser__write_nstring(browser, dl->line, width - printed - pcnt_width + 1); } else { u64 addr = dl->offset; int color = -1; @@ -194,7 +194,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int bdl->jump_sources); prev = annotate_browser__set_jumps_percent_color(ab, bdl->jump_sources, current_entry); - slsmg_write_nstring(bf, printed); + ui_browser__write_nstring(browser, bf, printed); ui_browser__set_color(browser, prev); } @@ -208,7 +208,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int if (change_color) color = ui_browser__set_color(browser, HE_COLORSET_ADDR); - slsmg_write_nstring(bf, printed); + ui_browser__write_nstring(browser, bf, printed); if (change_color) ui_browser__set_color(browser, color); if (dl->ins && dl->ins->ops->scnprintf) { @@ -222,11 +222,11 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int ui_browser__write_graph(browser, SLSMG_RARROW_CHAR); SLsmg_write_char(' '); } else { - slsmg_write_nstring(" ", 2); + ui_browser__write_nstring(browser, " ", 2); } } else { if (strcmp(dl->name, "retq")) { - slsmg_write_nstring(" ", 2); + ui_browser__write_nstring(browser, " ", 2); } else { ui_browser__write_graph(browser, SLSMG_LARROW_CHAR); SLsmg_write_char(' '); @@ -234,7 +234,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int } disasm_line__scnprintf(dl, bf, sizeof(bf), !annotate_browser__opts.use_offset); - slsmg_write_nstring(bf, width - pcnt_width - 3 - printed); + ui_browser__write_nstring(browser, bf, width - pcnt_width - 3 - printed); } if (current_entry) diff --git a/tools/perf/ui/browsers/header.c b/tools/perf/ui/browsers/header.c index e8278c5..f106817 100644 --- a/tools/perf/ui/browsers/header.c +++ b/tools/perf/ui/browsers/header.c @@ -25,7 
+25,7 @@ static void ui_browser__argv_write(struct ui_browser *browser, ui_browser__set_color(browser, current_entry ? HE_COLORSET_SELECTED : HE_COLORSET_NORMAL); - slsmg_write_nstring(str, browser->width); + ui_browser__write_nstring(browser, str, browser->width); } static int list_menu__run(struct ui_browser *menu) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index fa67613..9e94dc8 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -540,10 +540,10 @@ static void hist_browser__show_callchain_entry(struct hist_browser *browser, ui_browser__set_color(&browser->b, color); hist_browser__gotorc(browser, row, 0); - slsmg_write_nstring(" ", offset); + ui_browser__write_nstring(&browser->b, " ", offset); slsmg_printf("%c", folded_sign); ui_browser__write_graph(&browser->b, show_annotated ? SLSMG_RARROW_CHAR : ' '); - slsmg_write_nstring(str, width); + ui_browser__write_nstring(&browser->b, str, width); } static void hist_browser__fprintf_callchain_entry(struct hist_browser *b __maybe_unused, @@ -822,7 +822,7 @@ static int hist_browser__show_entry(struct hist_browser *browser, if (!browser->b.navkeypressed) width += 1; - slsmg_write_nstring("", width); + ui_browser__write_nstring(&browser->b, "", width); ++row; ++printed; @@ -899,7 +899,7 @@ static void hist_browser__show_headers(struct hist_browser *browser) hists__scnprintf_headers(headers, sizeof(headers), browser->hists); ui_browser__gotorc(&browser->b, 0, 0); ui_browser__set_color(&browser->b, HE_COLORSET_ROOT); - slsmg_write_nstring(headers, browser->b.width + 1); + ui_browser__write_nstring(&browser->b, headers, browser->b.width + 1); } static void ui_browser__hists_init_top(struct ui_browser *browser) @@ -2057,7 +2057,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser, warn = bf; } - slsmg_write_nstring(warn, browser->width - printed); + ui_browser__write_nstring(browser, warn, browser->width - printed); if (current_entry) menu->selection = evsel; diff --git a/tools/perf/ui/browsers/map.c b/tools/perf/ui/browsers/map.c index b11639f..fb1ae24 100644 --- a/tools/perf/ui/browsers/map.c +++ b/tools/perf/ui/browsers/map.c @@ -32,7 +32,7 @@ static void map_browser__write(struct ui_browser *browser, void *nd, int row) sym->binding == STB_LOCAL ? 'l' : 'w'); width = browser->width - ((mb->addrlen * 2) + 4); if (width > 0) - slsmg_write_nstring(sym->name, width); + ui_browser__write_nstring(browser, sym->name, width); } /* FIXME uber-kludgy, see comment on cmd_report... */ diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c index 402d2bd..e13b48d 100644 --- a/tools/perf/ui/browsers/scripts.c +++ b/tools/perf/ui/browsers/scripts.c @@ -81,7 +81,7 @@ static void script_browser__write(struct ui_browser *browser, ui_browser__set_color(browser, current_entry ? HE_COLORSET_SELECTED : HE_COLORSET_NORMAL); - slsmg_write_nstring(sline->line, browser->width); + ui_browser__write_nstring(browser, sline->line, browser->width); } static int script_browser__run(struct perf_script_browser *browser) diff --git a/tools/perf/ui/tui/util.c b/tools/perf/ui/tui/util.c index bf890f7..d96ad7c 100644 --- a/tools/perf/ui/tui/util.c +++ b/tools/perf/ui/tui/util.c @@ -21,7 +21,7 @@ static void ui_browser__argv_write(struct ui_browser *browser, ui_browser__set_color(browser, current_entry ? 
HE_COLORSET_SELECTED : HE_COLORSET_NORMAL); - slsmg_write_nstring(*arg, browser->width); + ui_browser__write_nstring(browser, *arg, browser->width); } static int popup_menu__run(struct ui_browser *menu) -- cgit v0.10.2 From 517dfdb315fa2da9b144f6fa494b4f54b490de7e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Aug 2015 12:50:55 -0300 Subject: perf ui browser: Introduce ui_browser__printf() To remove direct access to libslang functions, with the immediate goal of implementing horizontal scrolling at the ui_browser level, but also because we may at some point want to implement ui_browser with other UIs in addition to the current libslang implementation. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-w0niblabqrkecs4o0eogfy6c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index 5c730ab..c6c7e51 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -52,6 +52,15 @@ void ui_browser__write_nstring(struct ui_browser *browser __maybe_unused, const slsmg_write_nstring(msg, width); } +void ui_browser__printf(struct ui_browser *browser __maybe_unused, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + slsmg_vprintf(fmt, args); + va_end(args); +} + static struct list_head * ui_browser__list_head_filter_entries(struct ui_browser *browser, struct list_head *pos) diff --git a/tools/perf/ui/browser.h b/tools/perf/ui/browser.h index ba276c7..52be871 100644 --- a/tools/perf/ui/browser.h +++ b/tools/perf/ui/browser.h @@ -39,6 +39,7 @@ void ui_browser__reset_index(struct ui_browser *browser); void ui_browser__gotorc(struct ui_browser *browser, int y, int x); void ui_browser__write_nstring(struct ui_browser *browser, const char *msg, unsigned int width); +void ui_browser__printf(struct ui_browser *browser, const char *fmt, ...); void ui_browser__write_graph(struct ui_browser *browser, int graph); void __ui_browser__line_arrow(struct ui_browser *browser, unsigned int column, u64 start, u64 end); diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index c14f8f9..29739b3 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -1,7 +1,6 @@ #include "../../util/util.h" #include "../browser.h" #include "../helpline.h" -#include "../libslang.h" #include "../ui.h" #include "../util.h" #include "../../util/annotate.h" @@ -134,11 +133,13 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int ui_browser__set_percent_color(browser, bdl->samples[i].percent, current_entry); - if (annotate_browser__opts.show_total_period) - slsmg_printf("%6" PRIu64 " ", - bdl->samples[i].nr); - else - slsmg_printf("%6.2f ", bdl->samples[i].percent); + if (annotate_browser__opts.show_total_period) { + ui_browser__printf(browser, "%6" PRIu64 " ", + bdl->samples[i].nr); + } else { + ui_browser__printf(browser, "%6.2f ", + bdl->samples[i].percent); + } } } else { ui_browser__write_nstring(browser, " ", 7 * ab->nr_events); @@ -149,12 +150,12 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int } if (ab->have_cycles) { if (dl->ipc) - slsmg_printf("%*.2f ", IPC_WIDTH - 1, dl->ipc); + ui_browser__printf(browser, "%*.2f ", IPC_WIDTH - 1, dl->ipc); else ui_browser__write_nstring(browser, " ", IPC_WIDTH); if (dl->cycles) - slsmg_printf("%*" PRIu64 " ", - CYCLES_WIDTH - 1, dl->cycles); + 
ui_browser__printf(browser, "%*" PRIu64 " ", + CYCLES_WIDTH - 1, dl->cycles); else ui_browser__write_nstring(browser, " ", CYCLES_WIDTH); } diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 9e94dc8..2de53d6 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1,5 +1,4 @@ #include -#include "../libslang.h" #include #include #include @@ -541,7 +540,7 @@ static void hist_browser__show_callchain_entry(struct hist_browser *browser, ui_browser__set_color(&browser->b, color); hist_browser__gotorc(browser, row, 0); ui_browser__write_nstring(&browser->b, " ", offset); - slsmg_printf("%c", folded_sign); + ui_browser__printf(&browser->b, "%c", folded_sign); ui_browser__write_graph(&browser->b, show_annotated ? SLSMG_RARROW_CHAR : ' '); ui_browser__write_nstring(&browser->b, str, width); } @@ -680,7 +679,7 @@ static int __hpp__slsmg_color_printf(struct perf_hpp *hpp, const char *fmt, ...) ui_browser__set_percent_color(arg->b, percent, arg->current_entry); ret = scnprintf(hpp->buf, hpp->size, fmt, len, percent); - slsmg_printf("%s", hpp->buf); + ui_browser__printf(arg->b, "%s", hpp->buf); advance_hpp(hpp, ret); return ret; @@ -713,10 +712,11 @@ hist_browser__hpp_color_##_type(struct perf_hpp_fmt *fmt, \ struct hist_entry *he) \ { \ if (!symbol_conf.cumulate_callchain) { \ + struct hpp_arg *arg = hpp->ptr; \ int len = fmt->user_len ?: fmt->len; \ int ret = scnprintf(hpp->buf, hpp->size, \ "%*s", len, "N/A"); \ - slsmg_printf("%s", hpp->buf); \ + ui_browser__printf(arg->b, "%s", hpp->buf); \ \ return ret; \ } \ @@ -801,12 +801,12 @@ static int hist_browser__show_entry(struct hist_browser *browser, if (first) { if (symbol_conf.use_callchain) { - slsmg_printf("%c ", folded_sign); + ui_browser__printf(&browser->b, "%c ", folded_sign); width -= 2; } first = false; } else { - slsmg_printf(" "); + ui_browser__printf(&browser->b, " "); width -= 2; } @@ -814,7 +814,7 @@ static int hist_browser__show_entry(struct hist_browser *browser, width -= fmt->color(fmt, &hpp, entry); } else { width -= fmt->entry(fmt, &hpp, entry); - slsmg_printf("%s", s); + ui_browser__printf(&browser->b, "%s", s); } } @@ -2044,7 +2044,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser, nr_events = convert_unit(nr_events, &unit); printed = scnprintf(bf, sizeof(bf), "%lu%c%s%s", nr_events, unit, unit == ' ' ? "" : " ", ev_name); - slsmg_printf("%s", bf); + ui_browser__printf(browser, "%s", bf); nr_events = hists->stats.nr_events[PERF_RECORD_LOST]; if (nr_events != 0) { diff --git a/tools/perf/ui/browsers/map.c b/tools/perf/ui/browsers/map.c index fb1ae24..8c154c7 100644 --- a/tools/perf/ui/browsers/map.c +++ b/tools/perf/ui/browsers/map.c @@ -1,4 +1,3 @@ -#include "../libslang.h" #include #include #include @@ -26,10 +25,10 @@ static void map_browser__write(struct ui_browser *browser, void *nd, int row) int width; ui_browser__set_percent_color(browser, 0, current_entry); - slsmg_printf("%*" PRIx64 " %*" PRIx64 " %c ", - mb->addrlen, sym->start, mb->addrlen, sym->end, - sym->binding == STB_GLOBAL ? 'g' : - sym->binding == STB_LOCAL ? 'l' : 'w'); + ui_browser__printf(browser, "%*" PRIx64 " %*" PRIx64 " %c ", + mb->addrlen, sym->start, mb->addrlen, sym->end, + sym->binding == STB_GLOBAL ? 'g' : + sym->binding == STB_LOCAL ? 
'l' : 'w'); width = browser->width - ((mb->addrlen * 2) + 4); if (width > 0) ui_browser__write_nstring(browser, sym->name, width); diff --git a/tools/perf/ui/libslang.h b/tools/perf/ui/libslang.h index 4d54b64..db81669 100644 --- a/tools/perf/ui/libslang.h +++ b/tools/perf/ui/libslang.h @@ -14,12 +14,15 @@ #if SLANG_VERSION < 20104 #define slsmg_printf(msg, args...) \ SLsmg_printf((char *)(msg), ##args) +#define slsmg_vprintf(msg, vargs) \ + SLsmg_vprintf((char *)(msg), vargs) #define slsmg_write_nstring(msg, len) \ SLsmg_write_nstring((char *)(msg), len) #define sltt_set_color(obj, name, fg, bg) \ SLtt_set_color(obj,(char *)(name), (char *)(fg), (char *)(bg)) #else #define slsmg_printf SLsmg_printf +#define slsmg_vprintf SLsmg_vprintf #define slsmg_write_nstring SLsmg_write_nstring #define sltt_set_color SLtt_set_color #endif -- cgit v0.10.2 From 63ab1749f39aeec27b0dcf71cd7996d862c5ad63 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Aug 2015 12:42:58 -0300 Subject: perf hists browser: Make ESC unzoom as well In addition to <-, which may be repurposed for horizontal scrolling. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-w3rctelxr4yxrjufx7z3fclb@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 2de53d6..f296b73 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1868,6 +1868,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, case K_RIGHT: /* menu */ break; + case K_ESC: case K_LEFT: { const void *top; @@ -1877,6 +1878,12 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, */ if (left_exits) goto out_free_stack; + + if (key == K_ESC && + ui_browser__dialog_yesno(&browser->b, + "Do you really want to exit?")) + goto out_free_stack; + continue; } top = pstack__peek(browser->pstack); @@ -1892,12 +1899,6 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, do_zoom_thread(browser, actions); continue; } - case K_ESC: - if (!left_exits && - !ui_browser__dialog_yesno(&browser->b, - "Do you really want to exit?")) - continue; - /* Fall thru */ case 'q': case CTRL('c'): goto out_free_stack; @@ -2120,15 +2121,11 @@ browse_hists: else pos = perf_evsel__prev(pos); goto browse_hists; - case K_ESC: - if (!ui_browser__dialog_yesno(&menu->b, - "Do you really want to exit?")) - continue; - /* Fall thru */ case K_SWITCH_INPUT_DATA: case 'q': case CTRL('c'): goto out; + case K_ESC: default: continue; } -- cgit v0.10.2 From 75186a9b09e47072f442f43e292cd47180b67b5c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 12 Aug 2015 10:24:07 +0900 Subject: perf probe: Fix to show lines of sys_ functions correctly "perf probe --lines sys_poll" shows only the first line of sys_poll, because the SYSCALL_DEFINE macro: ---- SYSCALL_DEFINE*(foo,...) { body; } ---- is expanded as below (in the debuginfo) ---- static inline int SYSC_foo(...) { body; } int SyS_foo(...) <- is an alias of sys_foo. { return SYSC_foo(...); } ---- So, "perf probe --lines sys_foo" decodes the SyS_foo function, and it also skips inlined functions (SYSC_foo) inside the target function because those functions are usually defined somewhere else. To fix this issue, this patch checks whether the inlined function is defined at the same point as the target function, and if so, it doesn't skip the inline function.
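Restating the new check as a sketch (the real change is in the dwarf-aux.c diff below; decl/decf hold the target function's declaration line and file):

	if (die_find_inlinefunc(rt_die, addr, &die_mem)) {
		int inl;

		dwarf_decl_line(&die_mem, &inl);
		if (inl != decl || decf != dwarf_decl_file(&die_mem))
			continue;	/* declared elsewhere: a real inline, skip it */
		/* declared at the target's own spot (the SYSC_foo case): keep it */
	}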
Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Masami Hiramatsu Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20150812012406.11811.94691.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 57f3ef4..445f455 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -734,15 +734,18 @@ int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, void *data) Dwarf_Lines *lines; Dwarf_Line *line; Dwarf_Addr addr; - const char *fname; + const char *fname, *decf = NULL; int lineno, ret = 0; + int decl = 0, inl; Dwarf_Die die_mem, *cu_die; size_t nlines, i; /* Get the CU die */ - if (dwarf_tag(rt_die) != DW_TAG_compile_unit) + if (dwarf_tag(rt_die) != DW_TAG_compile_unit) { cu_die = dwarf_diecu(rt_die, &die_mem, NULL, NULL); - else + dwarf_decl_line(rt_die, &decl); + decf = dwarf_decl_file(rt_die); + } else cu_die = rt_die; if (!cu_die) { pr_debug2("Failed to get CU from given DIE.\n"); @@ -773,9 +776,14 @@ int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, void *data) * The line is included in given function, and * no inline block includes it. */ - if (!dwarf_haspc(rt_die, addr) || - die_find_inlinefunc(rt_die, addr, &die_mem)) + if (!dwarf_haspc(rt_die, addr)) continue; + if (die_find_inlinefunc(rt_die, addr, &die_mem)) { + dwarf_decl_line(&die_mem, &inl); + if (inl != decl || + decf != dwarf_decl_file(&die_mem)) + continue; + } /* Get source line */ fname = dwarf_linesrc(line, NULL, NULL); -- cgit v0.10.2 From d457c96392bb418bd998f3ccf93e0e4c958fcd0f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 11 Aug 2015 06:30:47 -0400 Subject: perf callchain: Per-event type selection support This patchkit adds the ability to set callgraph mode (fp, dwarf, lbr) per event. This in turn can reduce sampling overhead and the size of the perf.data. Here is an example. perf record -e 'cpu/cpu-cycles,period=1000,call-graph=fp,time=1/,cpu/instructions,call-graph=lbr/' sleep 1 perf evlist -v cpu/cpu-cycles,period=1000,call-graph=fp,time=1/: type: 4, size: 112, config: 0x3c, { sample_period, sample_freq }: 1000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1 cpu/instructions,call-graph=lbr/: type: 4, size: 112, config: 0xc0, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1 Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1439289050-40510-1-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index afbe45e..7f82dec 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -53,6 +53,9 @@ OPTIONS - 'time': Disable/enable time stamping. Acceptable values are 1 for enabling time stamping. 0 for disabling time stamping. The default is 1. + - 'call-graph': Disable/enable callgraph. Acceptable str are "fp" for
+ - 'stack-size': user stack size for dwarf mode Note: If user explicitly sets options which conflict with the params, the value set by the params will be overridden. diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 04fdddd..6647925 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -588,11 +588,36 @@ perf_evsel__config_callgraph(struct perf_evsel *evsel, } } -static void apply_config_terms(struct perf_evsel *evsel) +static void +perf_evsel__reset_callgraph(struct perf_evsel *evsel, + struct callchain_param *param) +{ + struct perf_event_attr *attr = &evsel->attr; + + perf_evsel__reset_sample_bit(evsel, CALLCHAIN); + if (param->record_mode == CALLCHAIN_LBR) { + perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); + attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_USER | + PERF_SAMPLE_BRANCH_CALL_STACK); + } + if (param->record_mode == CALLCHAIN_DWARF) { + perf_evsel__reset_sample_bit(evsel, REGS_USER); + perf_evsel__reset_sample_bit(evsel, STACK_USER); + } +} + +static void apply_config_terms(struct perf_evsel *evsel, + struct record_opts *opts) { struct perf_evsel_config_term *term; struct list_head *config_terms = &evsel->config_terms; struct perf_event_attr *attr = &evsel->attr; + struct callchain_param param; + u32 dump_size = 0; + char *callgraph_buf = NULL; + + /* callgraph default */ + param.record_mode = callchain_param.record_mode; list_for_each_entry(term, config_terms, list) { switch (term->type) { @@ -610,10 +635,43 @@ static void apply_config_terms(struct perf_evsel *evsel) else perf_evsel__reset_sample_bit(evsel, TIME); break; + case PERF_EVSEL__CONFIG_TERM_CALLGRAPH: + callgraph_buf = term->val.callgraph; + break; + case PERF_EVSEL__CONFIG_TERM_STACK_USER: + dump_size = term->val.stack_user; + break; default: break; } } + + /* User explicitly set per-event callgraph, clear the old setting and reset. */ + if ((callgraph_buf != NULL) || (dump_size > 0)) { + + /* parse callgraph parameters */ + if (callgraph_buf != NULL) { + param.enabled = true; + if (parse_callchain_record(callgraph_buf, ¶m)) { + pr_err("per-event callgraph setting for %s failed. " + "Apply callgraph global setting for it\n", + evsel->name); + return; + } + } + if (dump_size > 0) { + dump_size = round_up(dump_size, sizeof(u64)); + param.dump_size = dump_size; + } + + /* If global callgraph set, clear it */ + if (callchain_param.enabled) + perf_evsel__reset_callgraph(evsel, &callchain_param); + + /* set perf-event callgraph */ + if (param.enabled) + perf_evsel__config_callgraph(evsel, opts, ¶m); + } } /* @@ -812,7 +870,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) * Apply event specific term settings, * it overloads any global configuration. 
*/ - apply_config_terms(evsel); + apply_config_terms(evsel, opts); } static int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index fdf2674..93ac6b1 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -41,6 +41,8 @@ enum { PERF_EVSEL__CONFIG_TERM_PERIOD, PERF_EVSEL__CONFIG_TERM_FREQ, PERF_EVSEL__CONFIG_TERM_TIME, + PERF_EVSEL__CONFIG_TERM_CALLGRAPH, + PERF_EVSEL__CONFIG_TERM_STACK_USER, PERF_EVSEL__CONFIG_TERM_MAX, }; @@ -51,6 +53,8 @@ struct perf_evsel_config_term { int type; union { u64 period; u64 freq; bool time; + char *callgraph; + u64 stack_user; } val; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index dbf315d..d826e6f 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -614,6 +614,12 @@ do { \ return -EINVAL; } break; + case PARSE_EVENTS__TERM_TYPE_CALLGRAPH: + CHECK_TYPE_VAL(STR); + break; + case PARSE_EVENTS__TERM_TYPE_STACKSIZE: + CHECK_TYPE_VAL(NUM); + break; case PARSE_EVENTS__TERM_TYPE_NAME: CHECK_TYPE_VAL(STR); break; @@ -668,6 +674,12 @@ do { \ case PARSE_EVENTS__TERM_TYPE_TIME: ADD_CONFIG_TERM(TIME, time, term->val.num); break; + case PARSE_EVENTS__TERM_TYPE_CALLGRAPH: + ADD_CONFIG_TERM(CALLGRAPH, callgraph, term->val.str); + break; + case PARSE_EVENTS__TERM_TYPE_STACKSIZE: + ADD_CONFIG_TERM(STACK_USER, stack_user, term->val.num); + break; default: break; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index ce2d13a..a09b0e2 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -65,6 +65,8 @@ enum { PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE, PARSE_EVENTS__TERM_TYPE_TIME, + PARSE_EVENTS__TERM_TYPE_CALLGRAPH, + PARSE_EVENTS__TERM_TYPE_STACKSIZE, }; struct parse_events_term { diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 4306f5a..936d566 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -185,6 +185,8 @@ period { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD); } freq { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ); } branch_type { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE); } time { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_TIME); } +call-graph { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CALLGRAPH); } +stack-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_STACKSIZE); } , { return ','; } "/" { BEGIN(INITIAL); return '/'; } {name_minus} { return str(yyscanner, PE_NAME); } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index d85f11b..84cad05 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -634,7 +634,9 @@ static char *formats_error_string(struct list_head *formats) { struct perf_pmu_format *format; char *err, *str; - static const char *static_terms = "config,config1,config2,name,period,freq,branch_type,time\n"; + static const char *static_terms = "config,config1,config2,name," + "period,freq,branch_type,time," + "call-graph,stack-size\n"; unsigned i = 0; if (!asprintf(&str, "valid terms:")) -- cgit v0.10.2 From f9db0d0f1b2cf030083c83d3ed3a4bbae6bdc8b7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 11 Aug 2015 06:30:48 -0400 Subject: perf callchain: Allow disabling call graphs per event This patch introduces "call-graph=no" to disable per-event callgraph. Here is an example.
perf record -e 'cpu/cpu-cycles,call-graph=fp/,cpu/instructions,call-graph=no/' sleep 1 perf report --stdio # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 6 of event 'cpu/cpu-cycles,call-graph=fp/' # Event count (approx.): 774218 # # Children Self Command Shared Object Symbol # ........ ........ ....... ................ ........................................ # 61.94% 0.00% sleep [kernel.vmlinux] [k] entry_SYSCALL_64_fastpath | ---entry_SYSCALL_64_fastpath | |--97.30%-- __brk | --2.70%-- mmap64 _dl_check_map_versions _dl_check_all_versions 61.94% 0.00% sleep [kernel.vmlinux] [k] perf_event_mmap | ---perf_event_mmap | |--97.30%-- do_brk | sys_brk | entry_SYSCALL_64_fastpath | __brk | --2.70%-- mmap_region do_mmap_pgoff vm_mmap_pgoff sys_mmap_pgoff sys_mmap entry_SYSCALL_64_fastpath mmap64 _dl_check_map_versions _dl_check_all_versions ...... # Samples: 6 of event 'cpu/instructions,call-graph=no/' # Event count (approx.): 359692 # # Children Self Command Shared Object Symbol # ........ ........ ....... ................ ................................. # 89.03% 0.00% sleep [unknown] [.] 0xffff6598ffff6598 89.03% 0.00% sleep ld-2.17.so [.] _dl_resolve_conflicts 89.03% 0.00% sleep [kernel.vmlinux] [k] page_fault Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1439289050-40510-2-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 7f82dec..347a273 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -54,7 +54,8 @@ OPTIONS enabling time stamping. 0 for disabling time stamping. The default is 1. - 'call-graph': Disable/enable callgraph. Acceptable str are "fp" for - FP mode, "dwarf" for DWARF mode, "lbr" for LBR mode. + FP mode, "dwarf" for DWARF mode, "lbr" for LBR mode and + "no" for disable callgraph. - 'stack-size': user stack size for dwarf mode Note: If user explicitly sets options which conflict with the params, the value set by the params will be overridden. 
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 467a23b..a32a64e 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -239,6 +239,8 @@ static int __cmd_annotate(struct perf_annotate *ann) if (nr_samples > 0) { total_nr_samples += nr_samples; hists__collapse_resort(hists, NULL); + /* Don't sort callchain */ + perf_evsel__reset_sample_bit(pos, CALLCHAIN); hists__output_resort(hists, NULL); if (symbol_conf.event_group && diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index daaa7dc..0b180a8 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -722,6 +722,9 @@ static void data_process(void) if (verbose || data__files_cnt > 2) data__fprintf(); + /* Don't sort callchain for perf diff */ + perf_evsel__reset_sample_bit(evsel_base, CALLCHAIN); + hists__process(hists_base); } } diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 7d82c8b..7ed7370 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -279,6 +279,7 @@ static int test1(struct perf_evsel *evsel, struct machine *machine) symbol_conf.use_callchain = false; symbol_conf.cumulate_callchain = false; + perf_evsel__reset_sample_bit(evsel, CALLCHAIN); setup_sorting(); callchain_register_param(&callchain_param); @@ -425,6 +426,7 @@ static int test2(struct perf_evsel *evsel, struct machine *machine) symbol_conf.use_callchain = true; symbol_conf.cumulate_callchain = false; + perf_evsel__set_sample_bit(evsel, CALLCHAIN); setup_sorting(); callchain_register_param(&callchain_param); @@ -482,6 +484,7 @@ static int test3(struct perf_evsel *evsel, struct machine *machine) symbol_conf.use_callchain = false; symbol_conf.cumulate_callchain = true; + perf_evsel__reset_sample_bit(evsel, CALLCHAIN); setup_sorting(); callchain_register_param(&callchain_param); @@ -665,6 +668,7 @@ static int test4(struct perf_evsel *evsel, struct machine *machine) symbol_conf.use_callchain = true; symbol_conf.cumulate_callchain = true; + perf_evsel__set_sample_bit(evsel, CALLCHAIN); setup_sorting(); callchain_register_param(&callchain_param); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 6647925..b096ef7 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -651,12 +651,17 @@ static void apply_config_terms(struct perf_evsel *evsel, /* parse callgraph parameters */ if (callgraph_buf != NULL) { - param.enabled = true; - if (parse_callchain_record(callgraph_buf, ¶m)) { - pr_err("per-event callgraph setting for %s failed. " - "Apply callgraph global setting for it\n", - evsel->name); - return; + if (!strcmp(callgraph_buf, "no")) { + param.enabled = false; + param.record_mode = CALLCHAIN_NONE; + } else { + param.enabled = true; + if (parse_callchain_record(callgraph_buf, ¶m)) { + pr_err("per-event callgraph setting for %s failed. 
" + "Apply callgraph global setting for it\n", + evsel->name); + return; + } } } if (dump_size > 0) { diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 6bccfae..1cd785b 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1109,13 +1109,14 @@ void hists__inc_stats(struct hists *hists, struct hist_entry *h) static void __hists__insert_output_entry(struct rb_root *entries, struct hist_entry *he, - u64 min_callchain_hits) + u64 min_callchain_hits, + bool use_callchain) { struct rb_node **p = &entries->rb_node; struct rb_node *parent = NULL; struct hist_entry *iter; - if (symbol_conf.use_callchain) + if (use_callchain) callchain_param.sort(&he->sorted_chain, he->callchain, min_callchain_hits, &callchain_param); @@ -1139,6 +1140,8 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog) struct rb_node *next; struct hist_entry *n; u64 min_callchain_hits; + struct perf_evsel *evsel = hists_to_evsel(hists); + bool use_callchain = evsel ? (evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) : symbol_conf.use_callchain; min_callchain_hits = hists->stats.total_period * (callchain_param.min_percent / 100); @@ -1157,7 +1160,7 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog) n = rb_entry(next, struct hist_entry, rb_node_in); next = rb_next(&n->rb_node_in); - __hists__insert_output_entry(&hists->entries, n, min_callchain_hits); + __hists__insert_output_entry(&hists->entries, n, min_callchain_hits, use_callchain); hists__inc_stats(hists, n); if (!n->filtered) -- cgit v0.10.2 From 9e207ddfa20781e56465ce9a537f0a377c9d34fb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 11 Aug 2015 06:30:49 -0400 Subject: perf report: Show call graph from reference events Introduce --show-ref-call-graph for perf report to print reference callgraph for no callgraph event. Here is an example. perf report --show-ref-call-graph --stdio # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 5 of event 'cpu/cpu-cycles,call-graph=fp/' # Event count (approx.): 144985 # # Children Self Command Shared Object Symbol # ........ ........ ....... ................ ........................................ # 72.30% 0.00% sleep [kernel.vmlinux] [k] entry_SYSCALL_64_fastpath | ---entry_SYSCALL_64_fastpath | |--22.62%-- __GI___libc_nanosleep --77.38%-- [...] ...... # Samples: 6 of event 'cpu/instructions,call-graph=no/', show reference callgraph # Event count (approx.): 172780 # # Children Self Command Shared Object Symbol # ........ ........ ....... ................ ........................................ # 73.16% 0.00% sleep [kernel.vmlinux] [k] entry_SYSCALL_64_fastpath | ---entry_SYSCALL_64_fastpath | |--31.44%-- __GI___libc_nanosleep --68.56%-- [...] Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1439289050-40510-3-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 7b07d19..a18ba75 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -359,6 +359,17 @@ OPTIONS --full-source-path:: Show the full path for source files for srcline output. +--show-ref-call-graph:: + When multiple events are sampled, it may not be needed to collect + callgraphs for all of them. 
The sample sites are usually nearby, + and it's enough to collect the callgraphs on a reference event. + So the user can use the "call-graph=no" event modifier to disable + callgraph collection for the other events and reduce the overhead. + However, perf report cannot show callgraphs for the events that have + callgraph collection disabled. + This option extends perf report to show the reference callgraph, + collected by the reference event, for the events without callgraphs. + include::callchain-overhead-calculation.txt[] SEE ALSO diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/builtin-report.c index f301e86..62b285e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -316,6 +316,11 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report if (evname != NULL) ret += fprintf(fp, " of event '%s'", evname); + if (symbol_conf.show_ref_callgraph && + strstr(evname, "call-graph=no")) { + ret += fprintf(fp, ", show reference callgraph"); + } + if (rep->mem_mode) { ret += fprintf(fp, "\n# Total weight : %" PRIu64, nr_events); ret += fprintf(fp, "\n# Sort order : %s", sort_order ? : default_mem_sort_order); @@ -740,6 +745,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) itrace_parse_synth_opts), OPT_BOOLEAN(0, "full-source-path", &srcline_full_filename, "Show full source file name path for source lines"), + OPT_BOOLEAN(0, "show-ref-call-graph", &symbol_conf.show_ref_callgraph, + "Show callgraph from reference event"), OPT_END() }; struct perf_data_file file = { diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index f296b73..10c7ec0 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1267,6 +1267,8 @@ static int hists__browser_title(struct hists *hists, const char *ev_name = perf_evsel__name(evsel); char buf[512]; size_t buflen = sizeof(buf); + char ref[30] = " show reference callgraph, "; + bool enable_ref = false; if (symbol_conf.filter_relative) { nr_samples = hists->stats.nr_non_filtered_samples; @@ -1292,10 +1294,13 @@ static int hists__browser_title(struct hists *hists, } } + if (symbol_conf.show_ref_callgraph && + strstr(ev_name, "call-graph=no")) + enable_ref = true; nr_samples = convert_unit(nr_samples, &unit); printed = scnprintf(bf, size, - "Samples: %lu%c of event '%s', Event count (approx.): %" PRIu64, - nr_samples, unit, ev_name, nr_events); + "Samples: %lu%c of event '%s',%sEvent count (approx.): %" PRIu64, + nr_samples, unit, ev_name, enable_ref ? ref : " ", nr_events); if (hists->uid_filter_str) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 1cd785b..08b6cd9 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1141,7 +1141,12 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog) struct hist_entry *n; u64 min_callchain_hits; struct perf_evsel *evsel = hists_to_evsel(hists); - bool use_callchain = evsel ?
(evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) : symbol_conf.use_callchain; + bool use_callchain; + + if (evsel && !symbol_conf.show_ref_callgraph) + use_callchain = evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN; + else + use_callchain = symbol_conf.use_callchain; min_callchain_hits = hists->stats.total_period * (callchain_param.min_percent / 100); diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index b98ce51..a4cde92 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -106,7 +106,8 @@ struct symbol_conf { filter_relative, show_hist_headers, branch_callstack, - has_filter; + has_filter, + show_ref_callgraph; const char *vmlinux_name, *kallsyms_name, *source_prefix, -- cgit v0.10.2 From 71ef150ee06df29c5b427307dc0bacfe06a8baea Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 11 Aug 2015 06:30:50 -0400 Subject: perf tests: Add tests to callgraph and time parse Add tests in tests/parse-events.c to check the call-graph and time options. Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1439289050-40510-4-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index f65bb89..9b6b2b63 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -479,6 +479,39 @@ static int test__checkevent_pmu_name(struct perf_evlist *evlist) return 0; } +static int test__checkevent_pmu_partial_time_callgraph(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__first(evlist); + + /* cpu/config=1,call-graph=fp,time,period=100000/ */ + TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->nr_entries); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", 1 == evsel->attr.config); + /* + * The period, time and callgraph values get configured + * within perf_evlist__config, + * while this test only exercises the parse events method. + */ + TEST_ASSERT_VAL("wrong period", 0 == evsel->attr.sample_period); + TEST_ASSERT_VAL("wrong callgraph", !(PERF_SAMPLE_CALLCHAIN & evsel->attr.sample_type)); + TEST_ASSERT_VAL("wrong time", !(PERF_SAMPLE_TIME & evsel->attr.sample_type)); + + /* cpu/config=2,call-graph=no,time=0,period=2000/ */ + evsel = perf_evsel__next(evsel); + TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->attr.type); + TEST_ASSERT_VAL("wrong config", 2 == evsel->attr.config); + /* + * The period, time and callgraph values get configured + * within perf_evlist__config, + * while this test only exercises the parse events method.
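+ * (so the asserts below can only check that these sample_type
+ * bits are still unset after parsing alone)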
+ */ + TEST_ASSERT_VAL("wrong period", 0 == evsel->attr.sample_period); + TEST_ASSERT_VAL("wrong callgraph", !(PERF_SAMPLE_CALLCHAIN & evsel->attr.sample_type)); + TEST_ASSERT_VAL("wrong time", !(PERF_SAMPLE_TIME & evsel->attr.sample_type)); + + return 0; +} + static int test__checkevent_pmu_events(struct perf_evlist *evlist) { struct perf_evsel *evsel = perf_evlist__first(evlist); @@ -1555,6 +1588,11 @@ static struct evlist_test test__events_pmu[] = { .check = test__checkevent_pmu_name, .id = 1, }, + { + .name = "cpu/config=1,call-graph=fp,time,period=100000/,cpu/config=2,call-graph=no,time=0,period=2000/", + .check = test__checkevent_pmu_partial_time_callgraph, + .id = 2, + }, }; struct terms_test { -- cgit v0.10.2 From 7aec51cbf0646cc15d719b08caea931576dc7f2a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 12 Aug 2015 15:48:37 -0700 Subject: perf tools: Support static linking with libdw The Fedora 22 version of libdw requires a couple of extra libraries to link. With a dynamic link the dependencies are pulled in automatically, but this doesn't work for static linking. Add the needed libraries explicitly to the feature probe and the Makefile. v2: Explicitly check for static linking and only add the dependencies when -static is set. This is to avoid regressions on Arnaldo's system. Signed-off-by: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1439419717-20601-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 1c0d69f..74ca420 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -70,8 +70,13 @@ test-libelf.bin: test-glibc.bin: $(BUILD) +DWARFLIBS := -ldw +ifeq ($(findstring -static,${LDFLAGS}),-static) +DWARFLIBS += -lelf -lebl -lz -llzma -lbz2 +endif + test-dwarf.bin: - $(BUILD) -ldw + $(BUILD) $(DWARFLIBS) test-libelf-mmap.bin: $(BUILD) -lelf diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 823195a..827557f 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -297,7 +297,11 @@ ifndef NO_LIBELF else CFLAGS += -DHAVE_DWARF_SUPPORT $(LIBDW_CFLAGS) LDFLAGS += $(LIBDW_LDFLAGS) - EXTLIBS += -ldw + DWARFLIBS := -ldw + ifeq ($(findstring -static,${LDFLAGS}),-static) + DWARFLIBS += -lelf -lebl -lz -llzma -lbz2 + endif + EXTLIBS += ${DWARFLIBS} $(call detected,CONFIG_DWARF) endif # PERF_HAVE_DWARF_REGS endif # NO_DWARF -- cgit v0.10.2 From 86a76027457633488b0a83d5e2bb944159885605 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2015 06:55:41 +0900 Subject: perf probe: Fix to add missed brace around if block Commit 75186a9b09e4 ("perf probe: Fix to show lines of sys_ functions correctly") introduced a bug by missing a brace around an if block. This fixes it by adding the missing brace.
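To illustrate the bug class being fixed (a generic, self-contained sketch with made-up names, not the perf code): an unbraced if guards only the single statement that follows it, so any further statements indented underneath still run unconditionally:

	#include <stdio.h>

	int main(void)
	{
		int filtering = 0, addr = 5;

		if (filtering)
			if (addr < 10)
				printf("filtered\n");
			printf("indented as if guarded, but always runs\n"); /* the bug */

		return 0;
	}

In die_walk_lines() the if was meant to guard the whole address-filtering block, hence the braces added below.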
Signed-off-by: Masami Hiramatsu Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Fixes: 75186a9b09e4 ("perf probe: Fix to show lines of sys_ functions correctly") Link: http://lkml.kernel.org/r/20150812215541.9088.62425.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 445f455..a509aa84 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -770,7 +770,7 @@ int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, void *data) continue; } /* Filter lines based on address */ - if (rt_die != cu_die) + if (rt_die != cu_die) { /* * Address filtering * The line is included in given function, and @@ -784,6 +784,7 @@ int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, void *data) decf != dwarf_decl_file(&die_mem)) continue; } + } /* Get source line */ fname = dwarf_linesrc(line, NULL, NULL); -- cgit v0.10.2 From 7f4f800131a281a1e1738c0bc45659c1260dc96a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Aug 2015 13:16:27 -0300 Subject: perf trace: Move vfs_getname storage to per thread area We were storing the vfs_getname payload (i.e. ptr->string) into the trace wide storage area (struct trace), so that we could use the last payload when setting up the fd->pathname per thread tables, oops, not a good idea for multi cpu tracing sessions... Fix it by moving it to the per thread area (struct thread_trace). Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-3j05ttqyaem7kh7oubvr1keo@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 489cc11..2f1162d 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1315,7 +1315,10 @@ struct thread_trace { double runtime_ms; struct { unsigned long ptr; - int entry_str_pos; + short int entry_str_pos; + bool pending_open; + unsigned int namelen; + char *name; } filename; struct { int max; @@ -1391,7 +1394,6 @@ struct trace { size_t nr; int *entries; } ev_qualifier_ids; - const char *last_vfs_getname; struct intlist *tid_list; struct intlist *pid_list; struct { @@ -1966,8 +1968,11 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output); fprintf(trace->output, "%-70s\n", ttrace->entry_str); } - } else + } else { ttrace->entry_pending = true; + /* See trace__vfs_getname & trace__sys_exit */ + ttrace->filename.pending_open = false; + } if (trace->current != thread) { thread__put(trace->current); @@ -2003,9 +2008,9 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, ret = perf_evsel__sc_tp_uint(evsel, ret, sample); - if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) { - trace__set_fd_pathname(thread, ret, trace->last_vfs_getname); - trace->last_vfs_getname = NULL; + if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) { + trace__set_fd_pathname(thread, ret, ttrace->filename.name); + ttrace->filename.pending_open = false; ++trace->stats.vfs_getname; } @@ -2065,9 +2070,7 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel, size_t filename_len, entry_str_len, to_move; ssize_t remaining_space; char *pos; - const char *filename; - - trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname"); + const char 
*filename = perf_evsel__rawptr(evsel, sample, "pathname"); if (!thread) goto out; @@ -2076,6 +2079,21 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel, if (!ttrace) goto out; + filename_len = strlen(filename); + + if (ttrace->filename.namelen < filename_len) { + char *f = realloc(ttrace->filename.name, filename_len + 1); + + if (f == NULL) + goto out; + + ttrace->filename.namelen = filename_len; + ttrace->filename.name = f; + } + + strcpy(ttrace->filename.name, filename); + ttrace->filename.pending_open = true; + if (!ttrace->filename.ptr) goto out; @@ -2084,8 +2102,6 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel, if (remaining_space <= 0) goto out; - filename = trace->last_vfs_getname; - filename_len = strlen(filename); if (filename_len > (size_t)remaining_space) { filename += filename_len - remaining_space; filename_len = remaining_space; -- cgit v0.10.2 From 7322d6c98dd214252bd697f8dde64a3576977fab Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 13 Aug 2015 09:17:24 +0200 Subject: perf script: Initialize callchain_param.record_mode Milian Wolff reported non functional DWARF unwind under perf script. The reason is that perf script does not properly configure callchain_param.record_mode, which is needed by unwind code. Stealing the code from report and leaving the place for more initialization code in a hope we could merge it with report__setup_sample_type one day. Reported-by: Milian Wolff Signed-off-by: Jiri Olsa Tested-by: Milian Wolff Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150813071724.GA21322@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 7b376d2..105332e 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1561,6 +1561,22 @@ static int have_cmd(int argc, const char **argv) return 0; } +static void script__setup_sample_type(struct perf_script *script) +{ + struct perf_session *session = script->session; + u64 sample_type = perf_evlist__combined_sample_type(session->evlist); + + if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain) { + if ((sample_type & PERF_SAMPLE_REGS_USER) && + (sample_type & PERF_SAMPLE_STACK_USER)) + callchain_param.record_mode = CALLCHAIN_DWARF; + else if (sample_type & PERF_SAMPLE_BRANCH_STACK) + callchain_param.record_mode = CALLCHAIN_LBR; + else + callchain_param.record_mode = CALLCHAIN_FP; + } +} + int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) { bool show_full_info = false; @@ -1849,6 +1865,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) goto out_delete; script.session = session; + script__setup_sample_type(&script); session->itrace_synth_opts = &itrace_synth_opts; -- cgit v0.10.2 From 3d7245b0946c13a97e3a10213b8fd0152972a0d5 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 14 Aug 2015 10:11:34 +0300 Subject: perf annotate: Fix 32-bit compilation error in util/annotate.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following 32-bit compilation errors: util/annotate.c: In function ‘addr_map_symbol__account_cycles’: util/annotate.c:643:3: error: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 4 has type ‘u64’ [-Werror=format=] pr_debug2("BB with bad start: addr %lx start %lx sym %lx saddr %lx\n", ^ util/annotate.c:643:3: error: format ‘%lx’ expects argument of type ‘long 
unsigned int’, but argument 5 has type ‘u64’ [-Werror=format=] util/annotate.c:643:3: error: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 6 has type ‘u64’ [-Werror=format=] These were introduced by the patch: "perf report: Add infrastructure for a cycles histogram" Also change the 'saddr' variable from 'unsigned long' to 'u64' noting that theoretically we could be processing data captured on a 64-bit machine but processing it on a 32-bit machine. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Fixes: d4957633bf9d ("perf report: Add infrastructure for a cycles histogram") Link: http://lkml.kernel.org/r/1439536294-18241-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index e0b6146..8a18347 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -621,7 +621,7 @@ int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, struct addr_map_symbol *start, unsigned cycles) { - unsigned long saddr = 0; + u64 saddr = 0; int err; if (!cycles) @@ -640,7 +640,7 @@ int addr_map_symbol__account_cycles(struct addr_map_symbol *ams, start->addr == ams->sym->start + ams->map->start))) saddr = start->al_addr; if (saddr == 0) - pr_debug2("BB with bad start: addr %lx start %lx sym %lx saddr %lx\n", + pr_debug2("BB with bad start: addr %"PRIx64" start %"PRIx64" sym %"PRIx64" saddr %"PRIx64"\n", ams->addr, start ? start->addr : 0, ams->sym ? ams->sym->start + ams->map->start : 0, -- cgit v0.10.2 From f0ee3b467ab7a74ffce75190259eb59dbf30ecad Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 14 Aug 2015 15:50:06 +0300 Subject: perf symbols: Fix annotation of vdso Older kernels attempt to prelink vdso to its virtual address. To permit annotation using objdump, the map__rip_2objdump() calculation must result in that same address which we can infer from the start and offset of the text section. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: Will Deacon Link: http://lkml.kernel.org/r/1439556606-11297-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index b0ad810..53bb5f5 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -875,6 +875,17 @@ int dso__load_sym(struct dso *dso, struct map *map, } } + /* + * Handle any relocation of vdso necessary because older kernels + * attempted to prelink vdso to its virtual address. + */ + if (dso__is_vdso(dso)) { + GElf_Shdr tshdr; + + if (elf_section_by_name(elf, &ehdr, &tshdr, ".text", NULL)) + map->reloc = map->start - tshdr.sh_addr + tshdr.sh_offset; + } + dso->adjust_symbols = runtime_ss->adjust_symbols || ref_reloc(kmap); /* * Initial kernel and module mappings do not map to the dso. For -- cgit v0.10.2 From 835095653ebfe4b16596a30db1c4e7c414014b5e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 13 Aug 2015 12:40:56 +0300 Subject: perf tools: Add a helper function to probe whether cpu-wide tracing is possible Add a helper function to probe whether cpu-wide tracing is possible. 
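As a usage sketch (hypothetical caller, not part of this patch): a tool that wants system-wide tracing can probe first and degrade gracefully, e.g. when /proc/sys/kernel/perf_event_paranoid denies cpu-wide events to unprivileged users:

	/* Hypothetical fallback, not from this patch. */
	if (opts->target.system_wide && !perf_can_record_cpu_wide()) {
		pr_debug("cpu-wide tracing not permitted, falling back to per-task tracing\n");
		opts->target.system_wide = false;
		opts->target.per_thread = true;
	}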
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1439458857-30636-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 3977570..436e358 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -115,6 +115,7 @@ void perf_evlist__close(struct perf_evlist *evlist); void perf_evlist__set_id_pos(struct perf_evlist *evlist); bool perf_can_sample_identifier(void); bool perf_can_record_switch_events(void); +bool perf_can_record_cpu_wide(void); void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts); int record_opts__config(struct record_opts *opts); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 0d228a2..0467367 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -105,6 +105,30 @@ bool perf_can_record_switch_events(void) return perf_probe_api(perf_probe_context_switch); } +bool perf_can_record_cpu_wide(void) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + .exclude_kernel = 1, + }; + struct cpu_map *cpus; + int cpu, fd; + + cpus = cpu_map__new(NULL); + if (!cpus) + return false; + cpu = cpus->map[0]; + cpu_map__put(cpus); + + fd = sys_perf_event_open(&attr, -1, cpu, -1, 0); + if (fd < 0) + return false; + close(fd); + + return true; +} + void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts) { struct perf_evsel *evsel; -- cgit v0.10.2 From 55ea4ab4260f42b824450faa47fe4d129fce0918 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:36 +0300 Subject: perf auxtrace: Add Intel PT as an AUX area tracing type Add the Intel Processor Trace type constant PERF_AUXTRACE_INTEL_PT. Signed-off-by: Adrian Hunter Acked-by: Jiri Olsa Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index a25b360..49dbfbe 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -884,6 +884,7 @@ int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused, fprintf(stdout, " type: %u\n", type); switch (type) { + case PERF_AUXTRACE_INTEL_PT: case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 471aecb..7d12f33 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -39,6 +39,7 @@ struct events_stats; enum auxtrace_type { PERF_AUXTRACE_UNKNOWN, + PERF_AUXTRACE_INTEL_PT, }; enum itrace_period_type { -- cgit v0.10.2 From a4e925905c98fb83538c164878946d77d0df1433 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:37 +0300 Subject: perf tools: Add Intel PT packet decoder Add support for decoding Intel Processor Trace packets. This essentially provides intel_pt_get_packet() which takes a buffer of binary data and returns the decoded packet. 
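A minimal consumer sketch built only on the API added below (the dump helper itself is made up): intel_pt_get_packet() returns the number of bytes consumed, or INTEL_PT_NEED_MORE_BYTES / INTEL_PT_BAD_PACKET on error, so a buffer can be walked packet by packet:

	#include <stdio.h>
	#include "intel-pt-pkt-decoder.h"

	/* Hypothetical helper: print one line per decoded packet. */
	static void intel_pt_dump_buf(const unsigned char *buf, size_t len)
	{
		struct intel_pt_pkt packet;
		char desc[INTEL_PT_PKT_DESC_MAX];
		int ret;

		while (len) {
			ret = intel_pt_get_packet(buf, len, &packet);
			if (ret <= 0)	/* need more bytes, or a bad packet */
				break;
			if (intel_pt_pkt_desc(&packet, desc, INTEL_PT_PKT_DESC_MAX) > 0)
				printf("%s\n", desc);
			buf += ret;
			len -= ret;
		}
	}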
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-3-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 1ce0adc..615ca12 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -78,6 +78,7 @@ libperf-$(CONFIG_X86) += tsc.o libperf-y += cloexec.o libperf-y += thread-stack.o libperf-$(CONFIG_AUXTRACE) += auxtrace.o +libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ libperf-y += parse-branch-options.o libperf-$(CONFIG_LIBELF) += symbol-elf.o diff --git a/tools/perf/util/intel-pt-decoder/Build b/tools/perf/util/intel-pt-decoder/Build new file mode 100644 index 0000000..9d67381 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/Build @@ -0,0 +1 @@ +libperf-$(CONFIG_AUXTRACE) += intel-pt-pkt-decoder.o diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c new file mode 100644 index 0000000..988c82c --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c @@ -0,0 +1,400 @@ +/* + * intel_pt_pkt_decoder.c: Intel Processor Trace support + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include +#include +#include +#include + +#include "intel-pt-pkt-decoder.h" + +#define BIT(n) (1 << (n)) + +#define BIT63 ((uint64_t)1 << 63) + +#if __BYTE_ORDER == __BIG_ENDIAN +#define le16_to_cpu bswap_16 +#define le32_to_cpu bswap_32 +#define le64_to_cpu bswap_64 +#define memcpy_le64(d, s, n) do { \ + memcpy((d), (s), (n)); \ + *(d) = le64_to_cpu(*(d)); \ +} while (0) +#else +#define le16_to_cpu +#define le32_to_cpu +#define le64_to_cpu +#define memcpy_le64 memcpy +#endif + +static const char * const packet_name[] = { + [INTEL_PT_BAD] = "Bad Packet!", + [INTEL_PT_PAD] = "PAD", + [INTEL_PT_TNT] = "TNT", + [INTEL_PT_TIP_PGD] = "TIP.PGD", + [INTEL_PT_TIP_PGE] = "TIP.PGE", + [INTEL_PT_TSC] = "TSC", + [INTEL_PT_MODE_EXEC] = "MODE.Exec", + [INTEL_PT_MODE_TSX] = "MODE.TSX", + [INTEL_PT_TIP] = "TIP", + [INTEL_PT_FUP] = "FUP", + [INTEL_PT_PSB] = "PSB", + [INTEL_PT_PSBEND] = "PSBEND", + [INTEL_PT_CBR] = "CBR", + [INTEL_PT_PIP] = "PIP", + [INTEL_PT_OVF] = "OVF", +}; + +const char *intel_pt_pkt_name(enum intel_pt_pkt_type type) +{ + return packet_name[type]; +} + +static int intel_pt_get_long_tnt(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + uint64_t payload; + int count; + + if (len < 8) + return INTEL_PT_NEED_MORE_BYTES; + + payload = le64_to_cpu(*(uint64_t *)buf); + + for (count = 47; count; count--) { + if (payload & BIT63) + break; + payload <<= 1; + } + + packet->type = INTEL_PT_TNT; + packet->count = count; + packet->payload = payload << 1; + return 8; +} + +static int intel_pt_get_pip(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + uint64_t payload = 0; + + if (len < 8) + return INTEL_PT_NEED_MORE_BYTES; + + packet->type = INTEL_PT_PIP; + memcpy_le64(&payload, buf + 2, 6); + packet->payload = payload >> 1; + + return 8; +} + +static int intel_pt_get_cbr(const unsigned char 
*buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 4) + return INTEL_PT_NEED_MORE_BYTES; + packet->type = INTEL_PT_CBR; + packet->payload = buf[2]; + return 4; +} + +static int intel_pt_get_ovf(struct intel_pt_pkt *packet) +{ + packet->type = INTEL_PT_OVF; + return 2; +} + +static int intel_pt_get_psb(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + int i; + + if (len < 16) + return INTEL_PT_NEED_MORE_BYTES; + + for (i = 2; i < 16; i += 2) { + if (buf[i] != 2 || buf[i + 1] != 0x82) + return INTEL_PT_BAD_PACKET; + } + + packet->type = INTEL_PT_PSB; + return 16; +} + +static int intel_pt_get_psbend(struct intel_pt_pkt *packet) +{ + packet->type = INTEL_PT_PSBEND; + return 2; +} + +static int intel_pt_get_pad(struct intel_pt_pkt *packet) +{ + packet->type = INTEL_PT_PAD; + return 1; +} + +static int intel_pt_get_ext(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 2) + return INTEL_PT_NEED_MORE_BYTES; + + switch (buf[1]) { + case 0xa3: /* Long TNT */ + return intel_pt_get_long_tnt(buf, len, packet); + case 0x43: /* PIP */ + return intel_pt_get_pip(buf, len, packet); + case 0x03: /* CBR */ + return intel_pt_get_cbr(buf, len, packet); + case 0xf3: /* OVF */ + return intel_pt_get_ovf(packet); + case 0x82: /* PSB */ + return intel_pt_get_psb(buf, len, packet); + case 0x23: /* PSBEND */ + return intel_pt_get_psbend(packet); + default: + return INTEL_PT_BAD_PACKET; + } +} + +static int intel_pt_get_short_tnt(unsigned int byte, + struct intel_pt_pkt *packet) +{ + int count; + + for (count = 6; count; count--) { + if (byte & BIT(7)) + break; + byte <<= 1; + } + + packet->type = INTEL_PT_TNT; + packet->count = count; + packet->payload = (uint64_t)byte << 57; + + return 1; +} + +static int intel_pt_get_ip(enum intel_pt_pkt_type type, unsigned int byte, + const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + switch (byte >> 5) { + case 0: + packet->count = 0; + break; + case 1: + if (len < 3) + return INTEL_PT_NEED_MORE_BYTES; + packet->count = 2; + packet->payload = le16_to_cpu(*(uint16_t *)(buf + 1)); + break; + case 2: + if (len < 5) + return INTEL_PT_NEED_MORE_BYTES; + packet->count = 4; + packet->payload = le32_to_cpu(*(uint32_t *)(buf + 1)); + break; + case 3: + case 6: + if (len < 7) + return INTEL_PT_NEED_MORE_BYTES; + packet->count = 6; + memcpy_le64(&packet->payload, buf + 1, 6); + break; + default: + return INTEL_PT_BAD_PACKET; + } + + packet->type = type; + + return packet->count + 1; +} + +static int intel_pt_get_mode(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 2) + return INTEL_PT_NEED_MORE_BYTES; + + switch (buf[1] >> 5) { + case 0: + packet->type = INTEL_PT_MODE_EXEC; + switch (buf[1] & 3) { + case 0: + packet->payload = 16; + break; + case 1: + packet->payload = 64; + break; + case 2: + packet->payload = 32; + break; + default: + return INTEL_PT_BAD_PACKET; + } + break; + case 1: + packet->type = INTEL_PT_MODE_TSX; + if ((buf[1] & 3) == 3) + return INTEL_PT_BAD_PACKET; + packet->payload = buf[1] & 3; + break; + default: + return INTEL_PT_BAD_PACKET; + } + + return 2; +} + +static int intel_pt_get_tsc(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 8) + return INTEL_PT_NEED_MORE_BYTES; + packet->type = INTEL_PT_TSC; + memcpy_le64(&packet->payload, buf + 1, 7); + return 8; +} + +static int intel_pt_do_get_packet(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + unsigned int byte; + + memset(packet, 
0, sizeof(struct intel_pt_pkt)); + + if (!len) + return INTEL_PT_NEED_MORE_BYTES; + + byte = buf[0]; + if (!(byte & BIT(0))) { + if (byte == 0) + return intel_pt_get_pad(packet); + if (byte == 2) + return intel_pt_get_ext(buf, len, packet); + return intel_pt_get_short_tnt(byte, packet); + } + + switch (byte & 0x1f) { + case 0x0D: + return intel_pt_get_ip(INTEL_PT_TIP, byte, buf, len, packet); + case 0x11: + return intel_pt_get_ip(INTEL_PT_TIP_PGE, byte, buf, len, + packet); + case 0x01: + return intel_pt_get_ip(INTEL_PT_TIP_PGD, byte, buf, len, + packet); + case 0x1D: + return intel_pt_get_ip(INTEL_PT_FUP, byte, buf, len, packet); + case 0x19: + switch (byte) { + case 0x99: + return intel_pt_get_mode(buf, len, packet); + case 0x19: + return intel_pt_get_tsc(buf, len, packet); + default: + return INTEL_PT_BAD_PACKET; + } + default: + return INTEL_PT_BAD_PACKET; + } +} + +int intel_pt_get_packet(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + int ret; + + ret = intel_pt_do_get_packet(buf, len, packet); + if (ret > 0) { + while (ret < 8 && len > (size_t)ret && !buf[ret]) + ret += 1; + } + return ret; +} + +int intel_pt_pkt_desc(const struct intel_pt_pkt *packet, char *buf, + size_t buf_len) +{ + int ret, i; + unsigned long long payload = packet->payload; + const char *name = intel_pt_pkt_name(packet->type); + + switch (packet->type) { + case INTEL_PT_BAD: + case INTEL_PT_PAD: + case INTEL_PT_PSB: + case INTEL_PT_PSBEND: + case INTEL_PT_OVF: + return snprintf(buf, buf_len, "%s", name); + case INTEL_PT_TNT: { + size_t blen = buf_len; + + ret = snprintf(buf, blen, "%s ", name); + if (ret < 0) + return ret; + buf += ret; + blen -= ret; + for (i = 0; i < packet->count; i++) { + if (payload & BIT63) + ret = snprintf(buf, blen, "T"); + else + ret = snprintf(buf, blen, "N"); + if (ret < 0) + return ret; + buf += ret; + blen -= ret; + payload <<= 1; + } + ret = snprintf(buf, blen, " (%d)", packet->count); + if (ret < 0) + return ret; + blen -= ret; + return buf_len - blen; + } + case INTEL_PT_TIP_PGD: + case INTEL_PT_TIP_PGE: + case INTEL_PT_TIP: + case INTEL_PT_FUP: + if (!(packet->count)) + return snprintf(buf, buf_len, "%s no ip", name); + case INTEL_PT_CBR: + return snprintf(buf, buf_len, "%s 0x%llx", name, payload); + case INTEL_PT_TSC: + if (packet->count) + return snprintf(buf, buf_len, + "%s 0x%llx CTC 0x%x FC 0x%x", + name, payload, packet->count & 0xffff, + (packet->count >> 16) & 0x1ff); + else + return snprintf(buf, buf_len, "%s 0x%llx", + name, payload); + case INTEL_PT_MODE_EXEC: + return snprintf(buf, buf_len, "%s %lld", name, payload); + case INTEL_PT_MODE_TSX: + return snprintf(buf, buf_len, "%s TXAbort:%u InTX:%u", + name, (unsigned)(payload >> 1) & 1, + (unsigned)payload & 1); + case INTEL_PT_PIP: + ret = snprintf(buf, buf_len, "%s 0x%llx", + name, payload); + return ret; + default: + break; + } + return snprintf(buf, buf_len, "%s 0x%llx (%d)", + name, payload, packet->count); +} diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h new file mode 100644 index 0000000..53404fa --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h @@ -0,0 +1,64 @@ +/* + * intel_pt_pkt_decoder.h: Intel Processor Trace support + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef INCLUDE__INTEL_PT_PKT_DECODER_H__ +#define INCLUDE__INTEL_PT_PKT_DECODER_H__ + +#include +#include + +#define INTEL_PT_PKT_DESC_MAX 256 + +#define INTEL_PT_NEED_MORE_BYTES -1 +#define INTEL_PT_BAD_PACKET -2 + +#define INTEL_PT_PSB_STR "\002\202\002\202\002\202\002\202" \ + "\002\202\002\202\002\202\002\202" +#define INTEL_PT_PSB_LEN 16 + +#define INTEL_PT_PKT_MAX_SZ 16 + +enum intel_pt_pkt_type { + INTEL_PT_BAD, + INTEL_PT_PAD, + INTEL_PT_TNT, + INTEL_PT_TIP_PGD, + INTEL_PT_TIP_PGE, + INTEL_PT_TSC, + INTEL_PT_MODE_EXEC, + INTEL_PT_MODE_TSX, + INTEL_PT_TIP, + INTEL_PT_FUP, + INTEL_PT_PSB, + INTEL_PT_PSBEND, + INTEL_PT_CBR, + INTEL_PT_PIP, + INTEL_PT_OVF, +}; + +struct intel_pt_pkt { + enum intel_pt_pkt_type type; + int count; + uint64_t payload; +}; + +const char *intel_pt_pkt_name(enum intel_pt_pkt_type); + +int intel_pt_get_packet(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet); + +int intel_pt_pkt_desc(const struct intel_pt_pkt *packet, char *buf, size_t len); + +#endif -- cgit v0.10.2 From 237fae79f50d2d0c7bdeb039bc2c87fc6d52c7e7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 13 Aug 2015 10:14:55 +0300 Subject: perf tools: Add Intel PT instruction decoder Add support for decoding instructions for Intel Processor Trace. The kernel x86 instruction decoder is copied for this. This essentially provides intel_pt_get_insn() which takes a binary buffer, uses the kernel's x86 instruction decoder to get details of the instruction and then categorizes it for consumption by an Intel PT decoder. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1439450095-30122-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build index faca2bf..8120af9 100644 --- a/tools/build/Makefile.build +++ b/tools/build/Makefile.build @@ -57,6 +57,8 @@ quiet_cmd_cc_i_c = CPP $@ quiet_cmd_cc_s_c = AS $@ cmd_cc_s_c = $(CC) $(c_flags) -S -o $@ $< +quiet_cmd_gen = GEN $@ + # Link agregate command # If there's nothing to link, create empty $@ object. 
quiet_cmd_ld_multi = LD $@ diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore index 09db62b..3d1bb80 100644 --- a/tools/perf/.gitignore +++ b/tools/perf/.gitignore @@ -29,3 +29,4 @@ config.mak.autogen *.pyc *.pyo .config-detected +util/intel-pt-decoder/inat-tables.c diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 4b58dae..d9863cb 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -76,6 +76,12 @@ include config/utilities.mak # # Define NO_AUXTRACE if you do not want AUX area tracing support +# As per kernel Makefile, avoid funny character set dependencies +unexport LC_ALL +LC_COLLATE=C +LC_NUMERIC=C +export LC_COLLATE LC_NUMERIC + ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(shell pwd))) srctree := $(patsubst %/,%,$(dir $(srctree))) @@ -135,6 +141,7 @@ INSTALL = install FLEX = flex BISON = bison STRIP = strip +AWK = awk LIB_DIR = $(srctree)/tools/lib/api/ TRACE_EVENT_DIR = $(srctree)/tools/lib/traceevent/ @@ -289,7 +296,7 @@ strip: $(PROGRAMS) $(OUTPUT)perf PERF_IN := $(OUTPUT)perf-in.o -export srctree OUTPUT RM CC LD AR CFLAGS V BISON FLEX +export srctree OUTPUT RM CC LD AR CFLAGS V BISON FLEX AWK build := -f $(srctree)/tools/build/Makefile.build dir=. obj $(PERF_IN): $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h FORCE @@ -565,7 +572,8 @@ clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean config-clean $(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 - $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* + $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \ + $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean $(python-clean) diff --git a/tools/perf/util/intel-pt-decoder/Build b/tools/perf/util/intel-pt-decoder/Build index 9d67381..5a46ce1 100644 --- a/tools/perf/util/intel-pt-decoder/Build +++ b/tools/perf/util/intel-pt-decoder/Build @@ -1 +1,11 @@ -libperf-$(CONFIG_AUXTRACE) += intel-pt-pkt-decoder.o +libperf-$(CONFIG_AUXTRACE) += intel-pt-pkt-decoder.o intel-pt-insn-decoder.o + +inat_tables_script = util/intel-pt-decoder/gen-insn-attr-x86.awk +inat_tables_maps = util/intel-pt-decoder/x86-opcode-map.txt + +$(OUTPUT)util/intel-pt-decoder/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + @$(call echo-cmd,gen)$(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@ + +$(OUTPUT)util/intel-pt-decoder/intel-pt-insn-decoder.o: util/intel-pt-decoder/inat.c $(OUTPUT)util/intel-pt-decoder/inat-tables.c + +CFLAGS_intel-pt-insn-decoder.o += -I$(OUTPUT)util/intel-pt-decoder -Wno-override-init diff --git a/tools/perf/util/intel-pt-decoder/gen-insn-attr-x86.awk b/tools/perf/util/intel-pt-decoder/gen-insn-attr-x86.awk new file mode 100644 index 0000000..51756734 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/gen-insn-attr-x86.awk @@ -0,0 +1,386 @@ +#!/bin/awk -f +# gen-insn-attr-x86.awk: Instruction attribute table generator +# Written by Masami Hiramatsu +# +# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c + +# Awk implementation sanity check +function check_awk_implement() { + if 
(sprintf("%x", 0) != "0") + return "Your awk has a printf-format problem." + return "" +} + +# Clear working vars +function clear_vars() { + delete table + delete lptable2 + delete lptable1 + delete lptable3 + eid = -1 # escape id + gid = -1 # group id + aid = -1 # AVX id + tname = "" +} + +BEGIN { + # Implementation error checking + awkchecked = check_awk_implement() + if (awkchecked != "") { + print "Error: " awkchecked > "/dev/stderr" + print "Please try to use gawk." > "/dev/stderr" + exit 1 + } + + # Setup generating tables + print "/* x86 opcode map generated from x86-opcode-map.txt */" + print "/* Do not change this code. */\n" + ggid = 1 + geid = 1 + gaid = 0 + delete etable + delete gtable + delete atable + + opnd_expr = "^[A-Za-z/]" + ext_expr = "^\\(" + sep_expr = "^\\|$" + group_expr = "^Grp[0-9A-Za-z]+" + + imm_expr = "^[IJAOL][a-z]" + imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" + imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)" + imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)" + imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)" + imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" + imm_flag["Ob"] = "INAT_MOFFSET" + imm_flag["Ov"] = "INAT_MOFFSET" + imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + + modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" + force64_expr = "\\([df]64\\)" + rex_expr = "^REX(\\.[XRWB]+)*" + fpu_expr = "^ESC" # TODO + + lprefix1_expr = "\\((66|!F3)\\)" + lprefix2_expr = "\\(F3\\)" + lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" + lprefix_expr = "\\((66|F2|F3)\\)" + max_lprefix = 4 + + # All opcodes starting with lower-case 'v' or with (v1) superscript + # accepts VEX prefix + vexok_opcode_expr = "^v.*" + vexok_expr = "\\(v1\\)" + # All opcodes with (v) superscript supports *only* VEX prefix + vexonly_expr = "\\(v\\)" + + prefix_expr = "\\(Prefix\\)" + prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" + prefix_num["REPNE"] = "INAT_PFX_REPNE" + prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["XACQUIRE"] = "INAT_PFX_REPNE" + prefix_num["XRELEASE"] = "INAT_PFX_REPE" + prefix_num["LOCK"] = "INAT_PFX_LOCK" + prefix_num["SEG=CS"] = "INAT_PFX_CS" + prefix_num["SEG=DS"] = "INAT_PFX_DS" + prefix_num["SEG=ES"] = "INAT_PFX_ES" + prefix_num["SEG=FS"] = "INAT_PFX_FS" + prefix_num["SEG=GS"] = "INAT_PFX_GS" + prefix_num["SEG=SS"] = "INAT_PFX_SS" + prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" + prefix_num["VEX+1byte"] = "INAT_PFX_VEX2" + prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" + + clear_vars() +} + +function semantic_error(msg) { + print "Semantic error at " NR ": " msg > "/dev/stderr" + exit 1 +} + +function debug(msg) { + print "DEBUG: " msg +} + +function array_size(arr, i,c) { + c = 0 + for (i in arr) + c++ + return c +} + +/^Table:/ { + print "/* " $0 " */" + if (tname != "") + semantic_error("Hit Table: before EndTable:."); +} + +/^Referrer:/ { + if (NF != 1) { + # escape opcode table + ref = "" + for (i = 2; i <= NF; i++) + ref = ref $i + eid = escape[ref] + tname = sprintf("inat_escape_table_%d", eid) + } +} + +/^AVXcode:/ { + if (NF != 1) { + # AVX/escape opcode table + aid = $2 + if (gaid <= aid) + gaid = aid + 1 + if (tname == "") # AVX only opcode table + tname = sprintf("inat_avx_table_%d", $2) + } + if (aid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + +/^GrpTable:/ { + print "/* " $0 " */" + if (!($2 in group)) + 
semantic_error("No group: " $2 ) + gid = group[$2] + tname = "inat_group_table_" gid +} + +function print_table(tbl,name,fmt,n) +{ + print "const insn_attr_t " name " = {" + for (i = 0; i < n; i++) { + id = sprintf(fmt, i) + if (tbl[id]) + print " [" id "] = " tbl[id] "," + } + print "};" +} + +/^EndTable/ { + if (gid != -1) { + # print group tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,3] = tname "_3" + } + } else { + # print primary/escaped tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,0] = tname + if (aid >= 0) + atable[aid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,1] = tname "_1" + if (aid >= 0) + atable[aid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,2] = tname "_2" + if (aid >= 0) + atable[aid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,3] = tname "_3" + if (aid >= 0) + atable[aid,3] = tname "_3" + } + } + print "" + clear_vars() +} + +function add_flags(old,new) { + if (old && new) + return old " | " new + else if (old) + return old + else + return new +} + +# convert operands to flags. 
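+# Each operand is matched against imm_expr/modrm_expr: immediates map
+# through imm_flag[] (a second immediate is folded in as INAT_SCNDIMM),
+# and any ModRM-style operand adds INAT_MODRM.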
+function convert_operands(count,opnd, i,j,imm,mod) +{ + imm = null + mod = null + for (j = 1; j <= count; j++) { + i = opnd[j] + if (match(i, imm_expr) == 1) { + if (!imm_flag[i]) + semantic_error("Unknown imm opnd: " i) + if (imm) { + if (i != "Ib") + semantic_error("Second IMM error") + imm = add_flags(imm, "INAT_SCNDIMM") + } else + imm = imm_flag[i] + } else if (match(i, modrm_expr)) + mod = "INAT_MODRM" + } + return add_flags(imm, mod) +} + +/^[0-9a-f]+\:/ { + if (NR == 1) + next + # get index + idx = "0x" substr($1, 1, index($1,":") - 1) + if (idx in table) + semantic_error("Redefine " idx " in " tname) + + # check if escaped opcode + if ("escape" == $2) { + if ($3 != "#") + semantic_error("No escaped name") + ref = "" + for (i = 4; i <= NF; i++) + ref = ref $i + if (ref in escape) + semantic_error("Redefine escape (" ref ")") + escape[ref] = geid + geid++ + table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")" + next + } + + variant = null + # converts + i = 2 + while (i <= NF) { + opcode = $(i++) + delete opnds + ext = null + flags = null + opnd = null + # parse one opcode + if (match($i, opnd_expr)) { + opnd = $i + count = split($(i++), opnds, ",") + flags = convert_operands(count, opnds) + } + if (match($i, ext_expr)) + ext = $(i++) + if (match($i, sep_expr)) + i++ + else if (i < NF) + semantic_error($i " is not a separator") + + # check if group opcode + if (match(opcode, group_expr)) { + if (!(opcode in group)) { + group[opcode] = ggid + ggid++ + } + flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")") + } + # check force(or default) 64bit + if (match(ext, force64_expr)) + flags = add_flags(flags, "INAT_FORCE64") + + # check REX prefix + if (match(opcode, rex_expr)) + flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)") + + # check coprocessor escape : TODO + if (match(opcode, fpu_expr)) + flags = add_flags(flags, "INAT_MODRM") + + # check VEX codes + if (match(ext, vexonly_expr)) + flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") + else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) + flags = add_flags(flags, "INAT_VEXOK") + + # check prefixes + if (match(ext, prefix_expr)) { + if (!prefix_num[opcode]) + semantic_error("Unknown prefix: " opcode) + flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")") + } + if (length(flags) == 0) + continue + # check if last prefix + if (match(ext, lprefix1_expr)) { + lptable1[idx] = add_flags(lptable1[idx],flags) + variant = "INAT_VARIANT" + } + if (match(ext, lprefix2_expr)) { + lptable2[idx] = add_flags(lptable2[idx],flags) + variant = "INAT_VARIANT" + } + if (match(ext, lprefix3_expr)) { + lptable3[idx] = add_flags(lptable3[idx],flags) + variant = "INAT_VARIANT" + } + if (!match(ext, lprefix_expr)){ + table[idx] = add_flags(table[idx],flags) + } + } + if (variant) + table[idx] = add_flags(table[idx],variant) +} + +END { + if (awkchecked != "") + exit 1 + # print escape opcode map's array + print "/* Escape opcode map array */" + print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < geid; i++) + for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print " ["i"]["j"] = "etable[i,j]"," + print "};\n" + # print group opcode map's array + print "/* Group opcode map array */" + print "const insn_attr_t * const inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if (gtable[i,j]) + print " ["i"]["j"] = "gtable[i,j]"," + print "};\n" + # print AVX 
opcode map's array + print "/* AVX opcode map array */" + print "const insn_attr_t * const inat_avx_tables[X86_VEX_M_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < gaid; i++) + for (j = 0; j < max_lprefix; j++) + if (atable[i,j]) + print " ["i"]["j"] = "atable[i,j]"," + print "};" +} diff --git a/tools/perf/util/intel-pt-decoder/inat.c b/tools/perf/util/intel-pt-decoder/inat.c new file mode 100644 index 0000000..feeaa50 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/inat.c @@ -0,0 +1,96 @@ +/* + * x86 instruction attribute tables + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include + +/* Attribute tables are generated from opcode map */ +#include "inat-tables.c" + +/* Attribute search APIs */ +insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) +{ + return inat_primary_table[opcode]; +} + +int inat_get_last_prefix_id(insn_byte_t last_pfx) +{ + insn_attr_t lpfx_attr; + + lpfx_attr = inat_get_opcode_attribute(last_pfx); + return inat_last_prefix_id(lpfx_attr); +} + +insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id, + insn_attr_t esc_attr) +{ + const insn_attr_t *table; + int n; + + n = inat_escape_id(esc_attr); + + table = inat_escape_tables[n][0]; + if (!table) + return 0; + if (inat_has_variant(table[opcode]) && lpfx_id) { + table = inat_escape_tables[n][lpfx_id]; + if (!table) + return 0; + } + return table[opcode]; +} + +insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id, + insn_attr_t grp_attr) +{ + const insn_attr_t *table; + int n; + + n = inat_group_id(grp_attr); + + table = inat_group_tables[n][0]; + if (!table) + return inat_group_common_attribute(grp_attr); + if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) { + table = inat_group_tables[n][lpfx_id]; + if (!table) + return inat_group_common_attribute(grp_attr); + } + return table[X86_MODRM_REG(modrm)] | + inat_group_common_attribute(grp_attr); +} + +insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, + insn_byte_t vex_p) +{ + const insn_attr_t *table; + if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) + return 0; + /* At first, this checks the master table */ + table = inat_avx_tables[vex_m][0]; + if (!table) + return 0; + if (!inat_is_group(table[opcode]) && vex_p) { + /* If this is not a group, get attribute directly */ + table = inat_avx_tables[vex_m][vex_p]; + if (!table) + return 0; + } + return table[opcode]; +} diff --git a/tools/perf/util/intel-pt-decoder/inat.h b/tools/perf/util/intel-pt-decoder/inat.h new file mode 100644 index 0000000..74a2e31 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/inat.h @@ -0,0 +1,221 @@ +#ifndef _ASM_X86_INAT_H +#define _ASM_X86_INAT_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it 
and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include + +/* + * Internal bits. Don't use bitmasks directly, because these bits are + * unstable. You should use checking functions. + */ + +#define INAT_OPCODE_TABLE_SIZE 256 +#define INAT_GROUP_TABLE_SIZE 8 + +/* Legacy last prefixes */ +#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ +#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */ +#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */ +/* Other Legacy prefixes */ +#define INAT_PFX_LOCK 4 /* 0xF0 */ +#define INAT_PFX_CS 5 /* 0x2E */ +#define INAT_PFX_DS 6 /* 0x3E */ +#define INAT_PFX_ES 7 /* 0x26 */ +#define INAT_PFX_FS 8 /* 0x64 */ +#define INAT_PFX_GS 9 /* 0x65 */ +#define INAT_PFX_SS 10 /* 0x36 */ +#define INAT_PFX_ADDRSZ 11 /* 0x67 */ +/* x86-64 REX prefix */ +#define INAT_PFX_REX 12 /* 0x4X */ +/* AVX VEX prefixes */ +#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ +#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ + +#define INAT_LSTPFX_MAX 3 +#define INAT_LGCPFX_MAX 11 + +/* Immediate size */ +#define INAT_IMM_BYTE 1 +#define INAT_IMM_WORD 2 +#define INAT_IMM_DWORD 3 +#define INAT_IMM_QWORD 4 +#define INAT_IMM_PTR 5 +#define INAT_IMM_VWORD32 6 +#define INAT_IMM_VWORD 7 + +/* Legacy prefix */ +#define INAT_PFX_OFFS 0 +#define INAT_PFX_BITS 4 +#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) +#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) +/* Escape opcodes */ +#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS) +#define INAT_ESC_BITS 2 +#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1) +#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS) +/* Group opcodes (1-16) */ +#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS) +#define INAT_GRP_BITS 5 +#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1) +#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS) +/* Immediates */ +#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS) +#define INAT_IMM_BITS 3 +#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS) +/* Flags */ +#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS) +#define INAT_MODRM (1 << (INAT_FLAG_OFFS)) +#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1)) +#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2)) +#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) +#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) +#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) +/* Attribute making macros for attribute tables */ +#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) +#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) +#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) +#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) + +/* Attribute search APIs */ +extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); +extern int inat_get_last_prefix_id(insn_byte_t last_pfx); +extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, + int lpfx_id, + insn_attr_t esc_attr); +extern 
insn_attr_t inat_get_group_attribute(insn_byte_t modrm, + int lpfx_id, + insn_attr_t esc_attr); +extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, + insn_byte_t vex_m, + insn_byte_t vex_pp); + +/* Attribute checking functions */ +static inline int inat_is_legacy_prefix(insn_attr_t attr) +{ + attr &= INAT_PFX_MASK; + return attr && attr <= INAT_LGCPFX_MAX; +} + +static inline int inat_is_address_size_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ; +} + +static inline int inat_is_operand_size_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ; +} + +static inline int inat_is_rex_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_REX; +} + +static inline int inat_last_prefix_id(insn_attr_t attr) +{ + if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) + return 0; + else + return attr & INAT_PFX_MASK; +} + +static inline int inat_is_vex_prefix(insn_attr_t attr) +{ + attr &= INAT_PFX_MASK; + return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3; +} + +static inline int inat_is_vex3_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; +} + +static inline int inat_is_escape(insn_attr_t attr) +{ + return attr & INAT_ESC_MASK; +} + +static inline int inat_escape_id(insn_attr_t attr) +{ + return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS; +} + +static inline int inat_is_group(insn_attr_t attr) +{ + return attr & INAT_GRP_MASK; +} + +static inline int inat_group_id(insn_attr_t attr) +{ + return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS; +} + +static inline int inat_group_common_attribute(insn_attr_t attr) +{ + return attr & ~INAT_GRP_MASK; +} + +static inline int inat_has_immediate(insn_attr_t attr) +{ + return attr & INAT_IMM_MASK; +} + +static inline int inat_immediate_size(insn_attr_t attr) +{ + return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS; +} + +static inline int inat_has_modrm(insn_attr_t attr) +{ + return attr & INAT_MODRM; +} + +static inline int inat_is_force64(insn_attr_t attr) +{ + return attr & INAT_FORCE64; +} + +static inline int inat_has_second_immediate(insn_attr_t attr) +{ + return attr & INAT_SCNDIMM; +} + +static inline int inat_has_moffset(insn_attr_t attr) +{ + return attr & INAT_MOFFSET; +} + +static inline int inat_has_variant(insn_attr_t attr) +{ + return attr & INAT_VARIANT; +} + +static inline int inat_accept_vex(insn_attr_t attr) +{ + return attr & INAT_VEXOK; +} + +static inline int inat_must_vex(insn_attr_t attr) +{ + return attr & INAT_VEXONLY; +} +#endif diff --git a/tools/perf/util/intel-pt-decoder/insn.c b/tools/perf/util/intel-pt-decoder/insn.c new file mode 100644 index 0000000..8f72b33 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/insn.c @@ -0,0 +1,594 @@ +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2002, 2004, 2009 + */ + +#ifdef __KERNEL__ +#include +#else +#include +#endif +#include +#include + +/* Verify next sizeof(t) bytes can be on the same instruction */ +#define validate_next(t, insn, n) \ + ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) + +#define __get_next(t, insn) \ + ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + +#define __peek_nbyte_next(t, insn, n) \ + ({ t r = *(t*)((insn)->next_byte + n); r; }) + +#define get_next(t, insn) \ + ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) + +#define peek_nbyte_next(t, insn, n) \ + ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); }) + +#define peek_next(t, insn) peek_nbyte_next(t, insn, 0) + +/** + * insn_init() - initialize struct insn + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @x86_64: !0 for 64-bit kernel or 64-bit app + */ +void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) +{ + /* + * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid + * even if the input buffer is long enough to hold them. + */ + if (buf_len > MAX_INSN_SIZE) + buf_len = MAX_INSN_SIZE; + + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->end_kaddr = kaddr + buf_len; + insn->next_byte = kaddr; + insn->x86_64 = x86_64 ? 1 : 0; + insn->opnd_bytes = 4; + if (x86_64) + insn->addr_bytes = 8; + else + insn->addr_bytes = 4; +} + +/** + * insn_get_prefixes - scan x86 instruction prefix bytes + * @insn: &struct insn containing instruction + * + * Populates the @insn->prefixes bitmap, and updates @insn->next_byte + * to point to the (first) opcode. No effect if @insn->prefixes.got + * is already set. 
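+ *
+ * REX and VEX prefixes, when present, are decoded here as well. The
+ * last legacy prefix seen is kept in prefixes.bytes[3], which
+ * insn_last_prefix_id() later uses to pick the 0x66/0xF3/0xF2 variant
+ * of escaped opcodes.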
+ */ +void insn_get_prefixes(struct insn *insn) +{ + struct insn_field *prefixes = &insn->prefixes; + insn_attr_t attr; + insn_byte_t b, lb; + int i, nb; + + if (prefixes->got) + return; + + nb = 0; + lb = 0; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + while (inat_is_legacy_prefix(attr)) { + /* Skip if same prefix */ + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == b) + goto found; + if (nb == 4) + /* Invalid instruction */ + break; + prefixes->bytes[nb++] = b; + if (inat_is_address_size_prefix(attr)) { + /* address size switches 2/4 or 4/8 */ + if (insn->x86_64) + insn->addr_bytes ^= 12; + else + insn->addr_bytes ^= 6; + } else if (inat_is_operand_size_prefix(attr)) { + /* oprand size switches 2/4 */ + insn->opnd_bytes ^= 6; + } +found: + prefixes->nbytes++; + insn->next_byte++; + lb = b; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + } + /* Set the last prefix */ + if (lb && lb != insn->prefixes.bytes[3]) { + if (unlikely(insn->prefixes.bytes[3])) { + /* Swap the last prefix */ + b = insn->prefixes.bytes[3]; + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == lb) + prefixes->bytes[i] = b; + } + insn->prefixes.bytes[3] = lb; + } + + /* Decode REX prefix */ + if (insn->x86_64) { + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_rex_prefix(attr)) { + insn->rex_prefix.value = b; + insn->rex_prefix.nbytes = 1; + insn->next_byte++; + if (X86_REX_W(b)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } + } + insn->rex_prefix.got = 1; + + /* Decode VEX prefix */ + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_vex_prefix(attr)) { + insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); + if (!insn->x86_64) { + /* + * In 32-bits mode, if the [7:6] bits (mod bits of + * ModRM) on the second byte are not 11b, it is + * LDS or LES. + */ + if (X86_MODRM_MOD(b2) != 3) + goto vex_end; + } + insn->vex_prefix.bytes[0] = b; + insn->vex_prefix.bytes[1] = b2; + if (inat_is_vex3_prefix(attr)) { + b2 = peek_nbyte_next(insn_byte_t, insn, 2); + insn->vex_prefix.bytes[2] = b2; + insn->vex_prefix.nbytes = 3; + insn->next_byte += 3; + if (insn->x86_64 && X86_VEX_W(b2)) + /* VEX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } else { + /* + * For VEX2, fake VEX3-like byte#2. + * Makes it easier to decode vex.W, vex.vvvv, + * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. + */ + insn->vex_prefix.bytes[2] = b2 & 0x7f; + insn->vex_prefix.nbytes = 2; + insn->next_byte += 2; + } + } +vex_end: + insn->vex_prefix.got = 1; + + prefixes->got = 1; + +err_out: + return; +} + +/** + * insn_get_opcode - collect opcode(s) + * @insn: &struct insn containing instruction + * + * Populates @insn->opcode, updates @insn->next_byte to point past the + * opcode byte(s), and set @insn->attr (except for groups). + * If necessary, first collects any preceding (prefix) bytes. + * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got + * is already 1. 
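+ *
+ * Escape bytes (0x0f, 0x0f 0x38, 0x0f 0x3a) are followed here, so up
+ * to three opcode bytes may be collected in @insn->opcode.bytes[].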
+ */ +void insn_get_opcode(struct insn *insn) +{ + struct insn_field *opcode = &insn->opcode; + insn_byte_t op; + int pfx_id; + if (opcode->got) + return; + if (!insn->prefixes.got) + insn_get_prefixes(insn); + + /* Get first opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[0] = op; + opcode->nbytes = 1; + + /* Check if there is VEX prefix or not */ + if (insn_is_avx(insn)) { + insn_byte_t m, p; + m = insn_vex_m_bits(insn); + p = insn_vex_p_bits(insn); + insn->attr = inat_get_avx_attribute(op, m, p); + if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr)) + insn->attr = 0; /* This instruction is bad */ + goto end; /* VEX has only 1 byte for opcode */ + } + + insn->attr = inat_get_opcode_attribute(op); + while (inat_is_escape(insn->attr)) { + /* Get escaped opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[opcode->nbytes++] = op; + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); + } + if (inat_must_vex(insn->attr)) + insn->attr = 0; /* This instruction is bad */ +end: + opcode->got = 1; + +err_out: + return; +} + +/** + * insn_get_modrm - collect ModRM byte, if any + * @insn: &struct insn containing instruction + * + * Populates @insn->modrm and updates @insn->next_byte to point past the + * ModRM byte, if any. If necessary, first collects the preceding bytes + * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. + */ +void insn_get_modrm(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + insn_byte_t pfx_id, mod; + if (modrm->got) + return; + if (!insn->opcode.got) + insn_get_opcode(insn); + + if (inat_has_modrm(insn->attr)) { + mod = get_next(insn_byte_t, insn); + modrm->value = mod; + modrm->nbytes = 1; + if (inat_is_group(insn->attr)) { + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_group_attribute(mod, pfx_id, + insn->attr); + if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) + insn->attr = 0; /* This is bad */ + } + } + + if (insn->x86_64 && inat_is_force64(insn->attr)) + insn->opnd_bytes = 8; + modrm->got = 1; + +err_out: + return; +} + + +/** + * insn_rip_relative() - Does instruction use RIP-relative addressing mode? + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. No effect if @insn->x86_64 is 0. + */ +int insn_rip_relative(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + + if (!insn->x86_64) + return 0; + if (!modrm->got) + insn_get_modrm(insn); + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); +} + +/** + * insn_get_sib() - Get the SIB byte of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. 
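+ *
+ * A SIB byte is only present with 32/64-bit addressing, when ModRM.mod
+ * is not 11b and ModRM.rm is 100b.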
+ */ +void insn_get_sib(struct insn *insn) +{ + insn_byte_t modrm; + + if (insn->sib.got) + return; + if (!insn->modrm.got) + insn_get_modrm(insn); + if (insn->modrm.nbytes) { + modrm = (insn_byte_t)insn->modrm.value; + if (insn->addr_bytes != 2 && + X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { + insn->sib.value = get_next(insn_byte_t, insn); + insn->sib.nbytes = 1; + } + } + insn->sib.got = 1; + +err_out: + return; +} + + +/** + * insn_get_displacement() - Get the displacement of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * SIB byte. + * Displacement value is sign-expanded. + */ +void insn_get_displacement(struct insn *insn) +{ + insn_byte_t mod, rm, base; + + if (insn->displacement.got) + return; + if (!insn->sib.got) + insn_get_sib(insn); + if (insn->modrm.nbytes) { + /* + * Interpreting the modrm byte: + * mod = 00 - no displacement fields (exceptions below) + * mod = 01 - 1-byte displacement field + * mod = 10 - displacement field is 4 bytes, or 2 bytes if + * address size = 2 (0x67 prefix in 32-bit mode) + * mod = 11 - no memory operand + * + * If address size = 2... + * mod = 00, r/m = 110 - displacement field is 2 bytes + * + * If address size != 2... + * mod != 11, r/m = 100 - SIB byte exists + * mod = 00, SIB base = 101 - displacement field is 4 bytes + * mod = 00, r/m = 101 - rip-relative addressing, displacement + * field is 4 bytes + */ + mod = X86_MODRM_MOD(insn->modrm.value); + rm = X86_MODRM_RM(insn->modrm.value); + base = X86_SIB_BASE(insn->sib.value); + if (mod == 3) + goto out; + if (mod == 1) { + insn->displacement.value = get_next(char, insn); + insn->displacement.nbytes = 1; + } else if (insn->addr_bytes == 2) { + if ((mod == 0 && rm == 6) || mod == 2) { + insn->displacement.value = + get_next(short, insn); + insn->displacement.nbytes = 2; + } + } else { + if ((mod == 0 && rm == 5) || mod == 2 || + (mod == 0 && base == 5)) { + insn->displacement.value = get_next(int, insn); + insn->displacement.nbytes = 4; + } + } + } +out: + insn->displacement.got = 1; + +err_out: + return; +} + +/* Decode moffset16/32/64. Return 0 if failed */ +static int __get_moffset(struct insn *insn) +{ + switch (insn->addr_bytes) { + case 2: + insn->moffset1.value = get_next(short, insn); + insn->moffset1.nbytes = 2; + break; + case 4: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + break; + case 8: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + insn->moffset2.value = get_next(int, insn); + insn->moffset2.nbytes = 4; + break; + default: /* opnd_bytes must be modified manually */ + goto err_out; + } + insn->moffset1.got = insn->moffset2.got = 1; + + return 1; + +err_out: + return 0; +} + +/* Decode imm v32(Iz). 
Return 0 if failed */ +static int __get_immv32(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case 4: + case 8: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + default: /* opnd_bytes must be modified manually */ + goto err_out; + } + + return 1; + +err_out: + return 0; +} + +/* Decode imm v64(Iv/Ov), Return 0 if failed */ +static int __get_immv(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + default: /* opnd_bytes must be modified manually */ + goto err_out; + } + insn->immediate1.got = insn->immediate2.got = 1; + + return 1; +err_out: + return 0; +} + +/* Decode ptr16:16/32(Ap) */ +static int __get_immptr(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + /* ptr16:64 is not exist (no segment) */ + return 0; + default: /* opnd_bytes must be modified manually */ + goto err_out; + } + insn->immediate2.value = get_next(unsigned short, insn); + insn->immediate2.nbytes = 2; + insn->immediate1.got = insn->immediate2.got = 1; + + return 1; +err_out: + return 0; +} + +/** + * insn_get_immediate() - Get the immediates of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * displacement bytes. + * Basically, most of immediates are sign-expanded. 
Unsigned-value can be + * get by bit masking with ((1 << (nbytes * 8)) - 1) + */ +void insn_get_immediate(struct insn *insn) +{ + if (insn->immediate.got) + return; + if (!insn->displacement.got) + insn_get_displacement(insn); + + if (inat_has_moffset(insn->attr)) { + if (!__get_moffset(insn)) + goto err_out; + goto done; + } + + if (!inat_has_immediate(insn->attr)) + /* no immediates */ + goto done; + + switch (inat_immediate_size(insn->attr)) { + case INAT_IMM_BYTE: + insn->immediate.value = get_next(char, insn); + insn->immediate.nbytes = 1; + break; + case INAT_IMM_WORD: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case INAT_IMM_DWORD: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + case INAT_IMM_QWORD: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + case INAT_IMM_PTR: + if (!__get_immptr(insn)) + goto err_out; + break; + case INAT_IMM_VWORD32: + if (!__get_immv32(insn)) + goto err_out; + break; + case INAT_IMM_VWORD: + if (!__get_immv(insn)) + goto err_out; + break; + default: + /* Here, insn must have an immediate, but failed */ + goto err_out; + } + if (inat_has_second_immediate(insn->attr)) { + insn->immediate2.value = get_next(char, insn); + insn->immediate2.nbytes = 1; + } +done: + insn->immediate.got = 1; + +err_out: + return; +} + +/** + * insn_get_length() - Get the length of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * immediates bytes. + */ +void insn_get_length(struct insn *insn) +{ + if (insn->length) + return; + if (!insn->immediate.got) + insn_get_immediate(insn); + insn->length = (unsigned char)((unsigned long)insn->next_byte + - (unsigned long)insn->kaddr); +} diff --git a/tools/perf/util/intel-pt-decoder/insn.h b/tools/perf/util/intel-pt-decoder/insn.h new file mode 100644 index 0000000..e7814b7 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/insn.h @@ -0,0 +1,201 @@ +#ifndef _ASM_X86_INSN_H +#define _ASM_X86_INSN_H +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2009 + */ + +/* insn_attr_t is defined in inat.h */ +#include + +struct insn_field { + union { + insn_value_t value; + insn_byte_t bytes[4]; + }; + /* !0 if we've run insn_get_xxx() for this field */ + unsigned char got; + unsigned char nbytes; +}; + +struct insn { + struct insn_field prefixes; /* + * Prefixes + * prefixes.bytes[3]: last prefix + */ + struct insn_field rex_prefix; /* REX prefix */ + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field opcode; /* + * opcode.bytes[0]: opcode1 + * opcode.bytes[1]: opcode2 + * opcode.bytes[2]: opcode3 + */ + struct insn_field modrm; + struct insn_field sib; + struct insn_field displacement; + union { + struct insn_field immediate; + struct insn_field moffset1; /* for 64bit MOV */ + struct insn_field immediate1; /* for 64bit imm or off16/32 */ + }; + union { + struct insn_field moffset2; /* for 64bit MOV */ + struct insn_field immediate2; /* for 64bit imm or seg16 */ + }; + + insn_attr_t attr; + unsigned char opnd_bytes; + unsigned char addr_bytes; + unsigned char length; + unsigned char x86_64; + + const insn_byte_t *kaddr; /* kernel address of insn to analyze */ + const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ + const insn_byte_t *next_byte; +}; + +#define MAX_INSN_SIZE 15 + +#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) +#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) +#define X86_MODRM_RM(modrm) ((modrm) & 0x07) + +#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) +#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) +#define X86_SIB_BASE(sib) ((sib) & 0x07) + +#define X86_REX_W(rex) ((rex) & 8) +#define X86_REX_R(rex) ((rex) & 4) +#define X86_REX_X(rex) ((rex) & 2) +#define X86_REX_B(rex) ((rex) & 1) + +/* VEX bit flags */ +#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ +#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ +#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ +#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ +#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ +/* VEX bit fields */ +#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ +#define X86_VEX2_M 1 /* VEX2.M always 1 */ +#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ +#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ +#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ + +extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); +extern void insn_get_prefixes(struct insn *insn); +extern void insn_get_opcode(struct insn *insn); +extern void insn_get_modrm(struct insn *insn); +extern void insn_get_sib(struct insn *insn); +extern void insn_get_displacement(struct insn *insn); +extern void insn_get_immediate(struct insn *insn); +extern void insn_get_length(struct insn *insn); + +/* Attribute will be determined after getting ModRM (for opcode groups) */ +static inline void insn_get_attribute(struct insn *insn) +{ + insn_get_modrm(insn); +} + +/* Instruction uses RIP-relative addressing */ +extern int insn_rip_relative(struct insn *insn); + +/* Init insn for kernel text */ +static inline void kernel_insn_init(struct insn *insn, + const void *kaddr, int buf_len) +{ +#ifdef CONFIG_X86_64 + insn_init(insn, kaddr, buf_len, 1); +#else /* CONFIG_X86_32 */ + insn_init(insn, kaddr, buf_len, 0); +#endif +} + +static inline int insn_is_avx(struct insn *insn) +{ + if (!insn->prefixes.got) + insn_get_prefixes(insn); + return (insn->vex_prefix.value != 0); +} + +/* Ensure this instruction 
is decoded completely */ +static inline int insn_complete(struct insn *insn) +{ + return insn->opcode.got && insn->modrm.got && insn->sib.got && + insn->displacement.got && insn->immediate.got; +} + +static inline insn_byte_t insn_vex_m_bits(struct insn *insn) +{ + if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ + return X86_VEX2_M; + else + return X86_VEX3_M(insn->vex_prefix.bytes[1]); +} + +static inline insn_byte_t insn_vex_p_bits(struct insn *insn) +{ + if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ + return X86_VEX_P(insn->vex_prefix.bytes[1]); + else + return X86_VEX_P(insn->vex_prefix.bytes[2]); +} + +/* Get the last prefix id from last prefix or VEX prefix */ +static inline int insn_last_prefix_id(struct insn *insn) +{ + if (insn_is_avx(insn)) + return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + + if (insn->prefixes.bytes[3]) + return inat_get_last_prefix_id(insn->prefixes.bytes[3]); + + return 0; +} + +/* Offset of each field from kaddr */ +static inline int insn_offset_rex_prefix(struct insn *insn) +{ + return insn->prefixes.nbytes; +} +static inline int insn_offset_vex_prefix(struct insn *insn) +{ + return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; +} +static inline int insn_offset_opcode(struct insn *insn) +{ + return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; +} +static inline int insn_offset_modrm(struct insn *insn) +{ + return insn_offset_opcode(insn) + insn->opcode.nbytes; +} +static inline int insn_offset_sib(struct insn *insn) +{ + return insn_offset_modrm(insn) + insn->modrm.nbytes; +} +static inline int insn_offset_displacement(struct insn *insn) +{ + return insn_offset_sib(insn) + insn->sib.nbytes; +} +static inline int insn_offset_immediate(struct insn *insn) +{ + return insn_offset_displacement(insn) + insn->displacement.nbytes; +} + +#endif /* _ASM_X86_INSN_H */ diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c new file mode 100644 index 0000000..46980fc --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -0,0 +1,246 @@ +/* + * intel_pt_insn_decoder.c: Intel Processor Trace support + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#include +#include +#include +#include + +#include "event.h" + +#include + +#include "inat.c" +#include "insn.c" + +#include "intel-pt-insn-decoder.h" + +/* Based on branch_type() from perf_event_intel_lbr.c */ +static void intel_pt_insn_decoder(struct insn *insn, + struct intel_pt_insn *intel_pt_insn) +{ + enum intel_pt_insn_op op = INTEL_PT_OP_OTHER; + enum intel_pt_insn_branch branch = INTEL_PT_BR_NO_BRANCH; + int ext; + + if (insn_is_avx(insn)) { + intel_pt_insn->op = INTEL_PT_OP_OTHER; + intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH; + intel_pt_insn->length = insn->length; + return; + } + + switch (insn->opcode.bytes[0]) { + case 0xf: + switch (insn->opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + op = INTEL_PT_OP_SYSCALL; + branch = INTEL_PT_BR_INDIRECT; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + op = INTEL_PT_OP_SYSRET; + branch = INTEL_PT_BR_INDIRECT; + break; + case 0x80 ... 0x8f: /* jcc */ + op = INTEL_PT_OP_JCC; + branch = INTEL_PT_BR_CONDITIONAL; + break; + default: + break; + } + break; + case 0x70 ... 0x7f: /* jcc */ + op = INTEL_PT_OP_JCC; + branch = INTEL_PT_BR_CONDITIONAL; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + op = INTEL_PT_OP_RET; + branch = INTEL_PT_BR_INDIRECT; + break; + case 0xcf: /* iret */ + op = INTEL_PT_OP_IRET; + branch = INTEL_PT_BR_INDIRECT; + break; + case 0xcc ... 0xce: /* int */ + op = INTEL_PT_OP_INT; + branch = INTEL_PT_BR_INDIRECT; + break; + case 0xe8: /* call near rel */ + op = INTEL_PT_OP_CALL; + branch = INTEL_PT_BR_UNCONDITIONAL; + break; + case 0x9a: /* call far absolute */ + op = INTEL_PT_OP_CALL; + branch = INTEL_PT_BR_INDIRECT; + break; + case 0xe0 ... 0xe2: /* loop */ + op = INTEL_PT_OP_LOOP; + branch = INTEL_PT_BR_CONDITIONAL; + break; + case 0xe3: /* jcc */ + op = INTEL_PT_OP_JCC; + branch = INTEL_PT_BR_CONDITIONAL; + break; + case 0xe9: /* jmp */ + case 0xeb: /* jmp */ + op = INTEL_PT_OP_JMP; + branch = INTEL_PT_BR_UNCONDITIONAL; + break; + case 0xea: /* far jmp */ + op = INTEL_PT_OP_JMP; + branch = INTEL_PT_BR_INDIRECT; + break; + case 0xff: /* call near absolute, call far absolute ind */ + ext = (insn->modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + op = INTEL_PT_OP_CALL; + branch = INTEL_PT_BR_INDIRECT; + break; + case 4: + case 5: + op = INTEL_PT_OP_JMP; + branch = INTEL_PT_BR_INDIRECT; + break; + default: + break; + } + break; + default: + break; + } + + intel_pt_insn->op = op; + intel_pt_insn->branch = branch; + intel_pt_insn->length = insn->length; + + if (branch == INTEL_PT_BR_CONDITIONAL || + branch == INTEL_PT_BR_UNCONDITIONAL) { +#if __BYTE_ORDER == __BIG_ENDIAN + switch (insn->immediate.nbytes) { + case 1: + intel_pt_insn->rel = insn->immediate.value; + break; + case 2: + intel_pt_insn->rel = + bswap_16((short)insn->immediate.value); + break; + case 4: + intel_pt_insn->rel = bswap_32(insn->immediate.value); + break; + } +#else + intel_pt_insn->rel = insn->immediate.value; +#endif + } +} + +int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64, + struct intel_pt_insn *intel_pt_insn) +{ + struct insn insn; + + insn_init(&insn, buf, len, x86_64); + insn_get_length(&insn); + if (!insn_complete(&insn) || insn.length > len) + return -1; + intel_pt_insn_decoder(&insn, intel_pt_insn); + if (insn.length < INTEL_PT_INSN_DBG_BUF_SZ) + memcpy(intel_pt_insn->buf, buf, insn.length); + else + memcpy(intel_pt_insn->buf, buf, 
INTEL_PT_INSN_DBG_BUF_SZ); + return 0; +} + +const char *branch_name[] = { + [INTEL_PT_OP_OTHER] = "Other", + [INTEL_PT_OP_CALL] = "Call", + [INTEL_PT_OP_RET] = "Ret", + [INTEL_PT_OP_JCC] = "Jcc", + [INTEL_PT_OP_JMP] = "Jmp", + [INTEL_PT_OP_LOOP] = "Loop", + [INTEL_PT_OP_IRET] = "IRet", + [INTEL_PT_OP_INT] = "Int", + [INTEL_PT_OP_SYSCALL] = "Syscall", + [INTEL_PT_OP_SYSRET] = "Sysret", +}; + +const char *intel_pt_insn_name(enum intel_pt_insn_op op) +{ + return branch_name[op]; +} + +int intel_pt_insn_desc(const struct intel_pt_insn *intel_pt_insn, char *buf, + size_t buf_len) +{ + switch (intel_pt_insn->branch) { + case INTEL_PT_BR_CONDITIONAL: + case INTEL_PT_BR_UNCONDITIONAL: + return snprintf(buf, buf_len, "%s %s%d", + intel_pt_insn_name(intel_pt_insn->op), + intel_pt_insn->rel > 0 ? "+" : "", + intel_pt_insn->rel); + case INTEL_PT_BR_NO_BRANCH: + case INTEL_PT_BR_INDIRECT: + return snprintf(buf, buf_len, "%s", + intel_pt_insn_name(intel_pt_insn->op)); + default: + break; + } + return 0; +} + +size_t intel_pt_insn_max_size(void) +{ + return MAX_INSN_SIZE; +} + +int intel_pt_insn_type(enum intel_pt_insn_op op) +{ + switch (op) { + case INTEL_PT_OP_OTHER: + return 0; + case INTEL_PT_OP_CALL: + return PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_CALL; + case INTEL_PT_OP_RET: + return PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_RETURN; + case INTEL_PT_OP_JCC: + return PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_CONDITIONAL; + case INTEL_PT_OP_JMP: + return PERF_IP_FLAG_BRANCH; + case INTEL_PT_OP_LOOP: + return PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_CONDITIONAL; + case INTEL_PT_OP_IRET: + return PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_RETURN | + PERF_IP_FLAG_INTERRUPT; + case INTEL_PT_OP_INT: + return PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_CALL | + PERF_IP_FLAG_INTERRUPT; + case INTEL_PT_OP_SYSCALL: + return PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_CALL | + PERF_IP_FLAG_SYSCALLRET; + case INTEL_PT_OP_SYSRET: + return PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_RETURN | + PERF_IP_FLAG_SYSCALLRET; + default: + return 0; + } +} diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.h new file mode 100644 index 0000000..b0adbf3 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.h @@ -0,0 +1,65 @@ +/* + * intel_pt_insn_decoder.h: Intel Processor Trace support + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#ifndef INCLUDE__INTEL_PT_INSN_DECODER_H__ +#define INCLUDE__INTEL_PT_INSN_DECODER_H__ + +#include +#include + +#define INTEL_PT_INSN_DESC_MAX 32 +#define INTEL_PT_INSN_DBG_BUF_SZ 16 + +enum intel_pt_insn_op { + INTEL_PT_OP_OTHER, + INTEL_PT_OP_CALL, + INTEL_PT_OP_RET, + INTEL_PT_OP_JCC, + INTEL_PT_OP_JMP, + INTEL_PT_OP_LOOP, + INTEL_PT_OP_IRET, + INTEL_PT_OP_INT, + INTEL_PT_OP_SYSCALL, + INTEL_PT_OP_SYSRET, +}; + +enum intel_pt_insn_branch { + INTEL_PT_BR_NO_BRANCH, + INTEL_PT_BR_INDIRECT, + INTEL_PT_BR_CONDITIONAL, + INTEL_PT_BR_UNCONDITIONAL, +}; + +struct intel_pt_insn { + enum intel_pt_insn_op op; + enum intel_pt_insn_branch branch; + int length; + int32_t rel; + unsigned char buf[INTEL_PT_INSN_DBG_BUF_SZ]; +}; + +int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64, + struct intel_pt_insn *intel_pt_insn); + +const char *intel_pt_insn_name(enum intel_pt_insn_op op); + +int intel_pt_insn_desc(const struct intel_pt_insn *intel_pt_insn, char *buf, + size_t buf_len); + +size_t intel_pt_insn_max_size(void); + +int intel_pt_insn_type(enum intel_pt_insn_op op); + +#endif diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt new file mode 100644 index 0000000..816488c --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -0,0 +1,970 @@ +# x86 Opcode Maps +# +# This is (mostly) based on following documentations. +# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C +# (#326018-047US, June 2013) +# +# +# Table: table-name +# Referrer: escaped-name +# AVXcode: avx-code +# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# (or) +# opcode: escape # escaped-name +# EndTable +# +# +# GrpTable: GrpXXX +# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# EndTable +# +# AVX Superscripts +# (v): this opcode requires VEX prefix. +# (v1): this opcode only supports 128bit VEX. +# +# Last Prefix Superscripts +# - (66): the last prefix is 0x66 +# - (F3): the last prefix is 0xF3 +# - (F2): the last prefix is 0xF2 +# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) +# - (66&F2): Both 0x66 and 0xF2 prefixes are specified. 
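+#
+# Example: an entry like "2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | ..."
+# in the 2-byte table below means opcode 0x0f 0x2a decodes to a different
+# form depending on the last prefix in effect.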
+ +Table: one byte opcode +Referrer: +AVXcode: +# 0x00 - 0x0f +00: ADD Eb,Gb +01: ADD Ev,Gv +02: ADD Gb,Eb +03: ADD Gv,Ev +04: ADD AL,Ib +05: ADD rAX,Iz +06: PUSH ES (i64) +07: POP ES (i64) +08: OR Eb,Gb +09: OR Ev,Gv +0a: OR Gb,Eb +0b: OR Gv,Ev +0c: OR AL,Ib +0d: OR rAX,Iz +0e: PUSH CS (i64) +0f: escape # 2-byte escape +# 0x10 - 0x1f +10: ADC Eb,Gb +11: ADC Ev,Gv +12: ADC Gb,Eb +13: ADC Gv,Ev +14: ADC AL,Ib +15: ADC rAX,Iz +16: PUSH SS (i64) +17: POP SS (i64) +18: SBB Eb,Gb +19: SBB Ev,Gv +1a: SBB Gb,Eb +1b: SBB Gv,Ev +1c: SBB AL,Ib +1d: SBB rAX,Iz +1e: PUSH DS (i64) +1f: POP DS (i64) +# 0x20 - 0x2f +20: AND Eb,Gb +21: AND Ev,Gv +22: AND Gb,Eb +23: AND Gv,Ev +24: AND AL,Ib +25: AND rAx,Iz +26: SEG=ES (Prefix) +27: DAA (i64) +28: SUB Eb,Gb +29: SUB Ev,Gv +2a: SUB Gb,Eb +2b: SUB Gv,Ev +2c: SUB AL,Ib +2d: SUB rAX,Iz +2e: SEG=CS (Prefix) +2f: DAS (i64) +# 0x30 - 0x3f +30: XOR Eb,Gb +31: XOR Ev,Gv +32: XOR Gb,Eb +33: XOR Gv,Ev +34: XOR AL,Ib +35: XOR rAX,Iz +36: SEG=SS (Prefix) +37: AAA (i64) +38: CMP Eb,Gb +39: CMP Ev,Gv +3a: CMP Gb,Eb +3b: CMP Gv,Ev +3c: CMP AL,Ib +3d: CMP rAX,Iz +3e: SEG=DS (Prefix) +3f: AAS (i64) +# 0x40 - 0x4f +40: INC eAX (i64) | REX (o64) +41: INC eCX (i64) | REX.B (o64) +42: INC eDX (i64) | REX.X (o64) +43: INC eBX (i64) | REX.XB (o64) +44: INC eSP (i64) | REX.R (o64) +45: INC eBP (i64) | REX.RB (o64) +46: INC eSI (i64) | REX.RX (o64) +47: INC eDI (i64) | REX.RXB (o64) +48: DEC eAX (i64) | REX.W (o64) +49: DEC eCX (i64) | REX.WB (o64) +4a: DEC eDX (i64) | REX.WX (o64) +4b: DEC eBX (i64) | REX.WXB (o64) +4c: DEC eSP (i64) | REX.WR (o64) +4d: DEC eBP (i64) | REX.WRB (o64) +4e: DEC eSI (i64) | REX.WRX (o64) +4f: DEC eDI (i64) | REX.WRXB (o64) +# 0x50 - 0x5f +50: PUSH rAX/r8 (d64) +51: PUSH rCX/r9 (d64) +52: PUSH rDX/r10 (d64) +53: PUSH rBX/r11 (d64) +54: PUSH rSP/r12 (d64) +55: PUSH rBP/r13 (d64) +56: PUSH rSI/r14 (d64) +57: PUSH rDI/r15 (d64) +58: POP rAX/r8 (d64) +59: POP rCX/r9 (d64) +5a: POP rDX/r10 (d64) +5b: POP rBX/r11 (d64) +5c: POP rSP/r12 (d64) +5d: POP rBP/r13 (d64) +5e: POP rSI/r14 (d64) +5f: POP rDI/r15 (d64) +# 0x60 - 0x6f +60: PUSHA/PUSHAD (i64) +61: POPA/POPAD (i64) +62: BOUND Gv,Ma (i64) +63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) +64: SEG=FS (Prefix) +65: SEG=GS (Prefix) +66: Operand-Size (Prefix) +67: Address-Size (Prefix) +68: PUSH Iz (d64) +69: IMUL Gv,Ev,Iz +6a: PUSH Ib (d64) +6b: IMUL Gv,Ev,Ib +6c: INS/INSB Yb,DX +6d: INS/INSW/INSD Yz,DX +6e: OUTS/OUTSB DX,Xb +6f: OUTS/OUTSW/OUTSD DX,Xz +# 0x70 - 0x7f +70: JO Jb +71: JNO Jb +72: JB/JNAE/JC Jb +73: JNB/JAE/JNC Jb +74: JZ/JE Jb +75: JNZ/JNE Jb +76: JBE/JNA Jb +77: JNBE/JA Jb +78: JS Jb +79: JNS Jb +7a: JP/JPE Jb +7b: JNP/JPO Jb +7c: JL/JNGE Jb +7d: JNL/JGE Jb +7e: JLE/JNG Jb +7f: JNLE/JG Jb +# 0x80 - 0x8f +80: Grp1 Eb,Ib (1A) +81: Grp1 Ev,Iz (1A) +82: Grp1 Eb,Ib (1A),(i64) +83: Grp1 Ev,Ib (1A) +84: TEST Eb,Gb +85: TEST Ev,Gv +86: XCHG Eb,Gb +87: XCHG Ev,Gv +88: MOV Eb,Gb +89: MOV Ev,Gv +8a: MOV Gb,Eb +8b: MOV Gv,Ev +8c: MOV Ev,Sw +8d: LEA Gv,M +8e: MOV Sw,Ew +8f: Grp1A (1A) | POP Ev (d64) +# 0x90 - 0x9f +90: NOP | PAUSE (F3) | XCHG r8,rAX +91: XCHG rCX/r9,rAX +92: XCHG rDX/r10,rAX +93: XCHG rBX/r11,rAX +94: XCHG rSP/r12,rAX +95: XCHG rBP/r13,rAX +96: XCHG rSI/r14,rAX +97: XCHG rDI/r15,rAX +98: CBW/CWDE/CDQE +99: CWD/CDQ/CQO +9a: CALLF Ap (i64) +9b: FWAIT/WAIT +9c: PUSHF/D/Q Fv (d64) +9d: POPF/D/Q Fv (d64) +9e: SAHF +9f: LAHF +# 0xa0 - 0xaf +a0: MOV AL,Ob +a1: MOV rAX,Ov +a2: MOV Ob,AL +a3: MOV Ov,rAX +a4: MOVS/B Yb,Xb +a5: MOVS/W/D/Q Yv,Xv +a6: CMPS/B Xb,Yb +a7: CMPS/W/D Xv,Yv +a8: TEST AL,Ib +a9: TEST 
rAX,Iz +aa: STOS/B Yb,AL +ab: STOS/W/D/Q Yv,rAX +ac: LODS/B AL,Xb +ad: LODS/W/D/Q rAX,Xv +ae: SCAS/B AL,Yb +# Note: The May 2011 Intel manual shows Xv for the second parameter of the +# next instruction but Yv is correct +af: SCAS/W/D/Q rAX,Yv +# 0xb0 - 0xbf +b0: MOV AL/R8L,Ib +b1: MOV CL/R9L,Ib +b2: MOV DL/R10L,Ib +b3: MOV BL/R11L,Ib +b4: MOV AH/R12L,Ib +b5: MOV CH/R13L,Ib +b6: MOV DH/R14L,Ib +b7: MOV BH/R15L,Ib +b8: MOV rAX/r8,Iv +b9: MOV rCX/r9,Iv +ba: MOV rDX/r10,Iv +bb: MOV rBX/r11,Iv +bc: MOV rSP/r12,Iv +bd: MOV rBP/r13,Iv +be: MOV rSI/r14,Iv +bf: MOV rDI/r15,Iv +# 0xc0 - 0xcf +c0: Grp2 Eb,Ib (1A) +c1: Grp2 Ev,Ib (1A) +c2: RETN Iw (f64) +c3: RETN +c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) +c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) +c6: Grp11A Eb,Ib (1A) +c7: Grp11B Ev,Iz (1A) +c8: ENTER Iw,Ib +c9: LEAVE (d64) +ca: RETF Iw +cb: RETF +cc: INT3 +cd: INT Ib +ce: INTO (i64) +cf: IRET/D/Q +# 0xd0 - 0xdf +d0: Grp2 Eb,1 (1A) +d1: Grp2 Ev,1 (1A) +d2: Grp2 Eb,CL (1A) +d3: Grp2 Ev,CL (1A) +d4: AAM Ib (i64) +d5: AAD Ib (i64) +d6: +d7: XLAT/XLATB +d8: ESC +d9: ESC +da: ESC +db: ESC +dc: ESC +dd: ESC +de: ESC +df: ESC +# 0xe0 - 0xef +# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix +# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation +# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. +e0: LOOPNE/LOOPNZ Jb (f64) +e1: LOOPE/LOOPZ Jb (f64) +e2: LOOP Jb (f64) +e3: JrCXZ Jb (f64) +e4: IN AL,Ib +e5: IN eAX,Ib +e6: OUT Ib,AL +e7: OUT Ib,eAX +# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset +# in "near" jumps and calls is 16-bit. For CALL, +# push of return address is 16-bit wide, RSP is decremented by 2 +# but is not truncated to 16 bits, unlike RIP. +e8: CALL Jz (f64) +e9: JMP-near Jz (f64) +ea: JMP-far Ap (i64) +eb: JMP-short Jb (f64) +ec: IN AL,DX +ed: IN eAX,DX +ee: OUT DX,AL +ef: OUT DX,eAX +# 0xf0 - 0xff +f0: LOCK (Prefix) +f1: +f2: REPNE (Prefix) | XACQUIRE (Prefix) +f3: REP/REPE (Prefix) | XRELEASE (Prefix) +f4: HLT +f5: CMC +f6: Grp3_1 Eb (1A) +f7: Grp3_2 Ev (1A) +f8: CLC +f9: STC +fa: CLI +fb: STI +fc: CLD +fd: STD +fe: Grp4 (1A) +ff: Grp5 (1A) +EndTable + +Table: 2-byte opcode (0x0f) +Referrer: 2-byte escape +AVXcode: 1 +# 0x0f 0x00-0x0f +00: Grp6 (1A) +01: Grp7 (1A) +02: LAR Gv,Ew +03: LSL Gv,Ew +04: +05: SYSCALL (o64) +06: CLTS +07: SYSRET (o64) +08: INVD +09: WBINVD +0a: +0b: UD2 (1B) +0c: +# AMD's prefetch group. Intel supports prefetchw(/1) only. +0d: GrpP +0e: FEMMS +# 3DNow! uses the last imm byte as opcode extension. +0f: 3DNow! Pq,Qq,Ib +# 0x0f 0x10-0x1f +# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands +# but it actually has operands. And also, vmovss and vmovsd only accept 128bit. +# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form. 
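+# The mnemonics themselves are informational; the generated attribute tables
+# only encode prefix/escape/group/immediate/ModRM properties of each opcode.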
+# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming +# Reference A.1 +10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1) +11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1) +12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2) +13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1) +14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66) +15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66) +16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3) +17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) +18: Grp16 (1A) +19: +1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv +1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv +1c: +1d: +1e: +1f: NOP Ev +# 0x0f 0x20-0x2f +20: MOV Rd,Cd +21: MOV Rd,Dd +22: MOV Cd,Rd +23: MOV Dd,Rd +24: +25: +26: +27: +28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66) +29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66) +2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1) +2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66) +2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1) +2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1) +2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1) +2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1) +# 0x0f 0x30-0x3f +30: WRMSR +31: RDTSC +32: RDMSR +33: RDPMC +34: SYSENTER +35: SYSEXIT +36: +37: GETSEC +38: escape # 3-byte escape 1 +39: +3a: escape # 3-byte escape 2 +3b: +3c: +3d: +3e: +3f: +# 0x0f 0x40-0x4f +40: CMOVO Gv,Ev +41: CMOVNO Gv,Ev +42: CMOVB/C/NAE Gv,Ev +43: CMOVAE/NB/NC Gv,Ev +44: CMOVE/Z Gv,Ev +45: CMOVNE/NZ Gv,Ev +46: CMOVBE/NA Gv,Ev +47: CMOVA/NBE Gv,Ev +48: CMOVS Gv,Ev +49: CMOVNS Gv,Ev +4a: CMOVP/PE Gv,Ev +4b: CMOVNP/PO Gv,Ev +4c: CMOVL/NGE Gv,Ev +4d: CMOVNL/GE Gv,Ev +4e: CMOVLE/NG Gv,Ev +4f: CMOVNLE/G Gv,Ev +# 0x0f 0x50-0x5f +50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66) +51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1) +52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1) +53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1) +54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66) +55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66) +56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66) +57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66) +58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1) +59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1) +5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1) +5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3) +5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1) +5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1) +5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1) +5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1) +# 0x0f 0x60-0x6f +60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1) +61: punpcklwd Pq,Qd | 
vpunpcklwd Vx,Hx,Wx (66),(v1) +62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1) +63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1) +64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1) +65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1) +66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1) +67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1) +68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1) +69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1) +6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1) +6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1) +6c: vpunpcklqdq Vx,Hx,Wx (66),(v1) +6d: vpunpckhqdq Vx,Hx,Wx (66),(v1) +6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1) +6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3) +# 0x0f 0x70-0x7f +70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1) +71: Grp12 (1A) +72: Grp13 (1A) +73: Grp14 (1A) +74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1) +75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1) +76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1) +# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX. +77: emms | vzeroupper | vzeroall +78: VMREAD Ey,Gy +79: VMWRITE Gy,Ey +7a: +7b: +7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2) +7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2) +7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) +7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) +# 0x0f 0x80-0x8f +# Note: "forced64" is Intel CPU behavior (see comment about CALL insn). +80: JO Jz (f64) +81: JNO Jz (f64) +82: JB/JC/JNAE Jz (f64) +83: JAE/JNB/JNC Jz (f64) +84: JE/JZ Jz (f64) +85: JNE/JNZ Jz (f64) +86: JBE/JNA Jz (f64) +87: JA/JNBE Jz (f64) +88: JS Jz (f64) +89: JNS Jz (f64) +8a: JP/JPE Jz (f64) +8b: JNP/JPO Jz (f64) +8c: JL/JNGE Jz (f64) +8d: JNL/JGE Jz (f64) +8e: JLE/JNG Jz (f64) +8f: JNLE/JG Jz (f64) +# 0x0f 0x90-0x9f +90: SETO Eb +91: SETNO Eb +92: SETB/C/NAE Eb +93: SETAE/NB/NC Eb +94: SETE/Z Eb +95: SETNE/NZ Eb +96: SETBE/NA Eb +97: SETA/NBE Eb +98: SETS Eb +99: SETNS Eb +9a: SETP/PE Eb +9b: SETNP/PO Eb +9c: SETL/NGE Eb +9d: SETNL/GE Eb +9e: SETLE/NG Eb +9f: SETNLE/G Eb +# 0x0f 0xa0-0xaf +a0: PUSH FS (d64) +a1: POP FS (d64) +a2: CPUID +a3: BT Ev,Gv +a4: SHLD Ev,Gv,Ib +a5: SHLD Ev,Gv,CL +a6: GrpPDLK +a7: GrpRNG +a8: PUSH GS (d64) +a9: POP GS (d64) +aa: RSM +ab: BTS Ev,Gv +ac: SHRD Ev,Gv,Ib +ad: SHRD Ev,Gv,CL +ae: Grp15 (1A),(1C) +af: IMUL Gv,Ev +# 0x0f 0xb0-0xbf +b0: CMPXCHG Eb,Gb +b1: CMPXCHG Ev,Gv +b2: LSS Gv,Mp +b3: BTR Ev,Gv +b4: LFS Gv,Mp +b5: LGS Gv,Mp +b6: MOVZX Gv,Eb +b7: MOVZX Gv,Ew +b8: JMPE (!F3) | POPCNT Gv,Ev (F3) +b9: Grp10 (1A) +ba: Grp8 Ev,Ib (1A) +bb: BTC Ev,Gv +bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3) +bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3) +be: MOVSX Gv,Eb +bf: MOVSX Gv,Ew +# 0x0f 0xc0-0xcf +c0: XADD Eb,Gb +c1: XADD Ev,Gv +c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1) +c3: movnti My,Gy +c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1) +c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1) +c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66) +c7: Grp9 (1A) +c8: BSWAP RAX/EAX/R8/R8D +c9: BSWAP RCX/ECX/R9/R9D +ca: BSWAP RDX/EDX/R10/R10D +cb: BSWAP RBX/EBX/R11/R11D +cc: BSWAP RSP/ESP/R12/R12D +cd: BSWAP RBP/EBP/R13/R13D +ce: BSWAP RSI/ESI/R14/R14D +cf: BSWAP RDI/EDI/R15/R15D +# 0x0f 0xd0-0xdf +d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2) +d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1) +d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1) +d3: psrlq 
Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1) +d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1) +d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1) +d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) +d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1) +d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1) +d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1) +da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1) +db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1) +dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1) +dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1) +de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1) +df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1) +# 0x0f 0xe0-0xef +e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1) +e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1) +e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1) +e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1) +e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1) +e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1) +e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2) +e7: movntq Mq,Pq | vmovntdq Mx,Vx (66) +e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1) +e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1) +ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1) +eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1) +ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1) +ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1) +ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1) +ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1) +# 0x0f 0xf0-0xff +f0: vlddqu Vx,Mx (F2) +f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1) +f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1) +f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1) +f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1) +f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1) +f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1) +f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1) +f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1) +f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1) +fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1) +fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) +fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) +fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) +fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) +ff: +EndTable + +Table: 3-byte opcode 1 (0x0f 0x38) +Referrer: 3-byte escape 1 +AVXcode: 2 +# 0x0f 0x38 0x00-0x0f +00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1) +01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1) +02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1) +03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1) +04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1) +05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1) +06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1) +07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1) +08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1) +09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1) +0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1) +0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1) +0c: vpermilps Vx,Hx,Wx (66),(v) +0d: vpermilpd Vx,Hx,Wx (66),(v) +0e: vtestps Vx,Wx (66),(v) +0f: vtestpd Vx,Wx (66),(v) +# 0x0f 0x38 0x10-0x1f +10: pblendvb Vdq,Wdq (66) +11: +12: +13: vcvtph2ps Vx,Wx,Ib (66),(v) +14: blendvps Vdq,Wdq (66) +15: blendvpd Vdq,Wdq (66) +16: vpermps Vqq,Hqq,Wqq (66),(v) +17: vptest Vx,Wx (66) +18: vbroadcastss Vx,Wd (66),(v) +19: vbroadcastsd Vqq,Wq (66),(v) +1a: vbroadcastf128 Vqq,Mdq (66),(v) +1b: +1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1) +1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1) +1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1) +1f: +# 0x0f 0x38 0x20-0x2f +20: vpmovsxbw Vx,Ux/Mq (66),(v1) +21: vpmovsxbd Vx,Ux/Md (66),(v1) +22: vpmovsxbq Vx,Ux/Mw (66),(v1) +23: vpmovsxwd Vx,Ux/Mq (66),(v1) +24: vpmovsxwq Vx,Ux/Md (66),(v1) +25: vpmovsxdq 
Vx,Ux/Mq (66),(v1) +26: +27: +28: vpmuldq Vx,Hx,Wx (66),(v1) +29: vpcmpeqq Vx,Hx,Wx (66),(v1) +2a: vmovntdqa Vx,Mx (66),(v1) +2b: vpackusdw Vx,Hx,Wx (66),(v1) +2c: vmaskmovps Vx,Hx,Mx (66),(v) +2d: vmaskmovpd Vx,Hx,Mx (66),(v) +2e: vmaskmovps Mx,Hx,Vx (66),(v) +2f: vmaskmovpd Mx,Hx,Vx (66),(v) +# 0x0f 0x38 0x30-0x3f +30: vpmovzxbw Vx,Ux/Mq (66),(v1) +31: vpmovzxbd Vx,Ux/Md (66),(v1) +32: vpmovzxbq Vx,Ux/Mw (66),(v1) +33: vpmovzxwd Vx,Ux/Mq (66),(v1) +34: vpmovzxwq Vx,Ux/Md (66),(v1) +35: vpmovzxdq Vx,Ux/Mq (66),(v1) +36: vpermd Vqq,Hqq,Wqq (66),(v) +37: vpcmpgtq Vx,Hx,Wx (66),(v1) +38: vpminsb Vx,Hx,Wx (66),(v1) +39: vpminsd Vx,Hx,Wx (66),(v1) +3a: vpminuw Vx,Hx,Wx (66),(v1) +3b: vpminud Vx,Hx,Wx (66),(v1) +3c: vpmaxsb Vx,Hx,Wx (66),(v1) +3d: vpmaxsd Vx,Hx,Wx (66),(v1) +3e: vpmaxuw Vx,Hx,Wx (66),(v1) +3f: vpmaxud Vx,Hx,Wx (66),(v1) +# 0x0f 0x38 0x40-0x8f +40: vpmulld Vx,Hx,Wx (66),(v1) +41: vphminposuw Vdq,Wdq (66),(v1) +42: +43: +44: +45: vpsrlvd/q Vx,Hx,Wx (66),(v) +46: vpsravd Vx,Hx,Wx (66),(v) +47: vpsllvd/q Vx,Hx,Wx (66),(v) +# Skip 0x48-0x57 +58: vpbroadcastd Vx,Wx (66),(v) +59: vpbroadcastq Vx,Wx (66),(v) +5a: vbroadcasti128 Vqq,Mdq (66),(v) +# Skip 0x5b-0x77 +78: vpbroadcastb Vx,Wx (66),(v) +79: vpbroadcastw Vx,Wx (66),(v) +# Skip 0x7a-0x7f +80: INVEPT Gy,Mdq (66) +81: INVPID Gy,Mdq (66) +82: INVPCID Gy,Mdq (66) +8c: vpmaskmovd/q Vx,Hx,Mx (66),(v) +8e: vpmaskmovd/q Mx,Vx,Hx (66),(v) +# 0x0f 0x38 0x90-0xbf (FMA) +90: vgatherdd/q Vx,Hx,Wx (66),(v) +91: vgatherqd/q Vx,Hx,Wx (66),(v) +92: vgatherdps/d Vx,Hx,Wx (66),(v) +93: vgatherqps/d Vx,Hx,Wx (66),(v) +94: +95: +96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v) +97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v) +98: vfmadd132ps/d Vx,Hx,Wx (66),(v) +99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1) +9a: vfmsub132ps/d Vx,Hx,Wx (66),(v) +9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1) +9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v) +9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1) +9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v) +9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1) +a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v) +a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v) +a8: vfmadd213ps/d Vx,Hx,Wx (66),(v) +a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1) +aa: vfmsub213ps/d Vx,Hx,Wx (66),(v) +ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1) +ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) +ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) +ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) +af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) +b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) +b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) +b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) +b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1) +ba: vfmsub231ps/d Vx,Hx,Wx (66),(v) +bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1) +bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v) +bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) +be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) +bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) +# 0x0f 0x38 0xc0-0xff +db: VAESIMC Vdq,Wdq (66),(v1) +dc: VAESENC Vdq,Hdq,Wdq (66),(v1) +dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) +de: VAESDEC Vdq,Hdq,Wdq (66),(v1) +df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) +f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) +f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) +f2: ANDN Gy,By,Ey (v) +f3: Grp17 (1A) +f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) +f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) +f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) +EndTable + +Table: 3-byte opcode 2 (0x0f 0x3a) +Referrer: 3-byte escape 2 +AVXcode: 3 +# 0x0f 0x3a 0x00-0xff +00: vpermq 
Vqq,Wqq,Ib (66),(v) +01: vpermpd Vqq,Wqq,Ib (66),(v) +02: vpblendd Vx,Hx,Wx,Ib (66),(v) +03: +04: vpermilps Vx,Wx,Ib (66),(v) +05: vpermilpd Vx,Wx,Ib (66),(v) +06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v) +07: +08: vroundps Vx,Wx,Ib (66) +09: vroundpd Vx,Wx,Ib (66) +0a: vroundss Vss,Wss,Ib (66),(v1) +0b: vroundsd Vsd,Wsd,Ib (66),(v1) +0c: vblendps Vx,Hx,Wx,Ib (66) +0d: vblendpd Vx,Hx,Wx,Ib (66) +0e: vpblendw Vx,Hx,Wx,Ib (66),(v1) +0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1) +14: vpextrb Rd/Mb,Vdq,Ib (66),(v1) +15: vpextrw Rd/Mw,Vdq,Ib (66),(v1) +16: vpextrd/q Ey,Vdq,Ib (66),(v1) +17: vextractps Ed,Vdq,Ib (66),(v1) +18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v) +19: vextractf128 Wdq,Vqq,Ib (66),(v) +1d: vcvtps2ph Wx,Vx,Ib (66),(v) +20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1) +21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1) +22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1) +38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v) +39: vextracti128 Wdq,Vqq,Ib (66),(v) +40: vdpps Vx,Hx,Wx,Ib (66) +41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1) +42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1) +44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1) +46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v) +4a: vblendvps Vx,Hx,Wx,Lx (66),(v) +4b: vblendvpd Vx,Hx,Wx,Lx (66),(v) +4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1) +60: vpcmpestrm Vdq,Wdq,Ib (66),(v1) +61: vpcmpestri Vdq,Wdq,Ib (66),(v1) +62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) +63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) +f0: RORX Gy,Ey,Ib (F2),(v) +EndTable + +GrpTable: Grp1 +0: ADD +1: OR +2: ADC +3: SBB +4: AND +5: SUB +6: XOR +7: CMP +EndTable + +GrpTable: Grp1A +0: POP +EndTable + +GrpTable: Grp2 +0: ROL +1: ROR +2: RCL +3: RCR +4: SHL/SAL +5: SHR +6: +7: SAR +EndTable + +GrpTable: Grp3_1 +0: TEST Eb,Ib +1: +2: NOT Eb +3: NEG Eb +4: MUL AL,Eb +5: IMUL AL,Eb +6: DIV AL,Eb +7: IDIV AL,Eb +EndTable + +GrpTable: Grp3_2 +0: TEST Ev,Iz +1: +2: NOT Ev +3: NEG Ev +4: MUL rAX,Ev +5: IMUL rAX,Ev +6: DIV rAX,Ev +7: IDIV rAX,Ev +EndTable + +GrpTable: Grp4 +0: INC Eb +1: DEC Eb +EndTable + +GrpTable: Grp5 +0: INC Ev +1: DEC Ev +# Note: "forced64" is Intel CPU behavior (see comment about CALL insn). 
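+# (Grp5 is opcode 0xff: reg=2/3 select indirect calls and reg=4/5 indirect
+# jumps, which is what the Intel PT instruction decoder keys on for 0xff.)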
+2: CALLN Ev (f64) +3: CALLF Ep +4: JMPN Ev (f64) +5: JMPF Mp +6: PUSH Ev (d64) +7: +EndTable + +GrpTable: Grp6 +0: SLDT Rv/Mw +1: STR Rv/Mw +2: LLDT Ew +3: LTR Ew +4: VERR Ew +5: VERW Ew +EndTable + +GrpTable: Grp7 +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) +3: LIDT Ms +4: SMSW Mw/Rv +5: +6: LMSW Ew +7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) +EndTable + +GrpTable: Grp8 +4: BT +5: BTS +6: BTR +7: BTC +EndTable + +GrpTable: Grp9 +1: CMPXCHG8B/16B Mq/Mdq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) +7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) +EndTable + +GrpTable: Grp10 +EndTable + +# Grp11A and Grp11B are expressed as Grp11 in Intel SDM +GrpTable: Grp11A +0: MOV Eb,Ib +7: XABORT Ib (000),(11B) +EndTable + +GrpTable: Grp11B +0: MOV Eb,Iz +7: XBEGIN Jz (000),(11B) +EndTable + +GrpTable: Grp12 +2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1) +4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1) +6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1) +EndTable + +GrpTable: Grp13 +2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1) +4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1) +6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1) +EndTable + +GrpTable: Grp14 +2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1) +3: vpsrldq Hx,Ux,Ib (66),(11B),(v1) +6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1) +7: vpslldq Hx,Ux,Ib (66),(11B),(v1) +EndTable + +GrpTable: Grp15 +0: fxsave | RDFSBASE Ry (F3),(11B) +1: fxstor | RDGSBASE Ry (F3),(11B) +2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B) +3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) +4: XSAVE +5: XRSTOR | lfence (11B) +6: XSAVEOPT | mfence (11B) +7: clflush | sfence (11B) +EndTable + +GrpTable: Grp16 +0: prefetch NTA +1: prefetch T0 +2: prefetch T1 +3: prefetch T2 +EndTable + +GrpTable: Grp17 +1: BLSR By,Ey (v) +2: BLSMSK By,Ey (v) +3: BLSI By,Ey (v) +EndTable + +# AMD's Prefetch Group +GrpTable: GrpP +0: PREFETCH +1: PREFETCHW +EndTable + +GrpTable: GrpPDLK +0: MONTMUL +1: XSHA1 +2: XSHA2 +EndTable + +GrpTable: GrpRNG +0: xstore-rng +1: xcrypt-ecb +2: xcrypt-cbc +4: xcrypt-cfb +5: xcrypt-ofb +EndTable -- cgit v0.10.2 From 53af92849d793662e943d61bb16f7d3eb2d7a072 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:39 +0300 Subject: perf tools: Add Intel PT log Add a facility to log Intel Processor Trace decoding. The log is intended for debugging purposes only. The log file name is "intel_pt.log" and is opened in the current directory. The log contains a record of all packets and instructions decoded and can get very large (10 MB would be a small one). 
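
A minimal sketch of the intended use (the call sites are illustrative;
only the intel_pt_log*() helpers below are part of this patch):

	intel_pt_log_set_name("intel_pt");	/* opens "intel_pt.log" */
	intel_pt_log_enable();
	/* ... while decoding ... */
	intel_pt_log_packet(&packet, pkt_len, pos, buf);
	intel_pt_log_at("ERROR: Bad packet", pos);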
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-5-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/intel-pt-decoder/Build b/tools/perf/util/intel-pt-decoder/Build index 5a46ce1..3c717b4 100644 --- a/tools/perf/util/intel-pt-decoder/Build +++ b/tools/perf/util/intel-pt-decoder/Build @@ -1,4 +1,4 @@ -libperf-$(CONFIG_AUXTRACE) += intel-pt-pkt-decoder.o intel-pt-insn-decoder.o +libperf-$(CONFIG_AUXTRACE) += intel-pt-pkt-decoder.o intel-pt-insn-decoder.o intel-pt-log.o inat_tables_script = util/intel-pt-decoder/gen-insn-attr-x86.awk inat_tables_maps = util/intel-pt-decoder/x86-opcode-map.txt diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.c b/tools/perf/util/intel-pt-decoder/intel-pt-log.c new file mode 100644 index 0000000..d09c7d9 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/intel-pt-log.c @@ -0,0 +1,155 @@ +/* + * intel_pt_log.c: Intel Processor Trace support + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "intel-pt-log.h" +#include "intel-pt-insn-decoder.h" + +#include "intel-pt-pkt-decoder.h" + +#define MAX_LOG_NAME 256 + +static FILE *f; +static char log_name[MAX_LOG_NAME]; +static bool enable_logging; + +void intel_pt_log_enable(void) +{ + enable_logging = true; +} + +void intel_pt_log_disable(void) +{ + if (f) + fflush(f); + enable_logging = false; +} + +void intel_pt_log_set_name(const char *name) +{ + strncpy(log_name, name, MAX_LOG_NAME - 5); + strcat(log_name, ".log"); +} + +static void intel_pt_print_data(const unsigned char *buf, int len, uint64_t pos, + int indent) +{ + int i; + + for (i = 0; i < indent; i++) + fprintf(f, " "); + + fprintf(f, " %08" PRIx64 ": ", pos); + for (i = 0; i < len; i++) + fprintf(f, " %02x", buf[i]); + for (; i < 16; i++) + fprintf(f, " "); + fprintf(f, " "); +} + +static void intel_pt_print_no_data(uint64_t pos, int indent) +{ + int i; + + for (i = 0; i < indent; i++) + fprintf(f, " "); + + fprintf(f, " %08" PRIx64 ": ", pos); + for (i = 0; i < 16; i++) + fprintf(f, " "); + fprintf(f, " "); +} + +static int intel_pt_log_open(void) +{ + if (!enable_logging) + return -1; + + if (f) + return 0; + + if (!log_name[0]) + return -1; + + f = fopen(log_name, "w+"); + if (!f) { + enable_logging = false; + return -1; + } + + return 0; +} + +void intel_pt_log_packet(const struct intel_pt_pkt *packet, int pkt_len, + uint64_t pos, const unsigned char *buf) +{ + char desc[INTEL_PT_PKT_DESC_MAX]; + + if (intel_pt_log_open()) + return; + + intel_pt_print_data(buf, pkt_len, pos, 0); + intel_pt_pkt_desc(packet, desc, INTEL_PT_PKT_DESC_MAX); + fprintf(f, "%s\n", desc); +} + +void intel_pt_log_insn(struct intel_pt_insn *intel_pt_insn, uint64_t ip) +{ + char desc[INTEL_PT_INSN_DESC_MAX]; + size_t len = intel_pt_insn->length; + + if (intel_pt_log_open()) + return; + + if (len > INTEL_PT_INSN_DBG_BUF_SZ) + len = INTEL_PT_INSN_DBG_BUF_SZ; + intel_pt_print_data(intel_pt_insn->buf, len, ip, 8); + if 
(intel_pt_insn_desc(intel_pt_insn, desc, INTEL_PT_INSN_DESC_MAX) > 0) + fprintf(f, "%s\n", desc); + else + fprintf(f, "Bad instruction!\n"); +} + +void intel_pt_log_insn_no_data(struct intel_pt_insn *intel_pt_insn, uint64_t ip) +{ + char desc[INTEL_PT_INSN_DESC_MAX]; + + if (intel_pt_log_open()) + return; + + intel_pt_print_no_data(ip, 8); + if (intel_pt_insn_desc(intel_pt_insn, desc, INTEL_PT_INSN_DESC_MAX) > 0) + fprintf(f, "%s\n", desc); + else + fprintf(f, "Bad instruction!\n"); +} + +void intel_pt_log(const char *fmt, ...) +{ + va_list args; + + if (intel_pt_log_open()) + return; + + va_start(args, fmt); + vfprintf(f, fmt, args); + va_end(args); +} diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-log.h b/tools/perf/util/intel-pt-decoder/intel-pt-log.h new file mode 100644 index 0000000..db3942f --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/intel-pt-log.h @@ -0,0 +1,52 @@ +/* + * intel_pt_log.h: Intel Processor Trace support + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef INCLUDE__INTEL_PT_LOG_H__ +#define INCLUDE__INTEL_PT_LOG_H__ + +#include +#include + +struct intel_pt_pkt; + +void intel_pt_log_enable(void); +void intel_pt_log_disable(void); +void intel_pt_log_set_name(const char *name); + +void intel_pt_log_packet(const struct intel_pt_pkt *packet, int pkt_len, + uint64_t pos, const unsigned char *buf); + +struct intel_pt_insn; + +void intel_pt_log_insn(struct intel_pt_insn *intel_pt_insn, uint64_t ip); +void intel_pt_log_insn_no_data(struct intel_pt_insn *intel_pt_insn, + uint64_t ip); + +__attribute__((format(printf, 1, 2))) +void intel_pt_log(const char *fmt, ...); + +#define x64_fmt "0x%" PRIx64 + +static inline void intel_pt_log_at(const char *msg, uint64_t u) +{ + intel_pt_log("%s at " x64_fmt "\n", msg, u); +} + +static inline void intel_pt_log_to(const char *msg, uint64_t u) +{ + intel_pt_log("%s to " x64_fmt "\n", msg, u); +} + +#endif -- cgit v0.10.2 From f4aa081949e7b6b01e711229c5a47ee3482a169c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:40 +0300 Subject: perf tools: Add Intel PT decoder Add support for decoding an Intel Processor Trace. Intel PT trace data must be 'decoded' which involves walking the object code and matching the trace data packets. The decoder requests a buffer of binary data via a get_trace() call-back, which it decodes using instruction information which it gets via another call-back walk_insn(). 
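To make the call-back contract concrete, here is a minimal consumer sketch (not part of this patch). The types and entry points match the new intel-pt-decoder.h; my_ctx, my_get_trace() and my_walk_insn() are hypothetical stand-ins, and the object-code walking itself is elided:

	/* Hypothetical consumer, for illustration only */
	#include <stdbool.h>
	#include "intel-pt-decoder.h"

	struct my_ctx {
		const unsigned char *buf;	/* raw Intel PT data */
		size_t len;
		bool delivered;
	};

	/* Hand the decoder the next buffer of trace data; len == 0 means done */
	static int my_get_trace(struct intel_pt_buffer *buffer, void *data)
	{
		struct my_ctx *ctx = data;

		buffer->buf = ctx->delivered ? NULL : ctx->buf;
		buffer->len = ctx->delivered ? 0 : ctx->len;
		buffer->consecutive = false;	/* decoder re-syncs on a new buffer */
		buffer->ref_timestamp = 0;
		buffer->trace_nr = 0;
		ctx->delivered = true;
		return 0;
	}

	/*
	 * Walk the traced object code from *ip until a branch (or until to_ip
	 * or max_insn_cnt is reached), filling *intel_pt_insn with the last
	 * instruction and advancing *ip and *insn_cnt_ptr. The real work,
	 * disassembling the executable, is elided from this sketch.
	 */
	static int my_walk_insn(struct intel_pt_insn *intel_pt_insn,
				uint64_t *insn_cnt_ptr, uint64_t *ip,
				uint64_t to_ip, uint64_t max_insn_cnt,
				void *data)
	{
		(void)intel_pt_insn; (void)insn_cnt_ptr; (void)ip;
		(void)to_ip; (void)max_insn_cnt; (void)data;
		return -1;	/* surfaces as INTEL_PT_ERR_NOINSN */
	}

	static void my_decode(struct my_ctx *ctx)
	{
		struct intel_pt_params params = {
			.get_trace		= my_get_trace,
			.walk_insn		= my_walk_insn,
			.data			= ctx,
			.return_compression	= true,
		};
		struct intel_pt_decoder *decoder;
		const struct intel_pt_state *state;

		decoder = intel_pt_decoder_new(&params);
		if (!decoder)
			return;

		do {
			state = intel_pt_decode(decoder);
			/* consume state->from_ip, state->to_ip, timestamps... */
		} while (!state->err);

		intel_pt_decoder_free(decoder);
	}

Signalling end-of-trace by returning an empty buffer, rather than an error code, is what lets the decoder distinguish "no more data" (INTEL_PT_ERR_NODATA) from a failed read.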
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-6-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/intel-pt-decoder/Build b/tools/perf/util/intel-pt-decoder/Build index 3c717b4..240730d 100644 --- a/tools/perf/util/intel-pt-decoder/Build +++ b/tools/perf/util/intel-pt-decoder/Build @@ -1,4 +1,4 @@ -libperf-$(CONFIG_AUXTRACE) += intel-pt-pkt-decoder.o intel-pt-insn-decoder.o intel-pt-log.o +libperf-$(CONFIG_AUXTRACE) += intel-pt-pkt-decoder.o intel-pt-insn-decoder.o intel-pt-log.o intel-pt-decoder.o inat_tables_script = util/intel-pt-decoder/gen-insn-attr-x86.awk inat_tables_maps = util/intel-pt-decoder/x86-opcode-map.txt diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c new file mode 100644 index 0000000..f8ac462 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -0,0 +1,1816 @@ +/* + * intel_pt_decoder.c: Intel Processor Trace support + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include + +#include "../cache.h" +#include "../util.h" + +#include "intel-pt-insn-decoder.h" +#include "intel-pt-pkt-decoder.h" +#include "intel-pt-decoder.h" +#include "intel-pt-log.h" + +#define INTEL_PT_BLK_SIZE 1024 + +#define BIT63 (((uint64_t)1 << 63)) + +#define INTEL_PT_RETURN 1 + +/* Maximum number of loops with no packets consumed i.e. 
stuck in a loop */ +#define INTEL_PT_MAX_LOOPS 10000 + +struct intel_pt_blk { + struct intel_pt_blk *prev; + uint64_t ip[INTEL_PT_BLK_SIZE]; +}; + +struct intel_pt_stack { + struct intel_pt_blk *blk; + struct intel_pt_blk *spare; + int pos; +}; + +enum intel_pt_pkt_state { + INTEL_PT_STATE_NO_PSB, + INTEL_PT_STATE_NO_IP, + INTEL_PT_STATE_ERR_RESYNC, + INTEL_PT_STATE_IN_SYNC, + INTEL_PT_STATE_TNT, + INTEL_PT_STATE_TIP, + INTEL_PT_STATE_TIP_PGD, + INTEL_PT_STATE_FUP, + INTEL_PT_STATE_FUP_NO_TIP, +}; + +#ifdef INTEL_PT_STRICT +#define INTEL_PT_STATE_ERR1 INTEL_PT_STATE_NO_PSB +#define INTEL_PT_STATE_ERR2 INTEL_PT_STATE_NO_PSB +#define INTEL_PT_STATE_ERR3 INTEL_PT_STATE_NO_PSB +#define INTEL_PT_STATE_ERR4 INTEL_PT_STATE_NO_PSB +#else +#define INTEL_PT_STATE_ERR1 (decoder->pkt_state) +#define INTEL_PT_STATE_ERR2 INTEL_PT_STATE_NO_IP +#define INTEL_PT_STATE_ERR3 INTEL_PT_STATE_ERR_RESYNC +#define INTEL_PT_STATE_ERR4 INTEL_PT_STATE_IN_SYNC +#endif + +struct intel_pt_decoder { + int (*get_trace)(struct intel_pt_buffer *buffer, void *data); + int (*walk_insn)(struct intel_pt_insn *intel_pt_insn, + uint64_t *insn_cnt_ptr, uint64_t *ip, uint64_t to_ip, + uint64_t max_insn_cnt, void *data); + void *data; + struct intel_pt_state state; + const unsigned char *buf; + size_t len; + bool return_compression; + bool pge; + uint64_t pos; + uint64_t last_ip; + uint64_t ip; + uint64_t cr3; + uint64_t timestamp; + uint64_t tsc_timestamp; + uint64_t ref_timestamp; + uint64_t ret_addr; + struct intel_pt_stack stack; + enum intel_pt_pkt_state pkt_state; + struct intel_pt_pkt packet; + struct intel_pt_pkt tnt; + int pkt_step; + int pkt_len; + unsigned int cbr; + unsigned int max_non_turbo_ratio; + int exec_mode; + unsigned int insn_bytes; + uint64_t sign_bit; + uint64_t sign_bits; + uint64_t period; + enum intel_pt_period_type period_type; + uint64_t period_insn_cnt; + uint64_t period_mask; + uint64_t period_ticks; + uint64_t last_masked_timestamp; + bool continuous_period; + bool overflow; + bool set_fup_tx_flags; + unsigned int fup_tx_flags; + unsigned int tx_flags; + uint64_t timestamp_insn_cnt; + uint64_t stuck_ip; + int no_progress; + int stuck_ip_prd; + int stuck_ip_cnt; + const unsigned char *next_buf; + size_t next_len; + unsigned char temp_buf[INTEL_PT_PKT_MAX_SZ]; +}; + +static uint64_t intel_pt_lower_power_of_2(uint64_t x) +{ + int i; + + for (i = 0; x != 1; i++) + x >>= 1; + + return x << i; +} + +static void intel_pt_setup_period(struct intel_pt_decoder *decoder) +{ + if (decoder->period_type == INTEL_PT_PERIOD_TICKS) { + uint64_t period; + + period = intel_pt_lower_power_of_2(decoder->period); + decoder->period_mask = ~(period - 1); + decoder->period_ticks = period; + } +} + +struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params) +{ + struct intel_pt_decoder *decoder; + + if (!params->get_trace || !params->walk_insn) + return NULL; + + decoder = zalloc(sizeof(struct intel_pt_decoder)); + if (!decoder) + return NULL; + + decoder->get_trace = params->get_trace; + decoder->walk_insn = params->walk_insn; + decoder->data = params->data; + decoder->return_compression = params->return_compression; + + decoder->sign_bit = (uint64_t)1 << 47; + decoder->sign_bits = ~(((uint64_t)1 << 48) - 1); + + decoder->period = params->period; + decoder->period_type = params->period_type; + + decoder->max_non_turbo_ratio = params->max_non_turbo_ratio; + + intel_pt_setup_period(decoder); + + return decoder; +} + +static void intel_pt_pop_blk(struct intel_pt_stack *stack) +{ + struct intel_pt_blk *blk = 
stack->blk; + + stack->blk = blk->prev; + if (!stack->spare) + stack->spare = blk; + else + free(blk); +} + +static uint64_t intel_pt_pop(struct intel_pt_stack *stack) +{ + if (!stack->pos) { + if (!stack->blk) + return 0; + intel_pt_pop_blk(stack); + if (!stack->blk) + return 0; + stack->pos = INTEL_PT_BLK_SIZE; + } + return stack->blk->ip[--stack->pos]; +} + +static int intel_pt_alloc_blk(struct intel_pt_stack *stack) +{ + struct intel_pt_blk *blk; + + if (stack->spare) { + blk = stack->spare; + stack->spare = NULL; + } else { + blk = malloc(sizeof(struct intel_pt_blk)); + if (!blk) + return -ENOMEM; + } + + blk->prev = stack->blk; + stack->blk = blk; + stack->pos = 0; + return 0; +} + +static int intel_pt_push(struct intel_pt_stack *stack, uint64_t ip) +{ + int err; + + if (!stack->blk || stack->pos == INTEL_PT_BLK_SIZE) { + err = intel_pt_alloc_blk(stack); + if (err) + return err; + } + + stack->blk->ip[stack->pos++] = ip; + return 0; +} + +static void intel_pt_clear_stack(struct intel_pt_stack *stack) +{ + while (stack->blk) + intel_pt_pop_blk(stack); + stack->pos = 0; +} + +static void intel_pt_free_stack(struct intel_pt_stack *stack) +{ + intel_pt_clear_stack(stack); + zfree(&stack->blk); + zfree(&stack->spare); +} + +void intel_pt_decoder_free(struct intel_pt_decoder *decoder) +{ + intel_pt_free_stack(&decoder->stack); + free(decoder); +} + +static int intel_pt_ext_err(int code) +{ + switch (code) { + case -ENOMEM: + return INTEL_PT_ERR_NOMEM; + case -ENOSYS: + return INTEL_PT_ERR_INTERN; + case -EBADMSG: + return INTEL_PT_ERR_BADPKT; + case -ENODATA: + return INTEL_PT_ERR_NODATA; + case -EILSEQ: + return INTEL_PT_ERR_NOINSN; + case -ENOENT: + return INTEL_PT_ERR_MISMAT; + case -EOVERFLOW: + return INTEL_PT_ERR_OVR; + case -ENOSPC: + return INTEL_PT_ERR_LOST; + case -ELOOP: + return INTEL_PT_ERR_NELOOP; + default: + return INTEL_PT_ERR_UNK; + } +} + +static const char *intel_pt_err_msgs[] = { + [INTEL_PT_ERR_NOMEM] = "Memory allocation failed", + [INTEL_PT_ERR_INTERN] = "Internal error", + [INTEL_PT_ERR_BADPKT] = "Bad packet", + [INTEL_PT_ERR_NODATA] = "No more data", + [INTEL_PT_ERR_NOINSN] = "Failed to get instruction", + [INTEL_PT_ERR_MISMAT] = "Trace doesn't match instruction", + [INTEL_PT_ERR_OVR] = "Overflow packet", + [INTEL_PT_ERR_LOST] = "Lost trace data", + [INTEL_PT_ERR_UNK] = "Unknown error!", + [INTEL_PT_ERR_NELOOP] = "Never-ending loop", +}; + +int intel_pt__strerror(int code, char *buf, size_t buflen) +{ + if (code < 1 || code > INTEL_PT_ERR_MAX) + code = INTEL_PT_ERR_UNK; + strlcpy(buf, intel_pt_err_msgs[code], buflen); + return 0; +} + +static uint64_t intel_pt_calc_ip(struct intel_pt_decoder *decoder, + const struct intel_pt_pkt *packet, + uint64_t last_ip) +{ + uint64_t ip; + + switch (packet->count) { + case 2: + ip = (last_ip & (uint64_t)0xffffffffffff0000ULL) | + packet->payload; + break; + case 4: + ip = (last_ip & (uint64_t)0xffffffff00000000ULL) | + packet->payload; + break; + case 6: + ip = packet->payload; + break; + default: + return 0; + } + + if (ip & decoder->sign_bit) + return ip | decoder->sign_bits; + + return ip; +} + +static inline void intel_pt_set_last_ip(struct intel_pt_decoder *decoder) +{ + decoder->last_ip = intel_pt_calc_ip(decoder, &decoder->packet, + decoder->last_ip); +} + +static inline void intel_pt_set_ip(struct intel_pt_decoder *decoder) +{ + intel_pt_set_last_ip(decoder); + decoder->ip = decoder->last_ip; +} + +static void intel_pt_decoder_log_packet(struct intel_pt_decoder *decoder) +{ + intel_pt_log_packet(&decoder->packet, 
decoder->pkt_len, decoder->pos, + decoder->buf); +} + +static int intel_pt_bug(struct intel_pt_decoder *decoder) +{ + intel_pt_log("ERROR: Internal error\n"); + decoder->pkt_state = INTEL_PT_STATE_NO_PSB; + return -ENOSYS; +} + +static inline void intel_pt_clear_tx_flags(struct intel_pt_decoder *decoder) +{ + decoder->tx_flags = 0; +} + +static inline void intel_pt_update_in_tx(struct intel_pt_decoder *decoder) +{ + decoder->tx_flags = decoder->packet.payload & INTEL_PT_IN_TX; +} + +static int intel_pt_bad_packet(struct intel_pt_decoder *decoder) +{ + intel_pt_clear_tx_flags(decoder); + decoder->pkt_len = 1; + decoder->pkt_step = 1; + intel_pt_decoder_log_packet(decoder); + if (decoder->pkt_state != INTEL_PT_STATE_NO_PSB) { + intel_pt_log("ERROR: Bad packet\n"); + decoder->pkt_state = INTEL_PT_STATE_ERR1; + } + return -EBADMSG; +} + +static int intel_pt_get_data(struct intel_pt_decoder *decoder) +{ + struct intel_pt_buffer buffer = { .buf = 0, }; + int ret; + + decoder->pkt_step = 0; + + intel_pt_log("Getting more data\n"); + ret = decoder->get_trace(&buffer, decoder->data); + if (ret) + return ret; + decoder->buf = buffer.buf; + decoder->len = buffer.len; + if (!decoder->len) { + intel_pt_log("No more data\n"); + return -ENODATA; + } + if (!buffer.consecutive) { + decoder->ip = 0; + decoder->pkt_state = INTEL_PT_STATE_NO_PSB; + decoder->ref_timestamp = buffer.ref_timestamp; + decoder->timestamp = 0; + decoder->state.trace_nr = buffer.trace_nr; + intel_pt_log("Reference timestamp 0x%" PRIx64 "\n", + decoder->ref_timestamp); + return -ENOLINK; + } + + return 0; +} + +static int intel_pt_get_next_data(struct intel_pt_decoder *decoder) +{ + if (!decoder->next_buf) + return intel_pt_get_data(decoder); + + decoder->buf = decoder->next_buf; + decoder->len = decoder->next_len; + decoder->next_buf = 0; + decoder->next_len = 0; + return 0; +} + +static int intel_pt_get_split_packet(struct intel_pt_decoder *decoder) +{ + unsigned char *buf = decoder->temp_buf; + size_t old_len, len, n; + int ret; + + old_len = decoder->len; + len = decoder->len; + memcpy(buf, decoder->buf, len); + + ret = intel_pt_get_data(decoder); + if (ret) { + decoder->pos += old_len; + return ret < 0 ? 
ret : -EINVAL; + } + + n = INTEL_PT_PKT_MAX_SZ - len; + if (n > decoder->len) + n = decoder->len; + memcpy(buf + len, decoder->buf, n); + len += n; + + ret = intel_pt_get_packet(buf, len, &decoder->packet); + if (ret < (int)old_len) { + decoder->next_buf = decoder->buf; + decoder->next_len = decoder->len; + decoder->buf = buf; + decoder->len = old_len; + return intel_pt_bad_packet(decoder); + } + + decoder->next_buf = decoder->buf + (ret - old_len); + decoder->next_len = decoder->len - (ret - old_len); + + decoder->buf = buf; + decoder->len = ret; + + return ret; +} + +static int intel_pt_get_next_packet(struct intel_pt_decoder *decoder) +{ + int ret; + + do { + decoder->pos += decoder->pkt_step; + decoder->buf += decoder->pkt_step; + decoder->len -= decoder->pkt_step; + + if (!decoder->len) { + ret = intel_pt_get_next_data(decoder); + if (ret) + return ret; + } + + ret = intel_pt_get_packet(decoder->buf, decoder->len, + &decoder->packet); + if (ret == INTEL_PT_NEED_MORE_BYTES && + decoder->len < INTEL_PT_PKT_MAX_SZ && !decoder->next_buf) { + ret = intel_pt_get_split_packet(decoder); + if (ret < 0) + return ret; + } + if (ret <= 0) + return intel_pt_bad_packet(decoder); + + decoder->pkt_len = ret; + decoder->pkt_step = ret; + intel_pt_decoder_log_packet(decoder); + } while (decoder->packet.type == INTEL_PT_PAD); + + return 0; +} + +static uint64_t intel_pt_next_period(struct intel_pt_decoder *decoder) +{ + uint64_t timestamp, masked_timestamp; + + timestamp = decoder->timestamp + decoder->timestamp_insn_cnt; + masked_timestamp = timestamp & decoder->period_mask; + if (decoder->continuous_period) { + if (masked_timestamp != decoder->last_masked_timestamp) + return 1; + } else { + timestamp += 1; + masked_timestamp = timestamp & decoder->period_mask; + if (masked_timestamp != decoder->last_masked_timestamp) { + decoder->last_masked_timestamp = masked_timestamp; + decoder->continuous_period = true; + } + } + return decoder->period_ticks - (timestamp - masked_timestamp); +} + +static uint64_t intel_pt_next_sample(struct intel_pt_decoder *decoder) +{ + switch (decoder->period_type) { + case INTEL_PT_PERIOD_INSTRUCTIONS: + return decoder->period - decoder->period_insn_cnt; + case INTEL_PT_PERIOD_TICKS: + return intel_pt_next_period(decoder); + case INTEL_PT_PERIOD_NONE: + default: + return 0; + } +} + +static void intel_pt_sample_insn(struct intel_pt_decoder *decoder) +{ + uint64_t timestamp, masked_timestamp; + + switch (decoder->period_type) { + case INTEL_PT_PERIOD_INSTRUCTIONS: + decoder->period_insn_cnt = 0; + break; + case INTEL_PT_PERIOD_TICKS: + timestamp = decoder->timestamp + decoder->timestamp_insn_cnt; + masked_timestamp = timestamp & decoder->period_mask; + decoder->last_masked_timestamp = masked_timestamp; + break; + case INTEL_PT_PERIOD_NONE: + default: + break; + } + + decoder->state.type |= INTEL_PT_INSTRUCTION; +} + +static int intel_pt_walk_insn(struct intel_pt_decoder *decoder, + struct intel_pt_insn *intel_pt_insn, uint64_t ip) +{ + uint64_t max_insn_cnt, insn_cnt = 0; + int err; + + max_insn_cnt = intel_pt_next_sample(decoder); + + err = decoder->walk_insn(intel_pt_insn, &insn_cnt, &decoder->ip, ip, + max_insn_cnt, decoder->data); + + decoder->timestamp_insn_cnt += insn_cnt; + decoder->period_insn_cnt += insn_cnt; + + if (err) { + decoder->no_progress = 0; + decoder->pkt_state = INTEL_PT_STATE_ERR2; + intel_pt_log_at("ERROR: Failed to get instruction", + decoder->ip); + if (err == -ENOENT) + return -ENOLINK; + return -EILSEQ; + } + + if (ip && decoder->ip == ip) { + err = 
-EAGAIN; + goto out; + } + + if (max_insn_cnt && insn_cnt >= max_insn_cnt) + intel_pt_sample_insn(decoder); + + if (intel_pt_insn->branch == INTEL_PT_BR_NO_BRANCH) { + decoder->state.type = INTEL_PT_INSTRUCTION; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->ip += intel_pt_insn->length; + err = INTEL_PT_RETURN; + goto out; + } + + if (intel_pt_insn->op == INTEL_PT_OP_CALL) { + /* Zero-length calls are excluded */ + if (intel_pt_insn->branch != INTEL_PT_BR_UNCONDITIONAL || + intel_pt_insn->rel) { + err = intel_pt_push(&decoder->stack, decoder->ip + + intel_pt_insn->length); + if (err) + goto out; + } + } else if (intel_pt_insn->op == INTEL_PT_OP_RET) { + decoder->ret_addr = intel_pt_pop(&decoder->stack); + } + + if (intel_pt_insn->branch == INTEL_PT_BR_UNCONDITIONAL) { + int cnt = decoder->no_progress++; + + decoder->state.from_ip = decoder->ip; + decoder->ip += intel_pt_insn->length + + intel_pt_insn->rel; + decoder->state.to_ip = decoder->ip; + err = INTEL_PT_RETURN; + + /* + * Check for being stuck in a loop. This can happen if a + * decoder error results in the decoder erroneously setting the + * ip to an address that is itself in an infinite loop that + * consumes no packets. When that happens, there must be an + * unconditional branch. + */ + if (cnt) { + if (cnt == 1) { + decoder->stuck_ip = decoder->state.to_ip; + decoder->stuck_ip_prd = 1; + decoder->stuck_ip_cnt = 1; + } else if (cnt > INTEL_PT_MAX_LOOPS || + decoder->state.to_ip == decoder->stuck_ip) { + intel_pt_log_at("ERROR: Never-ending loop", + decoder->state.to_ip); + decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC; + err = -ELOOP; + goto out; + } else if (!--decoder->stuck_ip_cnt) { + decoder->stuck_ip_prd += 1; + decoder->stuck_ip_cnt = decoder->stuck_ip_prd; + decoder->stuck_ip = decoder->state.to_ip; + } + } + goto out_no_progress; + } +out: + decoder->no_progress = 0; +out_no_progress: + decoder->state.insn_op = intel_pt_insn->op; + decoder->state.insn_len = intel_pt_insn->length; + + if (decoder->tx_flags & INTEL_PT_IN_TX) + decoder->state.flags |= INTEL_PT_IN_TX; + + return err; +} + +static int intel_pt_walk_fup(struct intel_pt_decoder *decoder) +{ + struct intel_pt_insn intel_pt_insn; + uint64_t ip; + int err; + + ip = decoder->last_ip; + + while (1) { + err = intel_pt_walk_insn(decoder, &intel_pt_insn, ip); + if (err == INTEL_PT_RETURN) + return 0; + if (err == -EAGAIN) { + if (decoder->set_fup_tx_flags) { + decoder->set_fup_tx_flags = false; + decoder->tx_flags = decoder->fup_tx_flags; + decoder->state.type = INTEL_PT_TRANSACTION; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->state.flags = decoder->fup_tx_flags; + return 0; + } + return err; + } + decoder->set_fup_tx_flags = false; + if (err) + return err; + + if (intel_pt_insn.branch == INTEL_PT_BR_INDIRECT) { + intel_pt_log_at("ERROR: Unexpected indirect branch", + decoder->ip); + decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC; + return -ENOENT; + } + + if (intel_pt_insn.branch == INTEL_PT_BR_CONDITIONAL) { + intel_pt_log_at("ERROR: Unexpected conditional branch", + decoder->ip); + decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC; + return -ENOENT; + } + + intel_pt_bug(decoder); + } +} + +static int intel_pt_walk_tip(struct intel_pt_decoder *decoder) +{ + struct intel_pt_insn intel_pt_insn; + int err; + + err = intel_pt_walk_insn(decoder, &intel_pt_insn, 0); + if (err == INTEL_PT_RETURN) + return 0; + if (err) + return err; + + if (intel_pt_insn.branch == INTEL_PT_BR_INDIRECT) { + if 
(decoder->pkt_state == INTEL_PT_STATE_TIP_PGD) { + decoder->pge = false; + decoder->continuous_period = false; + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + if (decoder->packet.count != 0) + decoder->ip = decoder->last_ip; + } else { + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; + decoder->state.from_ip = decoder->ip; + if (decoder->packet.count == 0) { + decoder->state.to_ip = 0; + } else { + decoder->state.to_ip = decoder->last_ip; + decoder->ip = decoder->last_ip; + } + } + return 0; + } + + if (intel_pt_insn.branch == INTEL_PT_BR_CONDITIONAL) { + intel_pt_log_at("ERROR: Conditional branch when expecting indirect branch", + decoder->ip); + decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC; + return -ENOENT; + } + + return intel_pt_bug(decoder); +} + +static int intel_pt_walk_tnt(struct intel_pt_decoder *decoder) +{ + struct intel_pt_insn intel_pt_insn; + int err; + + while (1) { + err = intel_pt_walk_insn(decoder, &intel_pt_insn, 0); + if (err == INTEL_PT_RETURN) + return 0; + if (err) + return err; + + if (intel_pt_insn.op == INTEL_PT_OP_RET) { + if (!decoder->return_compression) { + intel_pt_log_at("ERROR: RET when expecting conditional branch", + decoder->ip); + decoder->pkt_state = INTEL_PT_STATE_ERR3; + return -ENOENT; + } + if (!decoder->ret_addr) { + intel_pt_log_at("ERROR: Bad RET compression (stack empty)", + decoder->ip); + decoder->pkt_state = INTEL_PT_STATE_ERR3; + return -ENOENT; + } + if (!(decoder->tnt.payload & BIT63)) { + intel_pt_log_at("ERROR: Bad RET compression (TNT=N)", + decoder->ip); + decoder->pkt_state = INTEL_PT_STATE_ERR3; + return -ENOENT; + } + decoder->tnt.count -= 1; + if (!decoder->tnt.count) + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; + decoder->tnt.payload <<= 1; + decoder->state.from_ip = decoder->ip; + decoder->ip = decoder->ret_addr; + decoder->state.to_ip = decoder->ip; + return 0; + } + + if (intel_pt_insn.branch == INTEL_PT_BR_INDIRECT) { + /* Handle deferred TIPs */ + err = intel_pt_get_next_packet(decoder); + if (err) + return err; + if (decoder->packet.type != INTEL_PT_TIP || + decoder->packet.count == 0) { + intel_pt_log_at("ERROR: Missing deferred TIP for indirect branch", + decoder->ip); + decoder->pkt_state = INTEL_PT_STATE_ERR3; + decoder->pkt_step = 0; + return -ENOENT; + } + intel_pt_set_last_ip(decoder); + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = decoder->last_ip; + decoder->ip = decoder->last_ip; + return 0; + } + + if (intel_pt_insn.branch == INTEL_PT_BR_CONDITIONAL) { + decoder->tnt.count -= 1; + if (!decoder->tnt.count) + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; + if (decoder->tnt.payload & BIT63) { + decoder->tnt.payload <<= 1; + decoder->state.from_ip = decoder->ip; + decoder->ip += intel_pt_insn.length + + intel_pt_insn.rel; + decoder->state.to_ip = decoder->ip; + return 0; + } + /* Instruction sample for a non-taken branch */ + if (decoder->state.type & INTEL_PT_INSTRUCTION) { + decoder->tnt.payload <<= 1; + decoder->state.type = INTEL_PT_INSTRUCTION; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->ip += intel_pt_insn.length; + return 0; + } + decoder->ip += intel_pt_insn.length; + if (!decoder->tnt.count) + return -EAGAIN; + decoder->tnt.payload <<= 1; + continue; + } + + return intel_pt_bug(decoder); + } +} + +static int intel_pt_mode_tsx(struct intel_pt_decoder *decoder, bool *no_tip) +{ + unsigned int fup_tx_flags; + int err; + + fup_tx_flags = decoder->packet.payload & + (INTEL_PT_IN_TX | 
INTEL_PT_ABORT_TX); + err = intel_pt_get_next_packet(decoder); + if (err) + return err; + if (decoder->packet.type == INTEL_PT_FUP) { + decoder->fup_tx_flags = fup_tx_flags; + decoder->set_fup_tx_flags = true; + if (!(decoder->fup_tx_flags & INTEL_PT_ABORT_TX)) + *no_tip = true; + } else { + intel_pt_log_at("ERROR: Missing FUP after MODE.TSX", + decoder->pos); + intel_pt_update_in_tx(decoder); + } + return 0; +} + +static void intel_pt_calc_tsc_timestamp(struct intel_pt_decoder *decoder) +{ + uint64_t timestamp; + + if (decoder->ref_timestamp) { + timestamp = decoder->packet.payload | + (decoder->ref_timestamp & (0xffULL << 56)); + if (timestamp < decoder->ref_timestamp) { + if (decoder->ref_timestamp - timestamp > (1ULL << 55)) + timestamp += (1ULL << 56); + } else { + if (timestamp - decoder->ref_timestamp > (1ULL << 55)) + timestamp -= (1ULL << 56); + } + decoder->tsc_timestamp = timestamp; + decoder->timestamp = timestamp; + decoder->ref_timestamp = 0; + decoder->timestamp_insn_cnt = 0; + } else if (decoder->timestamp) { + timestamp = decoder->packet.payload | + (decoder->timestamp & (0xffULL << 56)); + if (timestamp < decoder->timestamp && + decoder->timestamp - timestamp < 0x100) { + intel_pt_log_to("ERROR: Suppressing backwards timestamp", + timestamp); + timestamp = decoder->timestamp; + } + while (timestamp < decoder->timestamp) { + intel_pt_log_to("Wraparound timestamp", timestamp); + timestamp += (1ULL << 56); + } + decoder->tsc_timestamp = timestamp; + decoder->timestamp = timestamp; + decoder->timestamp_insn_cnt = 0; + } + + intel_pt_log_to("Setting timestamp", decoder->timestamp); +} + +static int intel_pt_overflow(struct intel_pt_decoder *decoder) +{ + intel_pt_log("ERROR: Buffer overflow\n"); + intel_pt_clear_tx_flags(decoder); + decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC; + decoder->overflow = true; + return -EOVERFLOW; +} + +/* Walk PSB+ packets when already in sync. 
*/ +static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) +{ + int err; + + while (1) { + err = intel_pt_get_next_packet(decoder); + if (err) + return err; + + switch (decoder->packet.type) { + case INTEL_PT_PSBEND: + return 0; + + case INTEL_PT_TIP_PGD: + case INTEL_PT_TIP_PGE: + case INTEL_PT_TIP: + case INTEL_PT_TNT: + case INTEL_PT_BAD: + case INTEL_PT_PSB: + intel_pt_log("ERROR: Unexpected packet\n"); + return -EAGAIN; + + case INTEL_PT_OVF: + return intel_pt_overflow(decoder); + + case INTEL_PT_TSC: + intel_pt_calc_tsc_timestamp(decoder); + break; + + case INTEL_PT_CBR: + decoder->cbr = decoder->packet.payload; + break; + + case INTEL_PT_MODE_EXEC: + decoder->exec_mode = decoder->packet.payload; + break; + + case INTEL_PT_PIP: + decoder->cr3 = decoder->packet.payload; + break; + + case INTEL_PT_FUP: + decoder->pge = true; + intel_pt_set_last_ip(decoder); + break; + + case INTEL_PT_MODE_TSX: + intel_pt_update_in_tx(decoder); + break; + + case INTEL_PT_PAD: + default: + break; + } + } +} + +static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) +{ + int err; + + if (decoder->tx_flags & INTEL_PT_ABORT_TX) { + decoder->tx_flags = 0; + decoder->state.flags &= ~INTEL_PT_IN_TX; + decoder->state.flags |= INTEL_PT_ABORT_TX; + } else { + decoder->state.flags |= INTEL_PT_ASYNC; + } + + while (1) { + err = intel_pt_get_next_packet(decoder); + if (err) + return err; + + switch (decoder->packet.type) { + case INTEL_PT_TNT: + case INTEL_PT_FUP: + case INTEL_PT_PSB: + case INTEL_PT_TSC: + case INTEL_PT_CBR: + case INTEL_PT_MODE_TSX: + case INTEL_PT_BAD: + case INTEL_PT_PSBEND: + intel_pt_log("ERROR: Missing TIP after FUP\n"); + decoder->pkt_state = INTEL_PT_STATE_ERR3; + return -ENOENT; + + case INTEL_PT_OVF: + return intel_pt_overflow(decoder); + + case INTEL_PT_TIP_PGD: + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + if (decoder->packet.count != 0) { + intel_pt_set_ip(decoder); + intel_pt_log("Omitting PGD ip " x64_fmt "\n", + decoder->ip); + } + decoder->pge = false; + decoder->continuous_period = false; + return 0; + + case INTEL_PT_TIP_PGE: + decoder->pge = true; + intel_pt_log("Omitting PGE ip " x64_fmt "\n", + decoder->ip); + decoder->state.from_ip = 0; + if (decoder->packet.count == 0) { + decoder->state.to_ip = 0; + } else { + intel_pt_set_ip(decoder); + decoder->state.to_ip = decoder->ip; + } + return 0; + + case INTEL_PT_TIP: + decoder->state.from_ip = decoder->ip; + if (decoder->packet.count == 0) { + decoder->state.to_ip = 0; + } else { + intel_pt_set_ip(decoder); + decoder->state.to_ip = decoder->ip; + } + return 0; + + case INTEL_PT_PIP: + decoder->cr3 = decoder->packet.payload; + break; + + case INTEL_PT_MODE_EXEC: + decoder->exec_mode = decoder->packet.payload; + break; + + case INTEL_PT_PAD: + break; + + default: + return intel_pt_bug(decoder); + } + } +} + +static int intel_pt_walk_trace(struct intel_pt_decoder *decoder) +{ + bool no_tip = false; + int err; + + while (1) { + err = intel_pt_get_next_packet(decoder); + if (err) + return err; +next: + switch (decoder->packet.type) { + case INTEL_PT_TNT: + if (!decoder->packet.count) + break; + decoder->tnt = decoder->packet; + decoder->pkt_state = INTEL_PT_STATE_TNT; + err = intel_pt_walk_tnt(decoder); + if (err == -EAGAIN) + break; + return err; + + case INTEL_PT_TIP_PGD: + if (decoder->packet.count != 0) + intel_pt_set_last_ip(decoder); + decoder->pkt_state = INTEL_PT_STATE_TIP_PGD; + return intel_pt_walk_tip(decoder); + + case INTEL_PT_TIP_PGE: { + decoder->pge = true; + if 
(decoder->packet.count == 0) { + intel_pt_log_at("Skipping zero TIP.PGE", + decoder->pos); + break; + } + intel_pt_set_ip(decoder); + decoder->state.from_ip = 0; + decoder->state.to_ip = decoder->ip; + return 0; + } + + case INTEL_PT_OVF: + return intel_pt_overflow(decoder); + + case INTEL_PT_TIP: + if (decoder->packet.count != 0) + intel_pt_set_last_ip(decoder); + decoder->pkt_state = INTEL_PT_STATE_TIP; + return intel_pt_walk_tip(decoder); + + case INTEL_PT_FUP: + if (decoder->packet.count == 0) { + intel_pt_log_at("Skipping zero FUP", + decoder->pos); + no_tip = false; + break; + } + intel_pt_set_last_ip(decoder); + err = intel_pt_walk_fup(decoder); + if (err != -EAGAIN) { + if (err) + return err; + if (no_tip) + decoder->pkt_state = + INTEL_PT_STATE_FUP_NO_TIP; + else + decoder->pkt_state = INTEL_PT_STATE_FUP; + return 0; + } + if (no_tip) { + no_tip = false; + break; + } + return intel_pt_walk_fup_tip(decoder); + + case INTEL_PT_PSB: + intel_pt_clear_stack(&decoder->stack); + err = intel_pt_walk_psbend(decoder); + if (err == -EAGAIN) + goto next; + if (err) + return err; + break; + + case INTEL_PT_PIP: + decoder->cr3 = decoder->packet.payload; + break; + + case INTEL_PT_TSC: + intel_pt_calc_tsc_timestamp(decoder); + break; + + case INTEL_PT_CBR: + decoder->cbr = decoder->packet.payload; + break; + + case INTEL_PT_MODE_EXEC: + decoder->exec_mode = decoder->packet.payload; + break; + + case INTEL_PT_MODE_TSX: + /* MODE_TSX need not be followed by FUP */ + if (!decoder->pge) { + intel_pt_update_in_tx(decoder); + break; + } + err = intel_pt_mode_tsx(decoder, &no_tip); + if (err) + return err; + goto next; + + case INTEL_PT_BAD: /* Does not happen */ + return intel_pt_bug(decoder); + + case INTEL_PT_PSBEND: + case INTEL_PT_PAD: + break; + + default: + return intel_pt_bug(decoder); + } + } +} + +/* Walk PSB+ packets to get in sync. 
*/ +static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) +{ + int err; + + while (1) { + err = intel_pt_get_next_packet(decoder); + if (err) + return err; + + switch (decoder->packet.type) { + case INTEL_PT_TIP_PGD: + decoder->continuous_period = false; + case INTEL_PT_TIP_PGE: + case INTEL_PT_TIP: + intel_pt_log("ERROR: Unexpected packet\n"); + return -ENOENT; + + case INTEL_PT_FUP: + decoder->pge = true; + if (decoder->last_ip || decoder->packet.count == 6 || + decoder->packet.count == 0) { + uint64_t current_ip = decoder->ip; + + intel_pt_set_ip(decoder); + if (current_ip) + intel_pt_log_to("Setting IP", + decoder->ip); + } + break; + + case INTEL_PT_TSC: + intel_pt_calc_tsc_timestamp(decoder); + break; + + case INTEL_PT_CBR: + decoder->cbr = decoder->packet.payload; + break; + + case INTEL_PT_PIP: + decoder->cr3 = decoder->packet.payload; + break; + + case INTEL_PT_MODE_EXEC: + decoder->exec_mode = decoder->packet.payload; + break; + + case INTEL_PT_MODE_TSX: + intel_pt_update_in_tx(decoder); + break; + + case INTEL_PT_TNT: + intel_pt_log("ERROR: Unexpected packet\n"); + if (decoder->ip) + decoder->pkt_state = INTEL_PT_STATE_ERR4; + else + decoder->pkt_state = INTEL_PT_STATE_ERR3; + return -ENOENT; + + case INTEL_PT_BAD: /* Does not happen */ + return intel_pt_bug(decoder); + + case INTEL_PT_OVF: + return intel_pt_overflow(decoder); + + case INTEL_PT_PSBEND: + return 0; + + case INTEL_PT_PSB: + case INTEL_PT_PAD: + default: + break; + } + } +} + +static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) +{ + int err; + + while (1) { + err = intel_pt_get_next_packet(decoder); + if (err) + return err; + + switch (decoder->packet.type) { + case INTEL_PT_TIP_PGD: + decoder->continuous_period = false; + case INTEL_PT_TIP_PGE: + case INTEL_PT_TIP: + decoder->pge = decoder->packet.type != INTEL_PT_TIP_PGD; + if (decoder->last_ip || decoder->packet.count == 6 || + decoder->packet.count == 0) + intel_pt_set_ip(decoder); + if (decoder->ip) + return 0; + break; + + case INTEL_PT_FUP: + if (decoder->overflow) { + if (decoder->last_ip || + decoder->packet.count == 6 || + decoder->packet.count == 0) + intel_pt_set_ip(decoder); + if (decoder->ip) + return 0; + } + if (decoder->packet.count) + intel_pt_set_last_ip(decoder); + break; + + case INTEL_PT_TSC: + intel_pt_calc_tsc_timestamp(decoder); + break; + + case INTEL_PT_CBR: + decoder->cbr = decoder->packet.payload; + break; + + case INTEL_PT_PIP: + decoder->cr3 = decoder->packet.payload; + break; + + case INTEL_PT_MODE_EXEC: + decoder->exec_mode = decoder->packet.payload; + break; + + case INTEL_PT_MODE_TSX: + intel_pt_update_in_tx(decoder); + break; + + case INTEL_PT_OVF: + return intel_pt_overflow(decoder); + + case INTEL_PT_BAD: /* Does not happen */ + return intel_pt_bug(decoder); + + case INTEL_PT_PSB: + err = intel_pt_walk_psb(decoder); + if (err) + return err; + if (decoder->ip) { + /* Do not have a sample */ + decoder->state.type = 0; + return 0; + } + break; + + case INTEL_PT_TNT: + case INTEL_PT_PSBEND: + case INTEL_PT_PAD: + default: + break; + } + } +} + +static int intel_pt_sync_ip(struct intel_pt_decoder *decoder) +{ + int err; + + intel_pt_log("Scanning for full IP\n"); + err = intel_pt_walk_to_ip(decoder); + if (err) + return err; + + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; + decoder->overflow = false; + + decoder->state.from_ip = 0; + decoder->state.to_ip = decoder->ip; + intel_pt_log_to("Setting IP", decoder->ip); + + return 0; +} + +static int intel_pt_part_psb(struct intel_pt_decoder *decoder) +{ + const 
unsigned char *end = decoder->buf + decoder->len; + size_t i; + + for (i = INTEL_PT_PSB_LEN - 1; i; i--) { + if (i > decoder->len) + continue; + if (!memcmp(end - i, INTEL_PT_PSB_STR, i)) + return i; + } + return 0; +} + +static int intel_pt_rest_psb(struct intel_pt_decoder *decoder, int part_psb) +{ + size_t rest_psb = INTEL_PT_PSB_LEN - part_psb; + const char *psb = INTEL_PT_PSB_STR; + + if (rest_psb > decoder->len || + memcmp(decoder->buf, psb + part_psb, rest_psb)) + return 0; + + return rest_psb; +} + +static int intel_pt_get_split_psb(struct intel_pt_decoder *decoder, + int part_psb) +{ + int rest_psb, ret; + + decoder->pos += decoder->len; + decoder->len = 0; + + ret = intel_pt_get_next_data(decoder); + if (ret) + return ret; + + rest_psb = intel_pt_rest_psb(decoder, part_psb); + if (!rest_psb) + return 0; + + decoder->pos -= part_psb; + decoder->next_buf = decoder->buf + rest_psb; + decoder->next_len = decoder->len - rest_psb; + memcpy(decoder->temp_buf, INTEL_PT_PSB_STR, INTEL_PT_PSB_LEN); + decoder->buf = decoder->temp_buf; + decoder->len = INTEL_PT_PSB_LEN; + + return 0; +} + +static int intel_pt_scan_for_psb(struct intel_pt_decoder *decoder) +{ + unsigned char *next; + int ret; + + intel_pt_log("Scanning for PSB\n"); + while (1) { + if (!decoder->len) { + ret = intel_pt_get_next_data(decoder); + if (ret) + return ret; + } + + next = memmem(decoder->buf, decoder->len, INTEL_PT_PSB_STR, + INTEL_PT_PSB_LEN); + if (!next) { + int part_psb; + + part_psb = intel_pt_part_psb(decoder); + if (part_psb) { + ret = intel_pt_get_split_psb(decoder, part_psb); + if (ret) + return ret; + } else { + decoder->pos += decoder->len; + decoder->len = 0; + } + continue; + } + + decoder->pkt_step = next - decoder->buf; + return intel_pt_get_next_packet(decoder); + } +} + +static int intel_pt_sync(struct intel_pt_decoder *decoder) +{ + int err; + + decoder->pge = false; + decoder->continuous_period = false; + decoder->last_ip = 0; + decoder->ip = 0; + intel_pt_clear_stack(&decoder->stack); + + err = intel_pt_scan_for_psb(decoder); + if (err) + return err; + + decoder->pkt_state = INTEL_PT_STATE_NO_IP; + + err = intel_pt_walk_psb(decoder); + if (err) + return err; + + if (decoder->ip) { + decoder->state.type = 0; /* Do not have a sample */ + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; + } else { + return intel_pt_sync_ip(decoder); + } + + return 0; +} + +static uint64_t intel_pt_est_timestamp(struct intel_pt_decoder *decoder) +{ + uint64_t est = decoder->timestamp_insn_cnt << 1; + + if (!decoder->cbr || !decoder->max_non_turbo_ratio) + goto out; + + est *= decoder->max_non_turbo_ratio; + est /= decoder->cbr; +out: + return decoder->timestamp + est; +} + +const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder) +{ + int err; + + do { + decoder->state.type = INTEL_PT_BRANCH; + decoder->state.flags = 0; + + switch (decoder->pkt_state) { + case INTEL_PT_STATE_NO_PSB: + err = intel_pt_sync(decoder); + break; + case INTEL_PT_STATE_NO_IP: + decoder->last_ip = 0; + /* Fall through */ + case INTEL_PT_STATE_ERR_RESYNC: + err = intel_pt_sync_ip(decoder); + break; + case INTEL_PT_STATE_IN_SYNC: + err = intel_pt_walk_trace(decoder); + break; + case INTEL_PT_STATE_TNT: + err = intel_pt_walk_tnt(decoder); + if (err == -EAGAIN) + err = intel_pt_walk_trace(decoder); + break; + case INTEL_PT_STATE_TIP: + case INTEL_PT_STATE_TIP_PGD: + err = intel_pt_walk_tip(decoder); + break; + case INTEL_PT_STATE_FUP: + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; + err = intel_pt_walk_fup(decoder); + if (err == 
-EAGAIN) + err = intel_pt_walk_fup_tip(decoder); + else if (!err) + decoder->pkt_state = INTEL_PT_STATE_FUP; + break; + case INTEL_PT_STATE_FUP_NO_TIP: + decoder->pkt_state = INTEL_PT_STATE_IN_SYNC; + err = intel_pt_walk_fup(decoder); + if (err == -EAGAIN) + err = intel_pt_walk_trace(decoder); + break; + default: + err = intel_pt_bug(decoder); + break; + } + } while (err == -ENOLINK); + + decoder->state.err = err ? intel_pt_ext_err(err) : 0; + decoder->state.timestamp = decoder->timestamp; + decoder->state.est_timestamp = intel_pt_est_timestamp(decoder); + decoder->state.cr3 = decoder->cr3; + + if (err) + decoder->state.from_ip = decoder->ip; + + return &decoder->state; +} + +static bool intel_pt_at_psb(unsigned char *buf, size_t len) +{ + if (len < INTEL_PT_PSB_LEN) + return false; + return memmem(buf, INTEL_PT_PSB_LEN, INTEL_PT_PSB_STR, + INTEL_PT_PSB_LEN); +} + +/** + * intel_pt_next_psb - move buffer pointer to the start of the next PSB packet. + * @buf: pointer to buffer pointer + * @len: size of buffer + * + * Updates the buffer pointer to point to the start of the next PSB packet if + * there is one, otherwise the buffer pointer is unchanged. If @buf is updated, + * @len is adjusted accordingly. + * + * Return: %true if a PSB packet is found, %false otherwise. + */ +static bool intel_pt_next_psb(unsigned char **buf, size_t *len) +{ + unsigned char *next; + + next = memmem(*buf, *len, INTEL_PT_PSB_STR, INTEL_PT_PSB_LEN); + if (next) { + *len -= next - *buf; + *buf = next; + return true; + } + return false; +} + +/** + * intel_pt_step_psb - move buffer pointer to the start of the following PSB + * packet. + * @buf: pointer to buffer pointer + * @len: size of buffer + * + * Updates the buffer pointer to point to the start of the following PSB packet + * (skipping the PSB at @buf itself) if there is one, otherwise the buffer + * pointer is unchanged. If @buf is updated, @len is adjusted accordingly. + * + * Return: %true if a PSB packet is found, %false otherwise. + */ +static bool intel_pt_step_psb(unsigned char **buf, size_t *len) +{ + unsigned char *next; + + if (!*len) + return false; + + next = memmem(*buf + 1, *len - 1, INTEL_PT_PSB_STR, INTEL_PT_PSB_LEN); + if (next) { + *len -= next - *buf; + *buf = next; + return true; + } + return false; +} + +/** + * intel_pt_last_psb - find the last PSB packet in a buffer. + * @buf: buffer + * @len: size of buffer + * + * This function finds the last PSB in a buffer. + * + * Return: A pointer to the last PSB in @buf if found, %NULL otherwise. + */ +static unsigned char *intel_pt_last_psb(unsigned char *buf, size_t len) +{ + const char *n = INTEL_PT_PSB_STR; + unsigned char *p; + size_t k; + + if (len < INTEL_PT_PSB_LEN) + return NULL; + + k = len - INTEL_PT_PSB_LEN + 1; + while (1) { + p = memrchr(buf, n[0], k); + if (!p) + return NULL; + if (!memcmp(p + 1, n + 1, INTEL_PT_PSB_LEN - 1)) + return p; + k = p - buf; + if (!k) + return NULL; + } +} + +/** + * intel_pt_next_tsc - find and return next TSC. + * @buf: buffer + * @len: size of buffer + * @tsc: TSC value returned + * + * Find a TSC packet in @buf and return the TSC value. This function assumes + * that @buf starts at a PSB and that PSB+ will contain TSC and so stops if a + * PSBEND packet is found. + * + * Return: %true if TSC is found, false otherwise. 
+ */ +static bool intel_pt_next_tsc(unsigned char *buf, size_t len, uint64_t *tsc) +{ + struct intel_pt_pkt packet; + int ret; + + while (len) { + ret = intel_pt_get_packet(buf, len, &packet); + if (ret <= 0) + return false; + if (packet.type == INTEL_PT_TSC) { + *tsc = packet.payload; + return true; + } + if (packet.type == INTEL_PT_PSBEND) + return false; + buf += ret; + len -= ret; + } + return false; +} + +/** + * intel_pt_tsc_cmp - compare 7-byte TSCs. + * @tsc1: first TSC to compare + * @tsc2: second TSC to compare + * + * This function compares 7-byte TSC values allowing for the possibility that + * TSC wrapped around. Generally it is not possible to know if TSC has wrapped + * around so for that purpose this function assumes the absolute difference is + * less than half the maximum difference. + * + * Return: %-1 if @tsc1 is before @tsc2, %0 if @tsc1 == @tsc2, %1 if @tsc1 is + * after @tsc2. + */ +static int intel_pt_tsc_cmp(uint64_t tsc1, uint64_t tsc2) +{ + const uint64_t halfway = (1ULL << 55); + + if (tsc1 == tsc2) + return 0; + + if (tsc1 < tsc2) { + if (tsc2 - tsc1 < halfway) + return -1; + else + return 1; + } else { + if (tsc1 - tsc2 < halfway) + return 1; + else + return -1; + } +} + +/** + * intel_pt_find_overlap_tsc - determine start of non-overlapped trace data + * using TSC. + * @buf_a: first buffer + * @len_a: size of first buffer + * @buf_b: second buffer + * @len_b: size of second buffer + * + * If the trace contains TSC we can look at the last TSC of @buf_a and the + * first TSC of @buf_b in order to determine if the buffers overlap, and then + * walk forward in @buf_b until a later TSC is found. A precondition is that + * @buf_a and @buf_b are positioned at a PSB. + * + * Return: A pointer into @buf_b from where non-overlapped data starts, or + * @buf_b + @len_b if there is no non-overlapped data. + */ +static unsigned char *intel_pt_find_overlap_tsc(unsigned char *buf_a, + size_t len_a, + unsigned char *buf_b, + size_t len_b) +{ + uint64_t tsc_a, tsc_b; + unsigned char *p; + size_t len; + + p = intel_pt_last_psb(buf_a, len_a); + if (!p) + return buf_b; /* No PSB in buf_a => no overlap */ + + len = len_a - (p - buf_a); + if (!intel_pt_next_tsc(p, len, &tsc_a)) { + /* The last PSB+ in buf_a is incomplete, so go back one more */ + len_a -= len; + p = intel_pt_last_psb(buf_a, len_a); + if (!p) + return buf_b; /* No full PSB+ => assume no overlap */ + len = len_a - (p - buf_a); + if (!intel_pt_next_tsc(p, len, &tsc_a)) + return buf_b; /* No TSC in buf_a => assume no overlap */ + } + + while (1) { + /* Ignore PSB+ with no TSC */ + if (intel_pt_next_tsc(buf_b, len_b, &tsc_b) && + intel_pt_tsc_cmp(tsc_a, tsc_b) < 0) + return buf_b; /* tsc_a < tsc_b => no overlap */ + + if (!intel_pt_step_psb(&buf_b, &len_b)) + return buf_b + len_b; /* No PSB in buf_b => no data */ + } +} + +/** + * intel_pt_find_overlap - determine start of non-overlapped trace data. + * @buf_a: first buffer + * @len_a: size of first buffer + * @buf_b: second buffer + * @len_b: size of second buffer + * @have_tsc: can use TSC packets to detect overlap + * + * When trace samples or snapshots are recorded there is the possibility that + * the data overlaps. Note that, for the purposes of decoding, data is only + * useful if it begins with a PSB packet. + * + * Return: A pointer into @buf_b from where non-overlapped data starts, or + * @buf_b + @len_b if there is no non-overlapped data. 
+ */ +unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, + unsigned char *buf_b, size_t len_b, + bool have_tsc) +{ + unsigned char *found; + + /* Buffer 'b' must start at PSB so throw away everything before that */ + if (!intel_pt_next_psb(&buf_b, &len_b)) + return buf_b + len_b; /* No PSB */ + + if (!intel_pt_next_psb(&buf_a, &len_a)) + return buf_b; /* No overlap */ + + if (have_tsc) { + found = intel_pt_find_overlap_tsc(buf_a, len_a, buf_b, len_b); + if (found) + return found; + } + + /* + * Buffer 'b' cannot end within buffer 'a' so, for comparison purposes, + * we can ignore the first part of buffer 'a'. + */ + while (len_b < len_a) { + if (!intel_pt_step_psb(&buf_a, &len_a)) + return buf_b; /* No overlap */ + } + + /* Now len_b >= len_a */ + if (len_b > len_a) { + /* The leftover buffer 'b' must start at a PSB */ + while (!intel_pt_at_psb(buf_b + len_a, len_b - len_a)) { + if (!intel_pt_step_psb(&buf_a, &len_a)) + return buf_b; /* No overlap */ + } + } + + while (1) { + /* Potential overlap so check the bytes */ + found = memmem(buf_a, len_a, buf_b, len_a); + if (found) + return buf_b + len_a; + + /* Try again at next PSB in buffer 'a' */ + if (!intel_pt_step_psb(&buf_a, &len_a)) + return buf_b; /* No overlap */ + + /* The leftover buffer 'b' must start at a PSB */ + while (!intel_pt_at_psb(buf_b + len_a, len_b - len_a)) { + if (!intel_pt_step_psb(&buf_a, &len_a)) + return buf_b; /* No overlap */ + } + } +} diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h new file mode 100644 index 0000000..4c488023 --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h @@ -0,0 +1,104 @@ +/* + * intel_pt_decoder.h: Intel Processor Trace support + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#ifndef INCLUDE__INTEL_PT_DECODER_H__ +#define INCLUDE__INTEL_PT_DECODER_H__ + +#include +#include +#include + +#include "intel-pt-insn-decoder.h" + +#define INTEL_PT_IN_TX (1 << 0) +#define INTEL_PT_ABORT_TX (1 << 1) +#define INTEL_PT_ASYNC (1 << 2) + +enum intel_pt_sample_type { + INTEL_PT_BRANCH = 1 << 0, + INTEL_PT_INSTRUCTION = 1 << 1, + INTEL_PT_TRANSACTION = 1 << 2, +}; + +enum intel_pt_period_type { + INTEL_PT_PERIOD_NONE, + INTEL_PT_PERIOD_INSTRUCTIONS, + INTEL_PT_PERIOD_TICKS, +}; + +enum { + INTEL_PT_ERR_NOMEM = 1, + INTEL_PT_ERR_INTERN, + INTEL_PT_ERR_BADPKT, + INTEL_PT_ERR_NODATA, + INTEL_PT_ERR_NOINSN, + INTEL_PT_ERR_MISMAT, + INTEL_PT_ERR_OVR, + INTEL_PT_ERR_LOST, + INTEL_PT_ERR_UNK, + INTEL_PT_ERR_NELOOP, + INTEL_PT_ERR_MAX, +}; + +struct intel_pt_state { + enum intel_pt_sample_type type; + int err; + uint64_t from_ip; + uint64_t to_ip; + uint64_t cr3; + uint64_t timestamp; + uint64_t est_timestamp; + uint64_t trace_nr; + uint32_t flags; + enum intel_pt_insn_op insn_op; + int insn_len; +}; + +struct intel_pt_insn; + +struct intel_pt_buffer { + const unsigned char *buf; + size_t len; + bool consecutive; + uint64_t ref_timestamp; + uint64_t trace_nr; +}; + +struct intel_pt_params { + int (*get_trace)(struct intel_pt_buffer *buffer, void *data); + int (*walk_insn)(struct intel_pt_insn *intel_pt_insn, + uint64_t *insn_cnt_ptr, uint64_t *ip, uint64_t to_ip, + uint64_t max_insn_cnt, void *data); + void *data; + bool return_compression; + uint64_t period; + enum intel_pt_period_type period_type; + unsigned max_non_turbo_ratio; +}; + +struct intel_pt_decoder; + +struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params); +void intel_pt_decoder_free(struct intel_pt_decoder *decoder); + +const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder); + +unsigned char *intel_pt_find_overlap(unsigned char *buf_a, size_t len_a, + unsigned char *buf_b, size_t len_b, + bool have_tsc); + +int intel_pt__strerror(int code, char *buf, size_t buflen); + +#endif -- cgit v0.10.2 From 90e457f7be0870052724b2d9c2c106e5847f2c19 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:41 +0300 Subject: perf tools: Add Intel PT support Add support for Intel Processor Trace. Intel PT support fits within the new auxtrace infrastructure. Recording is supporting by identifying the Intel PT PMU, parsing options and setting up events. Decoding is supported by queuing up trace data by cpu or thread and then decoding synchronously delivering synthesized event samples into the session processing for tools to consume. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-7-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index cfbccc4..1396088 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -6,3 +6,5 @@ libperf-$(CONFIG_DWARF) += dwarf-regs.o libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o + +libperf-$(CONFIG_AUXTRACE) += intel-pt.o diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c new file mode 100644 index 0000000..da7d2c1 --- /dev/null +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -0,0 +1,752 @@ +/* + * intel_pt.c: Intel Processor Trace support + * Copyright (c) 2013-2015, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include +#include +#include +#include +#include + +#include "../../perf.h" +#include "../../util/session.h" +#include "../../util/event.h" +#include "../../util/evlist.h" +#include "../../util/evsel.h" +#include "../../util/cpumap.h" +#include "../../util/parse-options.h" +#include "../../util/parse-events.h" +#include "../../util/pmu.h" +#include "../../util/debug.h" +#include "../../util/auxtrace.h" +#include "../../util/tsc.h" +#include "../../util/intel-pt.h" + +#define KiB(x) ((x) * 1024) +#define MiB(x) ((x) * 1024 * 1024) +#define KiB_MASK(x) (KiB(x) - 1) +#define MiB_MASK(x) (MiB(x) - 1) + +#define INTEL_PT_DEFAULT_SAMPLE_SIZE KiB(4) + +#define INTEL_PT_MAX_SAMPLE_SIZE KiB(60) + +#define INTEL_PT_PSB_PERIOD_NEAR 256 + +struct intel_pt_snapshot_ref { + void *ref_buf; + size_t ref_offset; + bool wrapped; +}; + +struct intel_pt_recording { + struct auxtrace_record itr; + struct perf_pmu *intel_pt_pmu; + int have_sched_switch; + struct perf_evlist *evlist; + bool snapshot_mode; + bool snapshot_init_done; + size_t snapshot_size; + size_t snapshot_ref_buf_size; + int snapshot_ref_cnt; + struct intel_pt_snapshot_ref *snapshot_refs; +}; + +static int intel_pt_parse_terms_with_default(struct list_head *formats, + const char *str, + u64 *config) +{ + struct list_head *terms; + struct perf_event_attr attr = { .size = 0, }; + int err; + + terms = malloc(sizeof(struct list_head)); + if (!terms) + return -ENOMEM; + + INIT_LIST_HEAD(terms); + + err = parse_events_terms(terms, str); + if (err) + goto out_free; + + attr.config = *config; + err = perf_pmu__config_terms(formats, &attr, terms, true, NULL); + if (err) + goto out_free; + + *config = attr.config; +out_free: + parse_events__free_terms(terms); + return err; +} + +static int intel_pt_parse_terms(struct list_head *formats, const char *str, + u64 *config) +{ + *config = 0; + return intel_pt_parse_terms_with_default(formats, str, config); +} + +static size_t intel_pt_psb_period(struct perf_pmu *intel_pt_pmu __maybe_unused, + struct perf_evlist *evlist __maybe_unused) +{ + return 256; +} + +static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu) +{ + u64 config; + + intel_pt_parse_terms(&intel_pt_pmu->format, "tsc", &config); + return config; +} + +static int intel_pt_parse_snapshot_options(struct auxtrace_record *itr, + struct record_opts *opts, + const char *str) +{ + struct intel_pt_recording *ptr = + container_of(itr, struct intel_pt_recording, itr); + unsigned long long snapshot_size = 0; + char *endptr; + + if (str) { + snapshot_size = strtoull(str, &endptr, 0); + if (*endptr || snapshot_size > SIZE_MAX) + return -1; + } + + opts->auxtrace_snapshot_mode = true; + opts->auxtrace_snapshot_size = snapshot_size; + + ptr->snapshot_size = snapshot_size; + + return 0; +} + +struct perf_event_attr * +intel_pt_pmu_default_config(struct perf_pmu *intel_pt_pmu) +{ + struct perf_event_attr *attr; + + attr = zalloc(sizeof(struct perf_event_attr)); + if (!attr) + return NULL; + + attr->config = intel_pt_default_config(intel_pt_pmu); + + intel_pt_pmu->selectable = true; + + return 
attr; +} + +static size_t intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused) +{ + return INTEL_PT_AUXTRACE_PRIV_SIZE; +} + +static int intel_pt_info_fill(struct auxtrace_record *itr, + struct perf_session *session, + struct auxtrace_info_event *auxtrace_info, + size_t priv_size) +{ + struct intel_pt_recording *ptr = + container_of(itr, struct intel_pt_recording, itr); + struct perf_pmu *intel_pt_pmu = ptr->intel_pt_pmu; + struct perf_event_mmap_page *pc; + struct perf_tsc_conversion tc = { .time_mult = 0, }; + bool cap_user_time_zero = false, per_cpu_mmaps; + u64 tsc_bit, noretcomp_bit; + int err; + + if (priv_size != INTEL_PT_AUXTRACE_PRIV_SIZE) + return -EINVAL; + + intel_pt_parse_terms(&intel_pt_pmu->format, "tsc", &tsc_bit); + intel_pt_parse_terms(&intel_pt_pmu->format, "noretcomp", + &noretcomp_bit); + + if (!session->evlist->nr_mmaps) + return -EINVAL; + + pc = session->evlist->mmap[0].base; + if (pc) { + err = perf_read_tsc_conversion(pc, &tc); + if (err) { + if (err != -EOPNOTSUPP) + return err; + } else { + cap_user_time_zero = tc.time_mult != 0; + } + if (!cap_user_time_zero) + ui__warning("Intel Processor Trace: TSC not available\n"); + } + + per_cpu_mmaps = !cpu_map__empty(session->evlist->cpus); + + auxtrace_info->type = PERF_AUXTRACE_INTEL_PT; + auxtrace_info->priv[INTEL_PT_PMU_TYPE] = intel_pt_pmu->type; + auxtrace_info->priv[INTEL_PT_TIME_SHIFT] = tc.time_shift; + auxtrace_info->priv[INTEL_PT_TIME_MULT] = tc.time_mult; + auxtrace_info->priv[INTEL_PT_TIME_ZERO] = tc.time_zero; + auxtrace_info->priv[INTEL_PT_CAP_USER_TIME_ZERO] = cap_user_time_zero; + auxtrace_info->priv[INTEL_PT_TSC_BIT] = tsc_bit; + auxtrace_info->priv[INTEL_PT_NORETCOMP_BIT] = noretcomp_bit; + auxtrace_info->priv[INTEL_PT_HAVE_SCHED_SWITCH] = ptr->have_sched_switch; + auxtrace_info->priv[INTEL_PT_SNAPSHOT_MODE] = ptr->snapshot_mode; + auxtrace_info->priv[INTEL_PT_PER_CPU_MMAPS] = per_cpu_mmaps; + + return 0; +} + +static int intel_pt_track_switches(struct perf_evlist *evlist) +{ + const char *sched_switch = "sched:sched_switch"; + struct perf_evsel *evsel; + int err; + + if (!perf_evlist__can_select_event(evlist, sched_switch)) + return -EPERM; + + err = parse_events(evlist, sched_switch, NULL); + if (err) { + pr_debug2("%s: failed to parse %s, error %d\n", + __func__, sched_switch, err); + return err; + } + + evsel = perf_evlist__last(evlist); + + perf_evsel__set_sample_bit(evsel, CPU); + perf_evsel__set_sample_bit(evsel, TIME); + + evsel->system_wide = true; + evsel->no_aux_samples = true; + evsel->immediate = true; + + return 0; +} + +static int intel_pt_recording_options(struct auxtrace_record *itr, + struct perf_evlist *evlist, + struct record_opts *opts) +{ + struct intel_pt_recording *ptr = + container_of(itr, struct intel_pt_recording, itr); + struct perf_pmu *intel_pt_pmu = ptr->intel_pt_pmu; + bool have_timing_info; + struct perf_evsel *evsel, *intel_pt_evsel = NULL; + const struct cpu_map *cpus = evlist->cpus; + bool privileged = geteuid() == 0 || perf_event_paranoid() < 0; + u64 tsc_bit; + + ptr->evlist = evlist; + ptr->snapshot_mode = opts->auxtrace_snapshot_mode; + + evlist__for_each(evlist, evsel) { + if (evsel->attr.type == intel_pt_pmu->type) { + if (intel_pt_evsel) { + pr_err("There may be only one " INTEL_PT_PMU_NAME " event\n"); + return -EINVAL; + } + evsel->attr.freq = 0; + evsel->attr.sample_period = 1; + intel_pt_evsel = evsel; + opts->full_auxtrace = true; + } + } + + if (opts->auxtrace_snapshot_mode && !opts->full_auxtrace) { + pr_err("Snapshot mode (-S option) 
requires " INTEL_PT_PMU_NAME " PMU event (-e " INTEL_PT_PMU_NAME ")\n"); + return -EINVAL; + } + + if (opts->use_clockid) { + pr_err("Cannot use clockid (-k option) with " INTEL_PT_PMU_NAME "\n"); + return -EINVAL; + } + + if (!opts->full_auxtrace) + return 0; + + /* Set default sizes for snapshot mode */ + if (opts->auxtrace_snapshot_mode) { + size_t psb_period = intel_pt_psb_period(intel_pt_pmu, evlist); + + if (!opts->auxtrace_snapshot_size && !opts->auxtrace_mmap_pages) { + if (privileged) { + opts->auxtrace_mmap_pages = MiB(4) / page_size; + } else { + opts->auxtrace_mmap_pages = KiB(128) / page_size; + if (opts->mmap_pages == UINT_MAX) + opts->mmap_pages = KiB(256) / page_size; + } + } else if (!opts->auxtrace_mmap_pages && !privileged && + opts->mmap_pages == UINT_MAX) { + opts->mmap_pages = KiB(256) / page_size; + } + if (!opts->auxtrace_snapshot_size) + opts->auxtrace_snapshot_size = + opts->auxtrace_mmap_pages * (size_t)page_size; + if (!opts->auxtrace_mmap_pages) { + size_t sz = opts->auxtrace_snapshot_size; + + sz = round_up(sz, page_size) / page_size; + opts->auxtrace_mmap_pages = roundup_pow_of_two(sz); + } + if (opts->auxtrace_snapshot_size > + opts->auxtrace_mmap_pages * (size_t)page_size) { + pr_err("Snapshot size %zu must not be greater than AUX area tracing mmap size %zu\n", + opts->auxtrace_snapshot_size, + opts->auxtrace_mmap_pages * (size_t)page_size); + return -EINVAL; + } + if (!opts->auxtrace_snapshot_size || !opts->auxtrace_mmap_pages) { + pr_err("Failed to calculate default snapshot size and/or AUX area tracing mmap pages\n"); + return -EINVAL; + } + pr_debug2("Intel PT snapshot size: %zu\n", + opts->auxtrace_snapshot_size); + if (psb_period && + opts->auxtrace_snapshot_size <= psb_period + + INTEL_PT_PSB_PERIOD_NEAR) + ui__warning("Intel PT snapshot size (%zu) may be too small for PSB period (%zu)\n", + opts->auxtrace_snapshot_size, psb_period); + } + + /* Set default sizes for full trace mode */ + if (opts->full_auxtrace && !opts->auxtrace_mmap_pages) { + if (privileged) { + opts->auxtrace_mmap_pages = MiB(4) / page_size; + } else { + opts->auxtrace_mmap_pages = KiB(128) / page_size; + if (opts->mmap_pages == UINT_MAX) + opts->mmap_pages = KiB(256) / page_size; + } + } + + /* Validate auxtrace_mmap_pages */ + if (opts->auxtrace_mmap_pages) { + size_t sz = opts->auxtrace_mmap_pages * (size_t)page_size; + size_t min_sz; + + if (opts->auxtrace_snapshot_mode) + min_sz = KiB(4); + else + min_sz = KiB(8); + + if (sz < min_sz || !is_power_of_2(sz)) { + pr_err("Invalid mmap size for Intel Processor Trace: must be at least %zuKiB and a power of 2\n", + min_sz / 1024); + return -EINVAL; + } + } + + intel_pt_parse_terms(&intel_pt_pmu->format, "tsc", &tsc_bit); + + if (opts->full_auxtrace && (intel_pt_evsel->attr.config & tsc_bit)) + have_timing_info = true; + else + have_timing_info = false; + + /* + * Per-cpu recording needs sched_switch events to distinguish different + * threads. + */ + if (have_timing_info && !cpu_map__empty(cpus)) { + int err; + + err = intel_pt_track_switches(evlist); + if (err == -EPERM) + pr_debug2("Unable to select sched:sched_switch\n"); + else if (err) + return err; + else + ptr->have_sched_switch = 1; + } + + if (intel_pt_evsel) { + /* + * To obtain the auxtrace buffer file descriptor, the auxtrace + * event must come first. + */ + perf_evlist__to_front(evlist, intel_pt_evsel); + /* + * In the case of per-cpu mmaps, we need the CPU on the + * AUX event. 
+ */ + if (!cpu_map__empty(cpus)) + perf_evsel__set_sample_bit(intel_pt_evsel, CPU); + } + + /* Add dummy event to keep tracking */ + if (opts->full_auxtrace) { + struct perf_evsel *tracking_evsel; + int err; + + err = parse_events(evlist, "dummy:u", NULL); + if (err) + return err; + + tracking_evsel = perf_evlist__last(evlist); + + perf_evlist__set_tracking_event(evlist, tracking_evsel); + + tracking_evsel->attr.freq = 0; + tracking_evsel->attr.sample_period = 1; + + /* In per-cpu case, always need the time of mmap events etc */ + if (!cpu_map__empty(cpus)) + perf_evsel__set_sample_bit(tracking_evsel, TIME); + } + + /* + * Warn the user when we do not have enough information to decode i.e. + * per-cpu with no sched_switch (except workload-only). + */ + if (!ptr->have_sched_switch && !cpu_map__empty(cpus) && + !target__none(&opts->target)) + ui__warning("Intel Processor Trace decoding will not be possible except for kernel tracing!\n"); + + return 0; +} + +static int intel_pt_snapshot_start(struct auxtrace_record *itr) +{ + struct intel_pt_recording *ptr = + container_of(itr, struct intel_pt_recording, itr); + struct perf_evsel *evsel; + + evlist__for_each(ptr->evlist, evsel) { + if (evsel->attr.type == ptr->intel_pt_pmu->type) + return perf_evlist__disable_event(ptr->evlist, evsel); + } + return -EINVAL; +} + +static int intel_pt_snapshot_finish(struct auxtrace_record *itr) +{ + struct intel_pt_recording *ptr = + container_of(itr, struct intel_pt_recording, itr); + struct perf_evsel *evsel; + + evlist__for_each(ptr->evlist, evsel) { + if (evsel->attr.type == ptr->intel_pt_pmu->type) + return perf_evlist__enable_event(ptr->evlist, evsel); + } + return -EINVAL; +} + +static int intel_pt_alloc_snapshot_refs(struct intel_pt_recording *ptr, int idx) +{ + const size_t sz = sizeof(struct intel_pt_snapshot_ref); + int cnt = ptr->snapshot_ref_cnt, new_cnt = cnt * 2; + struct intel_pt_snapshot_ref *refs; + + if (!new_cnt) + new_cnt = 16; + + while (new_cnt <= idx) + new_cnt *= 2; + + refs = calloc(new_cnt, sz); + if (!refs) + return -ENOMEM; + + memcpy(refs, ptr->snapshot_refs, cnt * sz); + + ptr->snapshot_refs = refs; + ptr->snapshot_ref_cnt = new_cnt; + + return 0; +} + +static void intel_pt_free_snapshot_refs(struct intel_pt_recording *ptr) +{ + int i; + + for (i = 0; i < ptr->snapshot_ref_cnt; i++) + zfree(&ptr->snapshot_refs[i].ref_buf); + zfree(&ptr->snapshot_refs); +} + +static void intel_pt_recording_free(struct auxtrace_record *itr) +{ + struct intel_pt_recording *ptr = + container_of(itr, struct intel_pt_recording, itr); + + intel_pt_free_snapshot_refs(ptr); + free(ptr); +} + +static int intel_pt_alloc_snapshot_ref(struct intel_pt_recording *ptr, int idx, + size_t snapshot_buf_size) +{ + size_t ref_buf_size = ptr->snapshot_ref_buf_size; + void *ref_buf; + + ref_buf = zalloc(ref_buf_size); + if (!ref_buf) + return -ENOMEM; + + ptr->snapshot_refs[idx].ref_buf = ref_buf; + ptr->snapshot_refs[idx].ref_offset = snapshot_buf_size - ref_buf_size; + + return 0; +} + +static size_t intel_pt_snapshot_ref_buf_size(struct intel_pt_recording *ptr, + size_t snapshot_buf_size) +{ + const size_t max_size = 256 * 1024; + size_t buf_size = 0, psb_period; + + if (ptr->snapshot_size <= 64 * 1024) + return 0; + + psb_period = intel_pt_psb_period(ptr->intel_pt_pmu, ptr->evlist); + if (psb_period) + buf_size = psb_period * 2; + + if (!buf_size || buf_size > max_size) + buf_size = max_size; + + if (buf_size >= snapshot_buf_size) + return 0; + + if (buf_size >= ptr->snapshot_size / 2) + return 0; + + return 
buf_size; +} + +static int intel_pt_snapshot_init(struct intel_pt_recording *ptr, + size_t snapshot_buf_size) +{ + if (ptr->snapshot_init_done) + return 0; + + ptr->snapshot_init_done = true; + + ptr->snapshot_ref_buf_size = intel_pt_snapshot_ref_buf_size(ptr, + snapshot_buf_size); + + return 0; +} + +/** + * intel_pt_compare_buffers - compare bytes in a buffer to a circular buffer. + * @buf1: first buffer + * @compare_size: number of bytes to compare + * @buf2: second buffer (a circular buffer) + * @offs2: offset in second buffer + * @buf2_size: size of second buffer + * + * The comparison allows for the possibility that the bytes to compare in the + * circular buffer are not contiguous. It is assumed that @compare_size <= + * @buf2_size. This function returns %false if the bytes are identical, %true + * otherwise. + */ +static bool intel_pt_compare_buffers(void *buf1, size_t compare_size, + void *buf2, size_t offs2, size_t buf2_size) +{ + size_t end2 = offs2 + compare_size, part_size; + + if (end2 <= buf2_size) + return memcmp(buf1, buf2 + offs2, compare_size); + + part_size = end2 - buf2_size; + if (memcmp(buf1, buf2 + offs2, part_size)) + return true; + + compare_size -= part_size; + + return memcmp(buf1 + part_size, buf2, compare_size); +} + +static bool intel_pt_compare_ref(void *ref_buf, size_t ref_offset, + size_t ref_size, size_t buf_size, + void *data, size_t head) +{ + size_t ref_end = ref_offset + ref_size; + + if (ref_end > buf_size) { + if (head > ref_offset || head < ref_end - buf_size) + return true; + } else if (head > ref_offset && head < ref_end) { + return true; + } + + return intel_pt_compare_buffers(ref_buf, ref_size, data, ref_offset, + buf_size); +} + +static void intel_pt_copy_ref(void *ref_buf, size_t ref_size, size_t buf_size, + void *data, size_t head) +{ + if (head >= ref_size) { + memcpy(ref_buf, data + head - ref_size, ref_size); + } else { + memcpy(ref_buf, data, head); + ref_size -= head; + memcpy(ref_buf + head, data + buf_size - ref_size, ref_size); + } +} + +static bool intel_pt_wrapped(struct intel_pt_recording *ptr, int idx, + struct auxtrace_mmap *mm, unsigned char *data, + u64 head) +{ + struct intel_pt_snapshot_ref *ref = &ptr->snapshot_refs[idx]; + bool wrapped; + + wrapped = intel_pt_compare_ref(ref->ref_buf, ref->ref_offset, + ptr->snapshot_ref_buf_size, mm->len, + data, head); + + intel_pt_copy_ref(ref->ref_buf, ptr->snapshot_ref_buf_size, mm->len, + data, head); + + return wrapped; +} + +static bool intel_pt_first_wrap(u64 *data, size_t buf_size) +{ + int i, a, b; + + b = buf_size >> 3; + a = b - 512; + if (a < 0) + a = 0; + + for (i = a; i < b; i++) { + if (data[i]) + return true; + } + + return false; +} + +static int intel_pt_find_snapshot(struct auxtrace_record *itr, int idx, + struct auxtrace_mmap *mm, unsigned char *data, + u64 *head, u64 *old) +{ + struct intel_pt_recording *ptr = + container_of(itr, struct intel_pt_recording, itr); + bool wrapped; + int err; + + pr_debug3("%s: mmap index %d old head %zu new head %zu\n", + __func__, idx, (size_t)*old, (size_t)*head); + + err = intel_pt_snapshot_init(ptr, mm->len); + if (err) + goto out_err; + + if (idx >= ptr->snapshot_ref_cnt) { + err = intel_pt_alloc_snapshot_refs(ptr, idx); + if (err) + goto out_err; + } + + if (ptr->snapshot_ref_buf_size) { + if (!ptr->snapshot_refs[idx].ref_buf) { + err = intel_pt_alloc_snapshot_ref(ptr, idx, mm->len); + if (err) + goto out_err; + } + wrapped = intel_pt_wrapped(ptr, idx, mm, data, *head); + } else { + wrapped = ptr->snapshot_refs[idx].wrapped; + if 
(!wrapped && intel_pt_first_wrap((u64 *)data, mm->len)) { + ptr->snapshot_refs[idx].wrapped = true; + wrapped = true; + } + } + + /* + * In full trace mode 'head' continually increases. However in snapshot + * mode 'head' is an offset within the buffer. Here 'old' and 'head' + * are adjusted to match the full trace case which expects that 'old' is + * always less than 'head'. + */ + if (wrapped) { + *old = *head; + *head += mm->len; + } else { + if (mm->mask) + *old &= mm->mask; + else + *old %= mm->len; + if (*old > *head) + *head += mm->len; + } + + pr_debug3("%s: wrap-around %sdetected, adjusted old head %zu adjusted new head %zu\n", + __func__, wrapped ? "" : "not ", (size_t)*old, (size_t)*head); + + return 0; + +out_err: + pr_err("%s: failed, error %d\n", __func__, err); + return err; +} + +static u64 intel_pt_reference(struct auxtrace_record *itr __maybe_unused) +{ + return rdtsc(); +} + +static int intel_pt_read_finish(struct auxtrace_record *itr, int idx) +{ + struct intel_pt_recording *ptr = + container_of(itr, struct intel_pt_recording, itr); + struct perf_evsel *evsel; + + evlist__for_each(ptr->evlist, evsel) { + if (evsel->attr.type == ptr->intel_pt_pmu->type) + return perf_evlist__enable_event_idx(ptr->evlist, evsel, + idx); + } + return -EINVAL; +} + +struct auxtrace_record *intel_pt_recording_init(int *err) +{ + struct perf_pmu *intel_pt_pmu = perf_pmu__find(INTEL_PT_PMU_NAME); + struct intel_pt_recording *ptr; + + if (!intel_pt_pmu) + return NULL; + + ptr = zalloc(sizeof(struct intel_pt_recording)); + if (!ptr) { + *err = -ENOMEM; + return NULL; + } + + ptr->intel_pt_pmu = intel_pt_pmu; + ptr->itr.recording_options = intel_pt_recording_options; + ptr->itr.info_priv_size = intel_pt_info_priv_size; + ptr->itr.info_fill = intel_pt_info_fill; + ptr->itr.free = intel_pt_recording_free; + ptr->itr.snapshot_start = intel_pt_snapshot_start; + ptr->itr.snapshot_finish = intel_pt_snapshot_finish; + ptr->itr.find_snapshot = intel_pt_find_snapshot; + ptr->itr.parse_snapshot_options = intel_pt_parse_snapshot_options; + ptr->itr.reference = intel_pt_reference; + ptr->itr.read_finish = intel_pt_read_finish; + return &ptr->itr; +} diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 615ca12..c20473d 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -79,6 +79,7 @@ libperf-y += cloexec.o libperf-y += thread-stack.o libperf-$(CONFIG_AUXTRACE) += auxtrace.o libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ +libperf-$(CONFIG_AUXTRACE) += intel-pt.o libperf-y += parse-branch-options.o libperf-$(CONFIG_LIBELF) += symbol-elf.o diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c new file mode 100644 index 0000000..2a4a412 --- /dev/null +++ b/tools/perf/util/intel-pt.c @@ -0,0 +1,1911 @@ +/* + * intel_pt.c: Intel Processor Trace support + * Copyright (c) 2013-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#include +#include +#include +#include +#include + +#include "../perf.h" +#include "session.h" +#include "machine.h" +#include "tool.h" +#include "event.h" +#include "evlist.h" +#include "evsel.h" +#include "map.h" +#include "color.h" +#include "util.h" +#include "thread.h" +#include "thread-stack.h" +#include "symbol.h" +#include "callchain.h" +#include "dso.h" +#include "debug.h" +#include "auxtrace.h" +#include "tsc.h" +#include "intel-pt.h" + +#include "intel-pt-decoder/intel-pt-log.h" +#include "intel-pt-decoder/intel-pt-decoder.h" +#include "intel-pt-decoder/intel-pt-insn-decoder.h" +#include "intel-pt-decoder/intel-pt-pkt-decoder.h" + +#define MAX_TIMESTAMP (~0ULL) + +struct intel_pt { + struct auxtrace auxtrace; + struct auxtrace_queues queues; + struct auxtrace_heap heap; + u32 auxtrace_type; + struct perf_session *session; + struct machine *machine; + struct perf_evsel *switch_evsel; + struct thread *unknown_thread; + bool timeless_decoding; + bool sampling_mode; + bool snapshot_mode; + bool per_cpu_mmaps; + bool have_tsc; + bool data_queued; + bool est_tsc; + bool sync_switch; + int have_sched_switch; + u32 pmu_type; + u64 kernel_start; + u64 switch_ip; + u64 ptss_ip; + + struct perf_tsc_conversion tc; + bool cap_user_time_zero; + + struct itrace_synth_opts synth_opts; + + bool sample_instructions; + u64 instructions_sample_type; + u64 instructions_sample_period; + u64 instructions_id; + + bool sample_branches; + u32 branches_filter; + u64 branches_sample_type; + u64 branches_id; + + bool sample_transactions; + u64 transactions_sample_type; + u64 transactions_id; + + bool synth_needs_swap; + + u64 tsc_bit; + u64 noretcomp_bit; + unsigned max_non_turbo_ratio; +}; + +enum switch_state { + INTEL_PT_SS_NOT_TRACING, + INTEL_PT_SS_UNKNOWN, + INTEL_PT_SS_TRACING, + INTEL_PT_SS_EXPECTING_SWITCH_EVENT, + INTEL_PT_SS_EXPECTING_SWITCH_IP, +}; + +struct intel_pt_queue { + struct intel_pt *pt; + unsigned int queue_nr; + struct auxtrace_buffer *buffer; + void *decoder; + const struct intel_pt_state *state; + struct ip_callchain *chain; + union perf_event *event_buf; + bool on_heap; + bool stop; + bool step_through_buffers; + bool use_buffer_pid_tid; + pid_t pid, tid; + int cpu; + int switch_state; + pid_t next_tid; + struct thread *thread; + bool exclude_kernel; + bool have_sample; + u64 time; + u64 timestamp; + u32 flags; + u16 insn_len; +}; + +static void intel_pt_dump(struct intel_pt *pt __maybe_unused, + unsigned char *buf, size_t len) +{ + struct intel_pt_pkt packet; + size_t pos = 0; + int ret, pkt_len, i; + char desc[INTEL_PT_PKT_DESC_MAX]; + const char *color = PERF_COLOR_BLUE; + + color_fprintf(stdout, color, + ". ... 
Intel Processor Trace data: size %zu bytes\n", + len); + + while (len) { + ret = intel_pt_get_packet(buf, len, &packet); + if (ret > 0) + pkt_len = ret; + else + pkt_len = 1; + printf("."); + color_fprintf(stdout, color, " %08x: ", pos); + for (i = 0; i < pkt_len; i++) + color_fprintf(stdout, color, " %02x", buf[i]); + for (; i < 16; i++) + color_fprintf(stdout, color, " "); + if (ret > 0) { + ret = intel_pt_pkt_desc(&packet, desc, + INTEL_PT_PKT_DESC_MAX); + if (ret > 0) + color_fprintf(stdout, color, " %s\n", desc); + } else { + color_fprintf(stdout, color, " Bad packet!\n"); + } + pos += pkt_len; + buf += pkt_len; + len -= pkt_len; + } +} + +static void intel_pt_dump_event(struct intel_pt *pt, unsigned char *buf, + size_t len) +{ + printf(".\n"); + intel_pt_dump(pt, buf, len); +} + +static int intel_pt_do_fix_overlap(struct intel_pt *pt, struct auxtrace_buffer *a, + struct auxtrace_buffer *b) +{ + void *start; + + start = intel_pt_find_overlap(a->data, a->size, b->data, b->size, + pt->have_tsc); + if (!start) + return -EINVAL; + b->use_size = b->data + b->size - start; + b->use_data = start; + return 0; +} + +static void intel_pt_use_buffer_pid_tid(struct intel_pt_queue *ptq, + struct auxtrace_queue *queue, + struct auxtrace_buffer *buffer) +{ + if (queue->cpu == -1 && buffer->cpu != -1) + ptq->cpu = buffer->cpu; + + ptq->pid = buffer->pid; + ptq->tid = buffer->tid; + + intel_pt_log("queue %u cpu %d pid %d tid %d\n", + ptq->queue_nr, ptq->cpu, ptq->pid, ptq->tid); + + thread__zput(ptq->thread); + + if (ptq->tid != -1) { + if (ptq->pid != -1) + ptq->thread = machine__findnew_thread(ptq->pt->machine, + ptq->pid, + ptq->tid); + else + ptq->thread = machine__find_thread(ptq->pt->machine, -1, + ptq->tid); + } +} + +/* This function assumes data is processed sequentially only */ +static int intel_pt_get_trace(struct intel_pt_buffer *b, void *data) +{ + struct intel_pt_queue *ptq = data; + struct auxtrace_buffer *buffer = ptq->buffer, *old_buffer = buffer; + struct auxtrace_queue *queue; + + if (ptq->stop) { + b->len = 0; + return 0; + } + + queue = &ptq->pt->queues.queue_array[ptq->queue_nr]; + + buffer = auxtrace_buffer__next(queue, buffer); + if (!buffer) { + if (old_buffer) + auxtrace_buffer__drop_data(old_buffer); + b->len = 0; + return 0; + } + + ptq->buffer = buffer; + + if (!buffer->data) { + int fd = perf_data_file__fd(ptq->pt->session->file); + + buffer->data = auxtrace_buffer__get_data(buffer, fd); + if (!buffer->data) + return -ENOMEM; + } + + if (ptq->pt->snapshot_mode && !buffer->consecutive && old_buffer && + intel_pt_do_fix_overlap(ptq->pt, old_buffer, buffer)) + return -ENOMEM; + + if (old_buffer) + auxtrace_buffer__drop_data(old_buffer); + + if (buffer->use_data) { + b->len = buffer->use_size; + b->buf = buffer->use_data; + } else { + b->len = buffer->size; + b->buf = buffer->data; + } + b->ref_timestamp = buffer->reference; + + if (!old_buffer || ptq->pt->sampling_mode || (ptq->pt->snapshot_mode && + !buffer->consecutive)) { + b->consecutive = false; + b->trace_nr = buffer->buffer_nr + 1; + } else { + b->consecutive = true; + } + + if (ptq->use_buffer_pid_tid && (ptq->pid != buffer->pid || + ptq->tid != buffer->tid)) + intel_pt_use_buffer_pid_tid(ptq, queue, buffer); + + if (ptq->step_through_buffers) + ptq->stop = true; + + if (!b->len) + return intel_pt_get_trace(b, data); + + return 0; +} + +struct intel_pt_cache_entry { + struct auxtrace_cache_entry entry; + u64 insn_cnt; + u64 byte_cnt; + enum intel_pt_insn_op op; + enum intel_pt_insn_branch branch; + int length; + 
int32_t rel; +}; + +static int intel_pt_config_div(const char *var, const char *value, void *data) +{ + int *d = data; + long val; + + if (!strcmp(var, "intel-pt.cache-divisor")) { + val = strtol(value, NULL, 0); + if (val > 0 && val <= INT_MAX) + *d = val; + } + + return 0; +} + +static int intel_pt_cache_divisor(void) +{ + static int d; + + if (d) + return d; + + perf_config(intel_pt_config_div, &d); + + if (!d) + d = 64; + + return d; +} + +static unsigned int intel_pt_cache_size(struct dso *dso, + struct machine *machine) +{ + off_t size; + + size = dso__data_size(dso, machine); + size /= intel_pt_cache_divisor(); + if (size < 1000) + return 10; + if (size > (1 << 21)) + return 21; + return 32 - __builtin_clz(size); +} + +static struct auxtrace_cache *intel_pt_cache(struct dso *dso, + struct machine *machine) +{ + struct auxtrace_cache *c; + unsigned int bits; + + if (dso->auxtrace_cache) + return dso->auxtrace_cache; + + bits = intel_pt_cache_size(dso, machine); + + /* Ignoring cache creation failure */ + c = auxtrace_cache__new(bits, sizeof(struct intel_pt_cache_entry), 200); + + dso->auxtrace_cache = c; + + return c; +} + +static int intel_pt_cache_add(struct dso *dso, struct machine *machine, + u64 offset, u64 insn_cnt, u64 byte_cnt, + struct intel_pt_insn *intel_pt_insn) +{ + struct auxtrace_cache *c = intel_pt_cache(dso, machine); + struct intel_pt_cache_entry *e; + int err; + + if (!c) + return -ENOMEM; + + e = auxtrace_cache__alloc_entry(c); + if (!e) + return -ENOMEM; + + e->insn_cnt = insn_cnt; + e->byte_cnt = byte_cnt; + e->op = intel_pt_insn->op; + e->branch = intel_pt_insn->branch; + e->length = intel_pt_insn->length; + e->rel = intel_pt_insn->rel; + + err = auxtrace_cache__add(c, offset, &e->entry); + if (err) + auxtrace_cache__free_entry(c, e); + + return err; +} + +static struct intel_pt_cache_entry * +intel_pt_cache_lookup(struct dso *dso, struct machine *machine, u64 offset) +{ + struct auxtrace_cache *c = intel_pt_cache(dso, machine); + + if (!c) + return NULL; + + return auxtrace_cache__lookup(dso->auxtrace_cache, offset); +} + +static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn, + uint64_t *insn_cnt_ptr, uint64_t *ip, + uint64_t to_ip, uint64_t max_insn_cnt, + void *data) +{ + struct intel_pt_queue *ptq = data; + struct machine *machine = ptq->pt->machine; + struct thread *thread; + struct addr_location al; + unsigned char buf[1024]; + size_t bufsz; + ssize_t len; + int x86_64; + u8 cpumode; + u64 offset, start_offset, start_ip; + u64 insn_cnt = 0; + bool one_map = true; + + if (to_ip && *ip == to_ip) + goto out_no_cache; + + bufsz = intel_pt_insn_max_size(); + + if (*ip >= ptq->pt->kernel_start) + cpumode = PERF_RECORD_MISC_KERNEL; + else + cpumode = PERF_RECORD_MISC_USER; + + thread = ptq->thread; + if (!thread) { + if (cpumode != PERF_RECORD_MISC_KERNEL) + return -EINVAL; + thread = ptq->pt->unknown_thread; + } + + while (1) { + thread__find_addr_map(thread, cpumode, MAP__FUNCTION, *ip, &al); + if (!al.map || !al.map->dso) + return -EINVAL; + + if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR && + dso__data_status_seen(al.map->dso, + DSO_DATA_STATUS_SEEN_ITRACE)) + return -ENOENT; + + offset = al.map->map_ip(al.map, *ip); + + if (!to_ip && one_map) { + struct intel_pt_cache_entry *e; + + e = intel_pt_cache_lookup(al.map->dso, machine, offset); + if (e && + (!max_insn_cnt || e->insn_cnt <= max_insn_cnt)) { + *insn_cnt_ptr = e->insn_cnt; + *ip += e->byte_cnt; + intel_pt_insn->op = e->op; + intel_pt_insn->branch = e->branch; + 
intel_pt_insn->length = e->length; + intel_pt_insn->rel = e->rel; + intel_pt_log_insn_no_data(intel_pt_insn, *ip); + return 0; + } + } + + start_offset = offset; + start_ip = *ip; + + /* Load maps to ensure dso->is_64_bit has been updated */ + map__load(al.map, machine->symbol_filter); + + x86_64 = al.map->dso->is_64_bit; + + while (1) { + len = dso__data_read_offset(al.map->dso, machine, + offset, buf, bufsz); + if (len <= 0) + return -EINVAL; + + if (intel_pt_get_insn(buf, len, x86_64, intel_pt_insn)) + return -EINVAL; + + intel_pt_log_insn(intel_pt_insn, *ip); + + insn_cnt += 1; + + if (intel_pt_insn->branch != INTEL_PT_BR_NO_BRANCH) + goto out; + + if (max_insn_cnt && insn_cnt >= max_insn_cnt) + goto out_no_cache; + + *ip += intel_pt_insn->length; + + if (to_ip && *ip == to_ip) + goto out_no_cache; + + if (*ip >= al.map->end) + break; + + offset += intel_pt_insn->length; + } + one_map = false; + } +out: + *insn_cnt_ptr = insn_cnt; + + if (!one_map) + goto out_no_cache; + + /* + * Didn't lookup in the 'to_ip' case, so do it now to prevent duplicate + * entries. + */ + if (to_ip) { + struct intel_pt_cache_entry *e; + + e = intel_pt_cache_lookup(al.map->dso, machine, start_offset); + if (e) + return 0; + } + + /* Ignore cache errors */ + intel_pt_cache_add(al.map->dso, machine, start_offset, insn_cnt, + *ip - start_ip, intel_pt_insn); + + return 0; + +out_no_cache: + *insn_cnt_ptr = insn_cnt; + return 0; +} + +static bool intel_pt_get_config(struct intel_pt *pt, + struct perf_event_attr *attr, u64 *config) +{ + if (attr->type == pt->pmu_type) { + if (config) + *config = attr->config; + return true; + } + + return false; +} + +static bool intel_pt_exclude_kernel(struct intel_pt *pt) +{ + struct perf_evsel *evsel; + + evlist__for_each(pt->session->evlist, evsel) { + if (intel_pt_get_config(pt, &evsel->attr, NULL) && + !evsel->attr.exclude_kernel) + return false; + } + return true; +} + +static bool intel_pt_return_compression(struct intel_pt *pt) +{ + struct perf_evsel *evsel; + u64 config; + + if (!pt->noretcomp_bit) + return true; + + evlist__for_each(pt->session->evlist, evsel) { + if (intel_pt_get_config(pt, &evsel->attr, &config) && + (config & pt->noretcomp_bit)) + return false; + } + return true; +} + +static bool intel_pt_timeless_decoding(struct intel_pt *pt) +{ + struct perf_evsel *evsel; + bool timeless_decoding = true; + u64 config; + + if (!pt->tsc_bit || !pt->cap_user_time_zero) + return true; + + evlist__for_each(pt->session->evlist, evsel) { + if (!(evsel->attr.sample_type & PERF_SAMPLE_TIME)) + return true; + if (intel_pt_get_config(pt, &evsel->attr, &config)) { + if (config & pt->tsc_bit) + timeless_decoding = false; + else + return true; + } + } + return timeless_decoding; +} + +static bool intel_pt_tracing_kernel(struct intel_pt *pt) +{ + struct perf_evsel *evsel; + + evlist__for_each(pt->session->evlist, evsel) { + if (intel_pt_get_config(pt, &evsel->attr, NULL) && + !evsel->attr.exclude_kernel) + return true; + } + return false; +} + +static bool intel_pt_have_tsc(struct intel_pt *pt) +{ + struct perf_evsel *evsel; + bool have_tsc = false; + u64 config; + + if (!pt->tsc_bit) + return false; + + evlist__for_each(pt->session->evlist, evsel) { + if (intel_pt_get_config(pt, &evsel->attr, &config)) { + if (config & pt->tsc_bit) + have_tsc = true; + else + return false; + } + } + return have_tsc; +} + +static u64 intel_pt_ns_to_ticks(const struct intel_pt *pt, u64 ns) +{ + u64 quot, rem; + + quot = ns / pt->tc.time_mult; + rem = ns % pt->tc.time_mult; + return (quot << 
pt->tc.time_shift) + (rem << pt->tc.time_shift) / + pt->tc.time_mult; +} + +static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt, + unsigned int queue_nr) +{ + struct intel_pt_params params = { .get_trace = 0, }; + struct intel_pt_queue *ptq; + + ptq = zalloc(sizeof(struct intel_pt_queue)); + if (!ptq) + return NULL; + + if (pt->synth_opts.callchain) { + size_t sz = sizeof(struct ip_callchain); + + sz += pt->synth_opts.callchain_sz * sizeof(u64); + ptq->chain = zalloc(sz); + if (!ptq->chain) + goto out_free; + } + + ptq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE); + if (!ptq->event_buf) + goto out_free; + + ptq->pt = pt; + ptq->queue_nr = queue_nr; + ptq->exclude_kernel = intel_pt_exclude_kernel(pt); + ptq->pid = -1; + ptq->tid = -1; + ptq->cpu = -1; + ptq->next_tid = -1; + + params.get_trace = intel_pt_get_trace; + params.walk_insn = intel_pt_walk_next_insn; + params.data = ptq; + params.return_compression = intel_pt_return_compression(pt); + params.max_non_turbo_ratio = pt->max_non_turbo_ratio; + + if (pt->synth_opts.instructions) { + if (pt->synth_opts.period) { + switch (pt->synth_opts.period_type) { + case PERF_ITRACE_PERIOD_INSTRUCTIONS: + params.period_type = + INTEL_PT_PERIOD_INSTRUCTIONS; + params.period = pt->synth_opts.period; + break; + case PERF_ITRACE_PERIOD_TICKS: + params.period_type = INTEL_PT_PERIOD_TICKS; + params.period = pt->synth_opts.period; + break; + case PERF_ITRACE_PERIOD_NANOSECS: + params.period_type = INTEL_PT_PERIOD_TICKS; + params.period = intel_pt_ns_to_ticks(pt, + pt->synth_opts.period); + break; + default: + break; + } + } + + if (!params.period) { + params.period_type = INTEL_PT_PERIOD_INSTRUCTIONS; + params.period = 1000; + } + } + + ptq->decoder = intel_pt_decoder_new(¶ms); + if (!ptq->decoder) + goto out_free; + + return ptq; + +out_free: + zfree(&ptq->event_buf); + zfree(&ptq->chain); + free(ptq); + return NULL; +} + +static void intel_pt_free_queue(void *priv) +{ + struct intel_pt_queue *ptq = priv; + + if (!ptq) + return; + thread__zput(ptq->thread); + intel_pt_decoder_free(ptq->decoder); + zfree(&ptq->event_buf); + zfree(&ptq->chain); + free(ptq); +} + +static void intel_pt_set_pid_tid_cpu(struct intel_pt *pt, + struct auxtrace_queue *queue) +{ + struct intel_pt_queue *ptq = queue->priv; + + if (queue->tid == -1 || pt->have_sched_switch) { + ptq->tid = machine__get_current_tid(pt->machine, ptq->cpu); + thread__zput(ptq->thread); + } + + if (!ptq->thread && ptq->tid != -1) + ptq->thread = machine__find_thread(pt->machine, -1, ptq->tid); + + if (ptq->thread) { + ptq->pid = ptq->thread->pid_; + if (queue->cpu == -1) + ptq->cpu = ptq->thread->cpu; + } +} + +static void intel_pt_sample_flags(struct intel_pt_queue *ptq) +{ + if (ptq->state->flags & INTEL_PT_ABORT_TX) { + ptq->flags = PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_TX_ABORT; + } else if (ptq->state->flags & INTEL_PT_ASYNC) { + if (ptq->state->to_ip) + ptq->flags = PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_CALL | + PERF_IP_FLAG_ASYNC | + PERF_IP_FLAG_INTERRUPT; + else + ptq->flags = PERF_IP_FLAG_BRANCH | + PERF_IP_FLAG_TRACE_END; + ptq->insn_len = 0; + } else { + if (ptq->state->from_ip) + ptq->flags = intel_pt_insn_type(ptq->state->insn_op); + else + ptq->flags = PERF_IP_FLAG_BRANCH | + PERF_IP_FLAG_TRACE_BEGIN; + if (ptq->state->flags & INTEL_PT_IN_TX) + ptq->flags |= PERF_IP_FLAG_IN_TX; + ptq->insn_len = ptq->state->insn_len; + } +} + +static int intel_pt_setup_queue(struct intel_pt *pt, + struct auxtrace_queue *queue, + unsigned int queue_nr) +{ + struct intel_pt_queue *ptq = queue->priv; 
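+	/* queue->priv is this queue's decoder state; it is allocated lazily below on first use */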
+ + if (list_empty(&queue->head)) + return 0; + + if (!ptq) { + ptq = intel_pt_alloc_queue(pt, queue_nr); + if (!ptq) + return -ENOMEM; + queue->priv = ptq; + + if (queue->cpu != -1) + ptq->cpu = queue->cpu; + ptq->tid = queue->tid; + + if (pt->sampling_mode) { + if (pt->timeless_decoding) + ptq->step_through_buffers = true; + if (pt->timeless_decoding || !pt->have_sched_switch) + ptq->use_buffer_pid_tid = true; + } + } + + if (!ptq->on_heap && + (!pt->sync_switch || + ptq->switch_state != INTEL_PT_SS_EXPECTING_SWITCH_EVENT)) { + const struct intel_pt_state *state; + int ret; + + if (pt->timeless_decoding) + return 0; + + intel_pt_log("queue %u getting timestamp\n", queue_nr); + intel_pt_log("queue %u decoding cpu %d pid %d tid %d\n", + queue_nr, ptq->cpu, ptq->pid, ptq->tid); + while (1) { + state = intel_pt_decode(ptq->decoder); + if (state->err) { + if (state->err == INTEL_PT_ERR_NODATA) { + intel_pt_log("queue %u has no timestamp\n", + queue_nr); + return 0; + } + continue; + } + if (state->timestamp) + break; + } + + ptq->timestamp = state->timestamp; + intel_pt_log("queue %u timestamp 0x%" PRIx64 "\n", + queue_nr, ptq->timestamp); + ptq->state = state; + ptq->have_sample = true; + intel_pt_sample_flags(ptq); + ret = auxtrace_heap__add(&pt->heap, queue_nr, ptq->timestamp); + if (ret) + return ret; + ptq->on_heap = true; + } + + return 0; +} + +static int intel_pt_setup_queues(struct intel_pt *pt) +{ + unsigned int i; + int ret; + + for (i = 0; i < pt->queues.nr_queues; i++) { + ret = intel_pt_setup_queue(pt, &pt->queues.queue_array[i], i); + if (ret) + return ret; + } + return 0; +} + +static int intel_pt_inject_event(union perf_event *event, + struct perf_sample *sample, u64 type, + bool swapped) +{ + event->header.size = perf_event__sample_event_size(sample, type, 0); + return perf_event__synthesize_sample(event, type, 0, sample, swapped); +} + +static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) +{ + int ret; + struct intel_pt *pt = ptq->pt; + union perf_event *event = ptq->event_buf; + struct perf_sample sample = { .ip = 0, }; + + event->sample.header.type = PERF_RECORD_SAMPLE; + event->sample.header.misc = PERF_RECORD_MISC_USER; + event->sample.header.size = sizeof(struct perf_event_header); + + if (!pt->timeless_decoding) + sample.time = tsc_to_perf_time(ptq->timestamp, &pt->tc); + + sample.ip = ptq->state->from_ip; + sample.pid = ptq->pid; + sample.tid = ptq->tid; + sample.addr = ptq->state->to_ip; + sample.id = ptq->pt->branches_id; + sample.stream_id = ptq->pt->branches_id; + sample.period = 1; + sample.cpu = ptq->cpu; + sample.flags = ptq->flags; + sample.insn_len = ptq->insn_len; + + if (pt->branches_filter && !(pt->branches_filter & ptq->flags)) + return 0; + + if (pt->synth_opts.inject) { + ret = intel_pt_inject_event(event, &sample, + pt->branches_sample_type, + pt->synth_needs_swap); + if (ret) + return ret; + } + + ret = perf_session__deliver_synth_event(pt->session, event, &sample); + if (ret) + pr_err("Intel Processor Trace: failed to deliver branch event, error %d\n", + ret); + + return ret; +} + +static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq) +{ + int ret; + struct intel_pt *pt = ptq->pt; + union perf_event *event = ptq->event_buf; + struct perf_sample sample = { .ip = 0, }; + + event->sample.header.type = PERF_RECORD_SAMPLE; + event->sample.header.misc = PERF_RECORD_MISC_USER; + event->sample.header.size = sizeof(struct perf_event_header); + + if (!pt->timeless_decoding) + sample.time = tsc_to_perf_time(ptq->timestamp, 
&pt->tc); + + sample.ip = ptq->state->from_ip; + sample.pid = ptq->pid; + sample.tid = ptq->tid; + sample.addr = ptq->state->to_ip; + sample.id = ptq->pt->instructions_id; + sample.stream_id = ptq->pt->instructions_id; + sample.period = ptq->pt->instructions_sample_period; + sample.cpu = ptq->cpu; + sample.flags = ptq->flags; + sample.insn_len = ptq->insn_len; + + if (pt->synth_opts.callchain) { + thread_stack__sample(ptq->thread, ptq->chain, + pt->synth_opts.callchain_sz, sample.ip); + sample.callchain = ptq->chain; + } + + if (pt->synth_opts.inject) { + ret = intel_pt_inject_event(event, &sample, + pt->instructions_sample_type, + pt->synth_needs_swap); + if (ret) + return ret; + } + + ret = perf_session__deliver_synth_event(pt->session, event, &sample); + if (ret) + pr_err("Intel Processor Trace: failed to deliver instruction event, error %d\n", + ret); + + return ret; +} + +static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq) +{ + int ret; + struct intel_pt *pt = ptq->pt; + union perf_event *event = ptq->event_buf; + struct perf_sample sample = { .ip = 0, }; + + event->sample.header.type = PERF_RECORD_SAMPLE; + event->sample.header.misc = PERF_RECORD_MISC_USER; + event->sample.header.size = sizeof(struct perf_event_header); + + if (!pt->timeless_decoding) + sample.time = tsc_to_perf_time(ptq->timestamp, &pt->tc); + + sample.ip = ptq->state->from_ip; + sample.pid = ptq->pid; + sample.tid = ptq->tid; + sample.addr = ptq->state->to_ip; + sample.id = ptq->pt->transactions_id; + sample.stream_id = ptq->pt->transactions_id; + sample.period = 1; + sample.cpu = ptq->cpu; + sample.flags = ptq->flags; + sample.insn_len = ptq->insn_len; + + if (pt->synth_opts.callchain) { + thread_stack__sample(ptq->thread, ptq->chain, + pt->synth_opts.callchain_sz, sample.ip); + sample.callchain = ptq->chain; + } + + if (pt->synth_opts.inject) { + ret = intel_pt_inject_event(event, &sample, + pt->transactions_sample_type, + pt->synth_needs_swap); + if (ret) + return ret; + } + + ret = perf_session__deliver_synth_event(pt->session, event, &sample); + if (ret) + pr_err("Intel Processor Trace: failed to deliver transaction event, error %d\n", + ret); + + return ret; +} + +static int intel_pt_synth_error(struct intel_pt *pt, int code, int cpu, + pid_t pid, pid_t tid, u64 ip) +{ + union perf_event event; + char msg[MAX_AUXTRACE_ERROR_MSG]; + int err; + + intel_pt__strerror(code, msg, MAX_AUXTRACE_ERROR_MSG); + + auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE, + code, cpu, pid, tid, ip, msg); + + err = perf_session__deliver_synth_event(pt->session, &event, NULL); + if (err) + pr_err("Intel Processor Trace: failed to deliver error event, error %d\n", + err); + + return err; +} + +static int intel_pt_next_tid(struct intel_pt *pt, struct intel_pt_queue *ptq) +{ + struct auxtrace_queue *queue; + pid_t tid = ptq->next_tid; + int err; + + if (tid == -1) + return 0; + + intel_pt_log("switch: cpu %d tid %d\n", ptq->cpu, tid); + + err = machine__set_current_tid(pt->machine, ptq->cpu, -1, tid); + + queue = &pt->queues.queue_array[ptq->queue_nr]; + intel_pt_set_pid_tid_cpu(pt, queue); + + ptq->next_tid = -1; + + return err; +} + +static inline bool intel_pt_is_switch_ip(struct intel_pt_queue *ptq, u64 ip) +{ + struct intel_pt *pt = ptq->pt; + + return ip == pt->switch_ip && + (ptq->flags & PERF_IP_FLAG_BRANCH) && + !(ptq->flags & (PERF_IP_FLAG_CONDITIONAL | PERF_IP_FLAG_ASYNC | + PERF_IP_FLAG_INTERRUPT | PERF_IP_FLAG_TX_ABORT)); +} + +static int intel_pt_sample(struct intel_pt_queue 
*ptq) +{ + const struct intel_pt_state *state = ptq->state; + struct intel_pt *pt = ptq->pt; + int err; + + if (!ptq->have_sample) + return 0; + + ptq->have_sample = false; + + if (pt->sample_instructions && + (state->type & INTEL_PT_INSTRUCTION)) { + err = intel_pt_synth_instruction_sample(ptq); + if (err) + return err; + } + + if (pt->sample_transactions && + (state->type & INTEL_PT_TRANSACTION)) { + err = intel_pt_synth_transaction_sample(ptq); + if (err) + return err; + } + + if (!(state->type & INTEL_PT_BRANCH)) + return 0; + + if (pt->synth_opts.callchain) + thread_stack__event(ptq->thread, ptq->flags, state->from_ip, + state->to_ip, ptq->insn_len, + state->trace_nr); + else + thread_stack__set_trace_nr(ptq->thread, state->trace_nr); + + if (pt->sample_branches) { + err = intel_pt_synth_branch_sample(ptq); + if (err) + return err; + } + + if (!pt->sync_switch) + return 0; + + if (intel_pt_is_switch_ip(ptq, state->to_ip)) { + switch (ptq->switch_state) { + case INTEL_PT_SS_UNKNOWN: + case INTEL_PT_SS_EXPECTING_SWITCH_IP: + err = intel_pt_next_tid(pt, ptq); + if (err) + return err; + ptq->switch_state = INTEL_PT_SS_TRACING; + break; + default: + ptq->switch_state = INTEL_PT_SS_EXPECTING_SWITCH_EVENT; + return 1; + } + } else if (!state->to_ip) { + ptq->switch_state = INTEL_PT_SS_NOT_TRACING; + } else if (ptq->switch_state == INTEL_PT_SS_NOT_TRACING) { + ptq->switch_state = INTEL_PT_SS_UNKNOWN; + } else if (ptq->switch_state == INTEL_PT_SS_UNKNOWN && + state->to_ip == pt->ptss_ip && + (ptq->flags & PERF_IP_FLAG_CALL)) { + ptq->switch_state = INTEL_PT_SS_TRACING; + } + + return 0; +} + +static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip) +{ + struct map *map; + struct symbol *sym, *start; + u64 ip, switch_ip = 0; + + if (ptss_ip) + *ptss_ip = 0; + + map = machine__kernel_map(machine, MAP__FUNCTION); + if (!map) + return 0; + + if (map__load(map, machine->symbol_filter)) + return 0; + + start = dso__first_symbol(map->dso, MAP__FUNCTION); + + for (sym = start; sym; sym = dso__next_symbol(sym)) { + if (sym->binding == STB_GLOBAL && + !strcmp(sym->name, "__switch_to")) { + ip = map->unmap_ip(map, sym->start); + if (ip >= map->start && ip < map->end) { + switch_ip = ip; + break; + } + } + } + + if (!switch_ip || !ptss_ip) + return 0; + + for (sym = start; sym; sym = dso__next_symbol(sym)) { + if (!strcmp(sym->name, "perf_trace_sched_switch")) { + ip = map->unmap_ip(map, sym->start); + if (ip >= map->start && ip < map->end) { + *ptss_ip = ip; + break; + } + } + } + + return switch_ip; +} + +static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp) +{ + const struct intel_pt_state *state = ptq->state; + struct intel_pt *pt = ptq->pt; + int err; + + if (!pt->kernel_start) { + pt->kernel_start = machine__kernel_start(pt->machine); + if (pt->per_cpu_mmaps && pt->have_sched_switch && + !pt->timeless_decoding && intel_pt_tracing_kernel(pt) && + !pt->sampling_mode) { + pt->switch_ip = intel_pt_switch_ip(pt->machine, + &pt->ptss_ip); + if (pt->switch_ip) { + intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n", + pt->switch_ip, pt->ptss_ip); + pt->sync_switch = true; + } + } + } + + intel_pt_log("queue %u decoding cpu %d pid %d tid %d\n", + ptq->queue_nr, ptq->cpu, ptq->pid, ptq->tid); + while (1) { + err = intel_pt_sample(ptq); + if (err) + return err; + + state = intel_pt_decode(ptq->decoder); + if (state->err) { + if (state->err == INTEL_PT_ERR_NODATA) + return 1; + if (pt->sync_switch && + state->from_ip >= pt->kernel_start) { + pt->sync_switch = false; + 
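/* switch events have proven unreliable, so also complete any pending tid switch now */ +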
intel_pt_next_tid(pt, ptq); + } + if (pt->synth_opts.errors) { + err = intel_pt_synth_error(pt, state->err, + ptq->cpu, ptq->pid, + ptq->tid, + state->from_ip); + if (err) + return err; + } + continue; + } + + ptq->state = state; + ptq->have_sample = true; + intel_pt_sample_flags(ptq); + + /* Use estimated TSC upon return to user space */ + if (pt->est_tsc && + (state->from_ip >= pt->kernel_start || !state->from_ip) && + state->to_ip && state->to_ip < pt->kernel_start) { + intel_pt_log("TSC %"PRIx64" est. TSC %"PRIx64"\n", + state->timestamp, state->est_timestamp); + ptq->timestamp = state->est_timestamp; + /* Use estimated TSC in unknown switch state */ + } else if (pt->sync_switch && + ptq->switch_state == INTEL_PT_SS_UNKNOWN && + intel_pt_is_switch_ip(ptq, state->to_ip) && + ptq->next_tid == -1) { + intel_pt_log("TSC %"PRIx64" est. TSC %"PRIx64"\n", + state->timestamp, state->est_timestamp); + ptq->timestamp = state->est_timestamp; + } else if (state->timestamp > ptq->timestamp) { + ptq->timestamp = state->timestamp; + } + + if (!pt->timeless_decoding && ptq->timestamp >= *timestamp) { + *timestamp = ptq->timestamp; + return 0; + } + } + return 0; +} + +static inline int intel_pt_update_queues(struct intel_pt *pt) +{ + if (pt->queues.new_data) { + pt->queues.new_data = false; + return intel_pt_setup_queues(pt); + } + return 0; +} + +static int intel_pt_process_queues(struct intel_pt *pt, u64 timestamp) +{ + unsigned int queue_nr; + u64 ts; + int ret; + + while (1) { + struct auxtrace_queue *queue; + struct intel_pt_queue *ptq; + + if (!pt->heap.heap_cnt) + return 0; + + if (pt->heap.heap_array[0].ordinal >= timestamp) + return 0; + + queue_nr = pt->heap.heap_array[0].queue_nr; + queue = &pt->queues.queue_array[queue_nr]; + ptq = queue->priv; + + intel_pt_log("queue %u processing 0x%" PRIx64 " to 0x%" PRIx64 "\n", + queue_nr, pt->heap.heap_array[0].ordinal, + timestamp); + + auxtrace_heap__pop(&pt->heap); + + if (pt->heap.heap_cnt) { + ts = pt->heap.heap_array[0].ordinal + 1; + if (ts > timestamp) + ts = timestamp; + } else { + ts = timestamp; + } + + intel_pt_set_pid_tid_cpu(pt, queue); + + ret = intel_pt_run_decoder(ptq, &ts); + + if (ret < 0) { + auxtrace_heap__add(&pt->heap, queue_nr, ts); + return ret; + } + + if (!ret) { + ret = auxtrace_heap__add(&pt->heap, queue_nr, ts); + if (ret < 0) + return ret; + } else { + ptq->on_heap = false; + } + } + + return 0; +} + +static int intel_pt_process_timeless_queues(struct intel_pt *pt, pid_t tid, + u64 time_) +{ + struct auxtrace_queues *queues = &pt->queues; + unsigned int i; + u64 ts = 0; + + for (i = 0; i < queues->nr_queues; i++) { + struct auxtrace_queue *queue = &pt->queues.queue_array[i]; + struct intel_pt_queue *ptq = queue->priv; + + if (ptq && (tid == -1 || ptq->tid == tid)) { + ptq->time = time_; + intel_pt_set_pid_tid_cpu(pt, queue); + intel_pt_run_decoder(ptq, &ts); + } + } + return 0; +} + +static int intel_pt_lost(struct intel_pt *pt, struct perf_sample *sample) +{ + return intel_pt_synth_error(pt, INTEL_PT_ERR_LOST, sample->cpu, + sample->pid, sample->tid, 0); +} + +static struct intel_pt_queue *intel_pt_cpu_to_ptq(struct intel_pt *pt, int cpu) +{ + unsigned i, j; + + if (cpu < 0 || !pt->queues.nr_queues) + return NULL; + + if ((unsigned)cpu >= pt->queues.nr_queues) + i = pt->queues.nr_queues - 1; + else + i = cpu; + + if (pt->queues.queue_array[i].cpu == cpu) + return pt->queues.queue_array[i].priv; + + for (j = 0; i > 0; j++) { + if (pt->queues.queue_array[--i].cpu == cpu) + return pt->queues.queue_array[i].priv; + } + + 
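/* not found at or below the expected index, so scan the remaining queues forwards */ +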
for (; j < pt->queues.nr_queues; j++) { + if (pt->queues.queue_array[j].cpu == cpu) + return pt->queues.queue_array[j].priv; + } + + return NULL; +} + +static int intel_pt_process_switch(struct intel_pt *pt, + struct perf_sample *sample) +{ + struct intel_pt_queue *ptq; + struct perf_evsel *evsel; + pid_t tid; + int cpu, err; + + evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id); + if (evsel != pt->switch_evsel) + return 0; + + tid = perf_evsel__intval(evsel, sample, "next_pid"); + cpu = sample->cpu; + + intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n", + cpu, tid, sample->time, perf_time_to_tsc(sample->time, + &pt->tc)); + + if (!pt->sync_switch) + goto out; + + ptq = intel_pt_cpu_to_ptq(pt, cpu); + if (!ptq) + goto out; + + switch (ptq->switch_state) { + case INTEL_PT_SS_NOT_TRACING: + ptq->next_tid = -1; + break; + case INTEL_PT_SS_UNKNOWN: + case INTEL_PT_SS_TRACING: + ptq->next_tid = tid; + ptq->switch_state = INTEL_PT_SS_EXPECTING_SWITCH_IP; + return 0; + case INTEL_PT_SS_EXPECTING_SWITCH_EVENT: + if (!ptq->on_heap) { + ptq->timestamp = perf_time_to_tsc(sample->time, + &pt->tc); + err = auxtrace_heap__add(&pt->heap, ptq->queue_nr, + ptq->timestamp); + if (err) + return err; + ptq->on_heap = true; + } + ptq->switch_state = INTEL_PT_SS_TRACING; + break; + case INTEL_PT_SS_EXPECTING_SWITCH_IP: + ptq->next_tid = tid; + intel_pt_log("ERROR: cpu %d expecting switch ip\n", cpu); + break; + default: + break; + } +out: + return machine__set_current_tid(pt->machine, cpu, -1, tid); +} + +static int intel_pt_process_itrace_start(struct intel_pt *pt, + union perf_event *event, + struct perf_sample *sample) +{ + if (!pt->per_cpu_mmaps) + return 0; + + intel_pt_log("itrace_start: cpu %d pid %d tid %d time %"PRIu64" tsc %#"PRIx64"\n", + sample->cpu, event->itrace_start.pid, + event->itrace_start.tid, sample->time, + perf_time_to_tsc(sample->time, &pt->tc)); + + return machine__set_current_tid(pt->machine, sample->cpu, + event->itrace_start.pid, + event->itrace_start.tid); +} + +static int intel_pt_process_event(struct perf_session *session, + union perf_event *event, + struct perf_sample *sample, + struct perf_tool *tool) +{ + struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, + auxtrace); + u64 timestamp; + int err = 0; + + if (dump_trace) + return 0; + + if (!tool->ordered_events) { + pr_err("Intel Processor Trace requires ordered events\n"); + return -EINVAL; + } + + if (sample->time) + timestamp = perf_time_to_tsc(sample->time, &pt->tc); + else + timestamp = 0; + + if (timestamp || pt->timeless_decoding) { + err = intel_pt_update_queues(pt); + if (err) + return err; + } + + if (pt->timeless_decoding) { + if (event->header.type == PERF_RECORD_EXIT) { + err = intel_pt_process_timeless_queues(pt, + event->comm.tid, + sample->time); + } + } else if (timestamp) { + err = intel_pt_process_queues(pt, timestamp); + } + if (err) + return err; + + if (event->header.type == PERF_RECORD_AUX && + (event->aux.flags & PERF_AUX_FLAG_TRUNCATED) && + pt->synth_opts.errors) { + err = intel_pt_lost(pt, sample); + if (err) + return err; + } + + if (pt->switch_evsel && event->header.type == PERF_RECORD_SAMPLE) + err = intel_pt_process_switch(pt, sample); + else if (event->header.type == PERF_RECORD_ITRACE_START) + err = intel_pt_process_itrace_start(pt, event, sample); + + intel_pt_log("event %s (%u): cpu %d time %"PRIu64" tsc %#"PRIx64"\n", + perf_event__name(event->header.type), event->header.type, + sample->cpu, sample->time, timestamp); + + return err; +} 
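[Editor's aside, not part of the patch: the timestamp handling above depends on the kernel's TSC conversion parameters (time_shift, time_mult, time_zero) exported via the perf mmap page, with ns = (ticks * time_mult) >> time_shift. intel_pt_ns_to_ticks() earlier in this file, like perf_time_to_tsc()/tsc_to_perf_time() in tools/perf/util/tsc.c, splits the value into a quotient and remainder so the intermediate shift cannot overflow 64 bits. Below is a minimal standalone sketch of both directions; struct tsc_conv and the constants in main() are illustrative stand-ins rather than perf's actual struct perf_tsc_conversion, and time_zero handling is omitted:

#include <stdio.h>
#include <stdint.h>

/* illustrative stand-in for perf's struct perf_tsc_conversion */
struct tsc_conv {
	uint16_t time_shift;
	uint32_t time_mult;
};

/* ns -> TSC ticks: computes (ns << shift) / mult without overflowing 64 bits */
static uint64_t ns_to_ticks(const struct tsc_conv *tc, uint64_t ns)
{
	uint64_t quot = ns / tc->time_mult;
	uint64_t rem = ns % tc->time_mult;

	return (quot << tc->time_shift) +
	       (rem << tc->time_shift) / tc->time_mult;
}

/* TSC ticks -> ns: computes (ticks * mult) >> shift without overflowing 64 bits */
static uint64_t ticks_to_ns(const struct tsc_conv *tc, uint64_t ticks)
{
	uint64_t quot = ticks >> tc->time_shift;
	uint64_t rem = ticks & ((1ULL << tc->time_shift) - 1);

	return quot * tc->time_mult +
	       ((rem * tc->time_mult) >> tc->time_shift);
}

int main(void)
{
	/* made-up parameters approximating a 3 GHz TSC: mult = 2^31 / 3 */
	struct tsc_conv tc = { .time_shift = 31, .time_mult = 715827883 };

	/* 1 second (1e9 ns) is roughly 3e9 ticks, and converts back again */
	printf("ticks = %llu\n", (unsigned long long)ns_to_ticks(&tc, 1000000000ULL));
	printf("ns    = %llu\n", (unsigned long long)ticks_to_ns(&tc, 3000000000ULL));
	return 0;
}

The real helpers additionally apply tc->time_zero, which anchors the TSC clock to perf's timebase.]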
+ +static int intel_pt_flush(struct perf_session *session, struct perf_tool *tool) +{ + struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, + auxtrace); + int ret; + + if (dump_trace) + return 0; + + if (!tool->ordered_events) + return -EINVAL; + + ret = intel_pt_update_queues(pt); + if (ret < 0) + return ret; + + if (pt->timeless_decoding) + return intel_pt_process_timeless_queues(pt, -1, + MAX_TIMESTAMP - 1); + + return intel_pt_process_queues(pt, MAX_TIMESTAMP); +} + +static void intel_pt_free_events(struct perf_session *session) +{ + struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, + auxtrace); + struct auxtrace_queues *queues = &pt->queues; + unsigned int i; + + for (i = 0; i < queues->nr_queues; i++) { + intel_pt_free_queue(queues->queue_array[i].priv); + queues->queue_array[i].priv = NULL; + } + intel_pt_log_disable(); + auxtrace_queues__free(queues); +} + +static void intel_pt_free(struct perf_session *session) +{ + struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, + auxtrace); + + auxtrace_heap__free(&pt->heap); + intel_pt_free_events(session); + session->auxtrace = NULL; + thread__delete(pt->unknown_thread); + free(pt); +} + +static int intel_pt_process_auxtrace_event(struct perf_session *session, + union perf_event *event, + struct perf_tool *tool __maybe_unused) +{ + struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt, + auxtrace); + + if (pt->sampling_mode) + return 0; + + if (!pt->data_queued) { + struct auxtrace_buffer *buffer; + off_t data_offset; + int fd = perf_data_file__fd(session->file); + int err; + + if (perf_data_file__is_pipe(session->file)) { + data_offset = 0; + } else { + data_offset = lseek(fd, 0, SEEK_CUR); + if (data_offset == -1) + return -errno; + } + + err = auxtrace_queues__add_event(&pt->queues, session, event, + data_offset, &buffer); + if (err) + return err; + + /* Dump here now we have copied a piped trace out of the pipe */ + if (dump_trace) { + if (auxtrace_buffer__get_data(buffer, fd)) { + intel_pt_dump_event(pt, buffer->data, + buffer->size); + auxtrace_buffer__put_data(buffer); + } + } + } + + return 0; +} + +struct intel_pt_synth { + struct perf_tool dummy_tool; + struct perf_session *session; +}; + +static int intel_pt_event_synth(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct intel_pt_synth *intel_pt_synth = + container_of(tool, struct intel_pt_synth, dummy_tool); + + return perf_session__deliver_synth_event(intel_pt_synth->session, event, + NULL); +} + +static int intel_pt_synth_event(struct perf_session *session, + struct perf_event_attr *attr, u64 id) +{ + struct intel_pt_synth intel_pt_synth; + + memset(&intel_pt_synth, 0, sizeof(struct intel_pt_synth)); + intel_pt_synth.session = session; + + return perf_event__synthesize_attr(&intel_pt_synth.dummy_tool, attr, 1, + &id, intel_pt_event_synth); +} + +static int intel_pt_synth_events(struct intel_pt *pt, + struct perf_session *session) +{ + struct perf_evlist *evlist = session->evlist; + struct perf_evsel *evsel; + struct perf_event_attr attr; + bool found = false; + u64 id; + int err; + + evlist__for_each(evlist, evsel) { + if (evsel->attr.type == pt->pmu_type && evsel->ids) { + found = true; + break; + } + } + + if (!found) { + pr_debug("There are no selected events with Intel Processor Trace data\n"); + return 0; + } + + memset(&attr, 0, sizeof(struct perf_event_attr)); + attr.size = sizeof(struct 
perf_event_attr); + attr.type = PERF_TYPE_HARDWARE; + attr.sample_type = evsel->attr.sample_type & PERF_SAMPLE_MASK; + attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID | + PERF_SAMPLE_PERIOD; + if (pt->timeless_decoding) + attr.sample_type &= ~(u64)PERF_SAMPLE_TIME; + else + attr.sample_type |= PERF_SAMPLE_TIME; + if (!pt->per_cpu_mmaps) + attr.sample_type &= ~(u64)PERF_SAMPLE_CPU; + attr.exclude_user = evsel->attr.exclude_user; + attr.exclude_kernel = evsel->attr.exclude_kernel; + attr.exclude_hv = evsel->attr.exclude_hv; + attr.exclude_host = evsel->attr.exclude_host; + attr.exclude_guest = evsel->attr.exclude_guest; + attr.sample_id_all = evsel->attr.sample_id_all; + attr.read_format = evsel->attr.read_format; + + id = evsel->id[0] + 1000000000; + if (!id) + id = 1; + + if (pt->synth_opts.instructions) { + attr.config = PERF_COUNT_HW_INSTRUCTIONS; + if (pt->synth_opts.period_type == PERF_ITRACE_PERIOD_NANOSECS) + attr.sample_period = + intel_pt_ns_to_ticks(pt, pt->synth_opts.period); + else + attr.sample_period = pt->synth_opts.period; + pt->instructions_sample_period = attr.sample_period; + if (pt->synth_opts.callchain) + attr.sample_type |= PERF_SAMPLE_CALLCHAIN; + pr_debug("Synthesizing 'instructions' event with id %" PRIu64 " sample type %#" PRIx64 "\n", + id, (u64)attr.sample_type); + err = intel_pt_synth_event(session, &attr, id); + if (err) { + pr_err("%s: failed to synthesize 'instructions' event type\n", + __func__); + return err; + } + pt->sample_instructions = true; + pt->instructions_sample_type = attr.sample_type; + pt->instructions_id = id; + id += 1; + } + + if (pt->synth_opts.transactions) { + attr.config = PERF_COUNT_HW_INSTRUCTIONS; + attr.sample_period = 1; + if (pt->synth_opts.callchain) + attr.sample_type |= PERF_SAMPLE_CALLCHAIN; + pr_debug("Synthesizing 'transactions' event with id %" PRIu64 " sample type %#" PRIx64 "\n", + id, (u64)attr.sample_type); + err = intel_pt_synth_event(session, &attr, id); + if (err) { + pr_err("%s: failed to synthesize 'transactions' event type\n", + __func__); + return err; + } + pt->sample_transactions = true; + pt->transactions_id = id; + id += 1; + evlist__for_each(evlist, evsel) { + if (evsel->id && evsel->id[0] == pt->transactions_id) { + if (evsel->name) + zfree(&evsel->name); + evsel->name = strdup("transactions"); + break; + } + } + } + + if (pt->synth_opts.branches) { + attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS; + attr.sample_period = 1; + attr.sample_type |= PERF_SAMPLE_ADDR; + attr.sample_type &= ~(u64)PERF_SAMPLE_CALLCHAIN; + pr_debug("Synthesizing 'branches' event with id %" PRIu64 " sample type %#" PRIx64 "\n", + id, (u64)attr.sample_type); + err = intel_pt_synth_event(session, &attr, id); + if (err) { + pr_err("%s: failed to synthesize 'branches' event type\n", + __func__); + return err; + } + pt->sample_branches = true; + pt->branches_sample_type = attr.sample_type; + pt->branches_id = id; + } + + pt->synth_needs_swap = evsel->needs_swap; + + return 0; +} + +static struct perf_evsel *intel_pt_find_sched_switch(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + + evlist__for_each_reverse(evlist, evsel) { + const char *name = perf_evsel__name(evsel); + + if (!strcmp(name, "sched:sched_switch")) + return evsel; + } + + return NULL; +} + +static const char * const intel_pt_info_fmts[] = { + [INTEL_PT_PMU_TYPE] = " PMU Type %"PRId64"\n", + [INTEL_PT_TIME_SHIFT] = " Time Shift %"PRIu64"\n", + [INTEL_PT_TIME_MULT] = " Time Multiplier %"PRIu64"\n", + [INTEL_PT_TIME_ZERO] = " Time Zero %"PRIu64"\n", +
[INTEL_PT_CAP_USER_TIME_ZERO] = " Cap Time Zero %"PRId64"\n", + [INTEL_PT_TSC_BIT] = " TSC bit %#"PRIx64"\n", + [INTEL_PT_NORETCOMP_BIT] = " NoRETComp bit %#"PRIx64"\n", + [INTEL_PT_HAVE_SCHED_SWITCH] = " Have sched_switch %"PRId64"\n", + [INTEL_PT_SNAPSHOT_MODE] = " Snapshot mode %"PRId64"\n", + [INTEL_PT_PER_CPU_MMAPS] = " Per-cpu maps %"PRId64"\n", +}; + +static void intel_pt_print_info(u64 *arr, int start, int finish) +{ + int i; + + if (!dump_trace) + return; + + for (i = start; i <= finish; i++) + fprintf(stdout, intel_pt_info_fmts[i], arr[i]); +} + +int intel_pt_process_auxtrace_info(union perf_event *event, + struct perf_session *session) +{ + struct auxtrace_info_event *auxtrace_info = &event->auxtrace_info; + size_t min_sz = sizeof(u64) * INTEL_PT_PER_CPU_MMAPS; + struct intel_pt *pt; + int err; + + if (auxtrace_info->header.size < sizeof(struct auxtrace_info_event) + + min_sz) + return -EINVAL; + + pt = zalloc(sizeof(struct intel_pt)); + if (!pt) + return -ENOMEM; + + err = auxtrace_queues__init(&pt->queues); + if (err) + goto err_free; + + intel_pt_log_set_name(INTEL_PT_PMU_NAME); + + pt->session = session; + pt->machine = &session->machines.host; /* No kvm support */ + pt->auxtrace_type = auxtrace_info->type; + pt->pmu_type = auxtrace_info->priv[INTEL_PT_PMU_TYPE]; + pt->tc.time_shift = auxtrace_info->priv[INTEL_PT_TIME_SHIFT]; + pt->tc.time_mult = auxtrace_info->priv[INTEL_PT_TIME_MULT]; + pt->tc.time_zero = auxtrace_info->priv[INTEL_PT_TIME_ZERO]; + pt->cap_user_time_zero = auxtrace_info->priv[INTEL_PT_CAP_USER_TIME_ZERO]; + pt->tsc_bit = auxtrace_info->priv[INTEL_PT_TSC_BIT]; + pt->noretcomp_bit = auxtrace_info->priv[INTEL_PT_NORETCOMP_BIT]; + pt->have_sched_switch = auxtrace_info->priv[INTEL_PT_HAVE_SCHED_SWITCH]; + pt->snapshot_mode = auxtrace_info->priv[INTEL_PT_SNAPSHOT_MODE]; + pt->per_cpu_mmaps = auxtrace_info->priv[INTEL_PT_PER_CPU_MMAPS]; + intel_pt_print_info(&auxtrace_info->priv[0], INTEL_PT_PMU_TYPE, + INTEL_PT_PER_CPU_MMAPS); + + pt->timeless_decoding = intel_pt_timeless_decoding(pt); + pt->have_tsc = intel_pt_have_tsc(pt); + pt->sampling_mode = false; + pt->est_tsc = !pt->timeless_decoding; + + pt->unknown_thread = thread__new(999999999, 999999999); + if (!pt->unknown_thread) { + err = -ENOMEM; + goto err_free_queues; + } + err = thread__set_comm(pt->unknown_thread, "unknown", 0); + if (err) + goto err_delete_thread; + if (thread__init_map_groups(pt->unknown_thread, pt->machine)) { + err = -ENOMEM; + goto err_delete_thread; + } + + pt->auxtrace.process_event = intel_pt_process_event; + pt->auxtrace.process_auxtrace_event = intel_pt_process_auxtrace_event; + pt->auxtrace.flush_events = intel_pt_flush; + pt->auxtrace.free_events = intel_pt_free_events; + pt->auxtrace.free = intel_pt_free; + session->auxtrace = &pt->auxtrace; + + if (dump_trace) + return 0; + + if (pt->have_sched_switch == 1) { + pt->switch_evsel = intel_pt_find_sched_switch(session->evlist); + if (!pt->switch_evsel) { + pr_err("%s: missing sched_switch event\n", __func__); + goto err_delete_thread; + } + } + + if (session->itrace_synth_opts && session->itrace_synth_opts->set) { + pt->synth_opts = *session->itrace_synth_opts; + } else { + itrace_synth_opts__set_default(&pt->synth_opts); + if (use_browser != -1) { + pt->synth_opts.branches = false; + pt->synth_opts.callchain = true; + } + } + + if (pt->synth_opts.log) + intel_pt_log_enable(); + + /* Maximum non-turbo ratio is TSC freq / 100 MHz */ + if (pt->tc.time_mult) { + u64 tsc_freq = intel_pt_ns_to_ticks(pt, 1000000000); + + 
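/* adding 50000000 rounds tsc_freq to the nearest multiple of 100 MHz before dividing */ +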
pt->max_non_turbo_ratio = (tsc_freq + 50000000) / 100000000; + intel_pt_log("TSC frequency %"PRIu64"\n", tsc_freq); + intel_pt_log("Maximum non-turbo ratio %u\n", + pt->max_non_turbo_ratio); + } + + if (pt->synth_opts.calls) + pt->branches_filter |= PERF_IP_FLAG_CALL | PERF_IP_FLAG_ASYNC | + PERF_IP_FLAG_TRACE_END; + if (pt->synth_opts.returns) + pt->branches_filter |= PERF_IP_FLAG_RETURN | + PERF_IP_FLAG_TRACE_BEGIN; + + if (pt->synth_opts.callchain && !symbol_conf.use_callchain) { + symbol_conf.use_callchain = true; + if (callchain_register_param(&callchain_param) < 0) { + symbol_conf.use_callchain = false; + pt->synth_opts.callchain = false; + } + } + + err = intel_pt_synth_events(pt, session); + if (err) + goto err_delete_thread; + + err = auxtrace_queues__process_index(&pt->queues, session); + if (err) + goto err_delete_thread; + + if (pt->queues.populated) + pt->data_queued = true; + + if (pt->timeless_decoding) + pr_debug2("Intel PT decoding without timestamps\n"); + + return 0; + +err_delete_thread: + thread__delete(pt->unknown_thread); +err_free_queues: + intel_pt_log_disable(); + auxtrace_queues__free(&pt->queues); + session->auxtrace = NULL; +err_free: + free(pt); + return err; +} diff --git a/tools/perf/util/intel-pt.h b/tools/perf/util/intel-pt.h new file mode 100644 index 0000000..a1bfe93 --- /dev/null +++ b/tools/perf/util/intel-pt.h @@ -0,0 +1,51 @@ +/* + * intel_pt.h: Intel Processor Trace support + * Copyright (c) 2013-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef INCLUDE__PERF_INTEL_PT_H__ +#define INCLUDE__PERF_INTEL_PT_H__ + +#define INTEL_PT_PMU_NAME "intel_pt" + +enum { + INTEL_PT_PMU_TYPE, + INTEL_PT_TIME_SHIFT, + INTEL_PT_TIME_MULT, + INTEL_PT_TIME_ZERO, + INTEL_PT_CAP_USER_TIME_ZERO, + INTEL_PT_TSC_BIT, + INTEL_PT_NORETCOMP_BIT, + INTEL_PT_HAVE_SCHED_SWITCH, + INTEL_PT_SNAPSHOT_MODE, + INTEL_PT_PER_CPU_MMAPS, + INTEL_PT_AUXTRACE_PRIV_MAX, +}; + +#define INTEL_PT_AUXTRACE_PRIV_SIZE (INTEL_PT_AUXTRACE_PRIV_MAX * sizeof(u64)) + +struct auxtrace_record; +struct perf_tool; +union perf_event; +struct perf_session; +struct perf_event_attr; +struct perf_pmu; + +struct auxtrace_record *intel_pt_recording_init(int *err); + +int intel_pt_process_auxtrace_info(union perf_event *event, + struct perf_session *session); + +struct perf_event_attr *intel_pt_pmu_default_config(struct perf_pmu *pmu); + +#endif -- cgit v0.10.2 From 5efb1d5489520ce72232bbc28e9156f0ebddc44e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:42 +0300 Subject: perf tools: Take Intel PT into use To record an AUX area, the weak function auxtrace_record__init() must be implemented. Equally, to decode an AUX area, the AUX area tracing type must be added to the perf_event__process_auxtrace_info() function. This patch makes those two changes plus hooks up default config for the intel_pt PMU. Also some brief documentation is provided for using the tools with intel_pt. Committer note: E.g: [root@perf4 ~]# dmesg 451 [0.405807] Performance Events: PEBS fmt2+, 16-deep LBR, Broadwell events, full-width counters, Intel PMU driver.
[root@perf4 ~]# perf --version perf version 4.1.g53874a [root@perf4 ~]# perf record -e intel_pt//u -a sleep 10 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.383 MB perf.data ] [root@perf4 ~]# perf evlist intel_pt//u sched:sched_switch dummy:u [root@perf4 ~]# perf report --stdio # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 0 of event 'intel_pt//u' # Event count (approx.): 0 # # Overhead Command Shared Object Symbol # ........ ....... ............. ...... # # Samples: 393 of event 'sched:sched_switch' # Event count (approx.): 393 # # Overhead Command Shared Object Symbol # ........ .............. ................ .............. 49.62% swapper [kernel.vmlinux] [k] __schedule 10.69% rcu_sched [kernel.vmlinux] [k] __schedule 6.62% rcuos/0 [kernel.vmlinux] [k] __schedule 5.60% kworker/0:1 [kernel.vmlinux] [k] __schedule 3.56% rcuos/3 [kernel.vmlinux] [k] __schedule 3.05% kworker/u384:2 [kernel.vmlinux] [k] __schedule 2.54% kworker/2:0 [kernel.vmlinux] [k] __schedule 2.54% tuned [kernel.vmlinux] [k] __schedule # Samples: 0 of event 'dummy:u' # Event count (approx.): 0 # # Overhead Command Shared Object Symbol # ........ ....... ............. ...... # Samples: 28 of event 'instructions:u' # Event count (approx.): 5030172 # # Overhead Command Shared Object Symbol # ........ .......... ................... ................................ # 21.43% tuned libpython2.7.so.1.0 [.] PyEval_EvalFrameEx | ---PyEval_EvalFrameEx | |--83.33%-- PyEval_EvalCodeEx | PyEval_EvalFrameEx | | | |--60.00%-- PyEval_EvalCodeEx | | PyEval_EvalFrameEx | | PyEval_EvalFrameEx | | | --40.00%-- PyEval_EvalFrameEx | --16.67%-- PyEval_EvalFrameEx PyEval_EvalCodeEx PyEval_EvalFrameEx PyEval_EvalCodeEx PyEval_EvalFrameEx PyEval_EvalFrameEx 14.29% tuned libpython2.7.so.1.0 [.] _PyType_Lookup | ---_PyType_Lookup _PyObject_GenericGetAttrWithDict PyEval_EvalFrameEx PyEval_EvalCodeEx PyEval_EvalFrameEx PyEval_EvalCodeEx PyEval_EvalFrameEx | |--75.00%-- PyEval_EvalFrameEx | --25.00%-- PyEval_EvalCodeEx PyEval_EvalFrameEx PyEval_EvalFrameEx 3.57% irqbalance irqbalance [.] 0x0000000000004038 | ---0x4038 0x4761 0x4761 0x4761 0x49f1 0x2295 3.57% irqbalance libc-2.17.so [.] __GI_____strtoull_l_internal | ---__GI_____strtoull_l_internal 0x6f49 0x229a 3.57% irqbalance libc-2.17.so [.] __strchrnul | ---__strchrnul vfprintf __vsprintf_chk __sprintf_chk 0x2724 0x4038 0x2331 3.57% irqbalance libc-2.17.so [.] 
__strstr_sse42 | ---__strstr_sse42 0x71e0 0x229f # And now to some userspace ftrace on uninstrumented binaries 8-) : # Hand edited to make it a bit more compact, replacing /home/acme/bin/perf # with /bin/perf: [root@perf4 ~]# perf script perf 8921 [3] 7.310889: 1 branches:u: 0 [unknown] ([unknown]) => 7fcecadbf257 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310889: 1 branches:u: 7fcecadbf25f __GI___ioctl (/usr/lib64/libc-2.17.so) => 481689 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310889: 1 branches:u: 481694 perf_evlist__enable (/bin/perf) => 481614 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310889: 1 branches:u: 481630 perf_evlist__enable (/bin/perf) => 4816d8 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310889: 1 branches:u: 4816de perf_evlist__enable (/bin/perf) => 48164f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310889: 1 branches:u: 481652 perf_evlist__enable (/bin/perf) => 48165f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310889: 1 branches:u: 481684 perf_evlist__enable (/bin/perf) => 41d250 ioctl@plt (/bin/perf) perf 8921 [3] 7.310889: 1 branches:u: 41d250 ioctl@plt (/bin/perf) => 7fcecadbf250 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310889: 1 branches:u: 7fcecadbf255 __GI___ioctl (/usr/lib64/libc-2.17.so) => 0 [unknown] ([unknown]) perf 8921 [3] 7.310890: 1 branches:u: 0 [unknown] ([unknown]) => 7fcecadbf257 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310890: 1 branches:u: 7fcecadbf25f __GI___ioctl (/usr/lib64/libc-2.17.so) => 481689 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310890: 1 branches:u: 481694 perf_evlist__enable (/bin/perf) => 481614 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310890: 1 branches:u: 481652 perf_evlist__enable (/bin/perf) => 48165f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310890: 1 branches:u: 481684 perf_evlist__enable (/bin/perf) => 41d250 ioctl@plt (/bin/perf) perf 8921 [3] 7.310890: 1 branches:u: 41d250 ioctl@plt (/bin/perf) => 7fcecadbf250 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310890: 1 branches:u: 7fcecadbf255 __GI___ioctl (/usr/lib64/libc-2.17.so) => 0 [unknown] ([unknown]) perf 8921 [3] 7.310893: 1 branches:u: 0 [unknown] ([unknown]) => 7fcecadbf257 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310893: 1 branches:u: 7fcecadbf25f __GI___ioctl (/usr/lib64/libc-2.17.so) => 481689 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310893: 1 branches:u: 4816a8 perf_evlist__enable (/bin/perf) => 4815f8 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310893: 1 branches:u: 4815fe perf_evlist__enable (/bin/perf) => 481614 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310893: 1 branches:u: 481652 perf_evlist__enable (/bin/perf) => 48165f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310893: 1 branches:u: 481684 perf_evlist__enable (/bin/perf) => 41d250 ioctl@plt (/bin/perf) perf 8921 [3] 7.310893: 1 branches:u: 41d250 ioctl@plt (/bin/perf) => 7fcecadbf250 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310893: 1 branches:u: 7fcecadbf255 __GI___ioctl (/usr/lib64/libc-2.17.so) => 0 [unknown] ([unknown]) perf 8921 [3] 7.310956: 1 branches:u: 0 [unknown] ([unknown]) => 7fcecadbf257 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310956: 1 branches:u: 7fcecadbf25f __GI___ioctl (/usr/lib64/libc-2.17.so) => 481689 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310956: 1 branches:u: 481694 perf_evlist__enable (/bin/perf) => 481614 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310956: 1 branches:u: 481630 perf_evlist__enable (/bin/perf) => 4816d8 
perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310956: 1 branches:u: 4816de perf_evlist__enable (/bin/perf) => 48164f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310956: 1 branches:u: 481652 perf_evlist__enable (/bin/perf) => 48165f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310956: 1 branches:u: 481684 perf_evlist__enable (/bin/perf) => 41d250 ioctl@plt (/bin/perf) perf 8921 [3] 7.310956: 1 branches:u: 41d250 ioctl@plt (/bin/perf) => 7fcecadbf250 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310956: 1 branches:u: 7fcecadbf255 __GI___ioctl (/usr/lib64/libc-2.17.so) => 0 [unknown] ([unknown]) perf 8921 [3] 7.310961: 1 branches:u: 0 [unknown] ([unknown]) => 7fcecadbf257 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310961: 1 branches:u: 7fcecadbf25f __GI___ioctl (/usr/lib64/libc-2.17.so) => 481689 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310961: 1 branches:u: 481694 perf_evlist__enable (/bin/perf) => 481614 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310961: 1 branches:u: 481652 perf_evlist__enable (/bin/perf) => 48165f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310961: 1 branches:u: 481684 perf_evlist__enable (/bin/perf) => 41d250 ioctl@plt (/bin/perf) perf 8921 [3] 7.310961: 1 branches:u: 41d250 ioctl@plt (/bin/perf) => 7fcecadbf250 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310961: 1 branches:u: 7fcecadbf255 __GI___ioctl (/usr/lib64/libc-2.17.so) => 0 [unknown] ([unknown]) perf 8921 [3] 7.310968: 1 branches:u: 0 [unknown] ([unknown]) => 7fcecadbf257 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310968: 1 branches:u: 7fcecadbf25f __GI___ioctl (/usr/lib64/libc-2.17.so) => 481689 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310968: 1 branches:u: 4816a8 perf_evlist__enable (/bin/perf) => 4815f8 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310968: 1 branches:u: 4815fe perf_evlist__enable (/bin/perf) => 481614 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310968: 1 branches:u: 481652 perf_evlist__enable (/bin/perf) => 48165f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.310968: 1 branches:u: 481684 perf_evlist__enable (/bin/perf) => 41d250 ioctl@plt (/bin/perf) perf 8921 [3] 7.310968: 1 branches:u: 41d250 ioctl@plt (/bin/perf) => 7fcecadbf250 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.310968: 1 branches:u: 7fcecadbf255 __GI___ioctl (/usr/lib64/libc-2.17.so) => 0 [unknown] ([unknown]) perf 8921 [3] 7.311040: 1 branches:u: 0 [unknown] ([unknown]) => 7fcecadbf257 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.311040: 1 branches:u: 7fcecadbf25f __GI___ioctl (/usr/lib64/libc-2.17.so) => 481689 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.311040: 1 branches:u: 481694 perf_evlist__enable (/bin/perf) => 481614 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.311040: 1 branches:u: 481630 perf_evlist__enable (/bin/perf) => 4816d8 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.311040: 1 branches:u: 4816de perf_evlist__enable (/bin/perf) => 48164f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.311040: 1 branches:u: 481652 perf_evlist__enable (/bin/perf) => 48165f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.311040: 1 branches:u: 481684 perf_evlist__enable (/bin/perf) => 41d250 ioctl@plt (/bin/perf) perf 8921 [3] 7.311040: 1 branches:u: 41d250 ioctl@plt (/bin/perf) => 7fcecadbf250 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.311040: 1 branches:u: 7fcecadbf255 __GI___ioctl (/usr/lib64/libc-2.17.so) => 0 [unknown] ([unknown]) perf 8921 [3] 7.311046: 1 branches:u: 0 [unknown] ([unknown]) => 
7fcecadbf257 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.311046: 1 branches:u: 7fcecadbf25f __GI___ioctl (/usr/lib64/libc-2.17.so) => 481689 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.311046: 1 branches:u: 481694 perf_evlist__enable (/bin/perf) => 481614 perf_evlist__enable (/bin/perf) perf 8921 [3] 7.311046: 1 branches:u: 481652 perf_evlist__enable (/bin/perf) => 48165f perf_evlist__enable (/bin/perf) perf 8921 [3] 7.311046: 1 branches:u: 481684 perf_evlist__enable (/bin/perf) => 41d250 ioctl@plt (/bin/perf) perf 8921 [3] 7.311046: 1 branches:u: 41d250 ioctl@plt (/bin/perf) => 7fcecadbf250 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.311046: 1 branches:u: 7fcecadbf255 __GI___ioctl (/usr/lib64/libc-2.17.so) => 0 [unknown] ([unknown]) perf 8921 [3] 7.311050: 1 branches:u: 0 [unknown] ([unknown]) => 7fcecadbf257 __GI___ioctl (/usr/lib64/libc-2.17.so) perf 8921 [3] 7.311050: 1 branches:u: 7fcecadbf25f __GI___ioctl (/usr/lib64/libc-2.17.so) => 481689 perf_evlist__enable (/bin/perf) : Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-8-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt new file mode 100644 index 0000000..2866b62 --- /dev/null +++ b/tools/perf/Documentation/intel-pt.txt @@ -0,0 +1,588 @@ +Intel Processor Trace +===================== + +Overview +======== + +Intel Processor Trace (Intel PT) is an extension of Intel Architecture that +collects information about software execution such as control flow, execution +modes and timings and formats it into highly compressed binary packets. +Technical details are documented in the Intel 64 and IA-32 Architectures +Software Developer Manuals, Chapter 36 Intel Processor Trace. + +Intel PT is first supported in Intel Core M and 5th generation Intel Core +processors that are based on the Intel micro-architecture code name Broadwell. + +Trace data is collected by 'perf record' and stored within the perf.data file. +See below for options to 'perf record'. + +Trace data must be 'decoded' which involves walking the object code and matching +the trace data packets. For example a TNT packet only tells whether a +conditional branch was taken or not taken, so to make use of that packet the +decoder must know precisely which instruction was being executed. + +Decoding is done on-the-fly. The decoder outputs samples in the same format as +samples output by perf hardware events, for example as though the "instructions" +or "branches" events had been recorded. Presently 3 tools support this: +'perf script', 'perf report' and 'perf inject'. See below for more information +on using those tools. + +The main distinguishing feature of Intel PT is that the decoder can determine +the exact flow of software execution. Intel PT can be used to understand why +and how software got to a certain point, or behaved a certain way. The +software does not have to be recompiled, so Intel PT works with debug or release +builds; however, the executed images are needed - which makes use in JIT-compiled +environments, or with self-modified code, a challenge. Also symbols need to be +provided to make sense of addresses. + +A limitation of Intel PT is that it produces huge amounts of trace data +(hundreds of megabytes per second per core) which takes a long time to decode, +for example two or three orders of magnitude longer than it took to collect.
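+ +As a rough illustration using those rates (an estimate, not a measurement): +tracing 4 cores for 10 seconds at ~100 MB per second per core yields ~4 GB of +trace data, and a decode time two to three orders of magnitude longer than the +10 seconds of collection works out to roughly 20 minutes to 3 hours.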
+Another limitation is the performance impact of tracing, something that will +vary depending on the use-case and architecture. + + +Quickstart +========== + +It is important to start small. That is because it is easy to capture vastly +more data than can possibly be processed. + +The simplest thing to do with Intel PT is userspace profiling of small programs. +Data is captured with 'perf record' e.g. to trace 'ls' userspace-only: + + perf record -e intel_pt//u ls + +And profiled with 'perf report' e.g. + + perf report + +To also trace kernel space presents a problem, namely kernel self-modifying +code. A fairly good kernel image is available in /proc/kcore but to get an +accurate image a copy of /proc/kcore needs to be made under the same conditions +as the data capture. A script perf-with-kcore can do that, but beware that the +script makes use of 'sudo' to copy /proc/kcore. If you have perf installed +locally from the source tree you can do: + + ~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls + +which will create a directory named 'pt_ls' and put the perf.data file and +copies of /proc/kcore, /proc/kallsyms and /proc/modules into it. Then to use +'perf report' becomes: + + ~/libexec/perf-core/perf-with-kcore report pt_ls + +Because samples are synthesized after-the-fact, the sampling period can be +selected for reporting. e.g. sample every microsecond + + ~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge + +See the sections below for more information about the --itrace option. + +Beware the smaller the period, the more samples that are produced, and the +longer it takes to process them. + +Also note that the coarseness of Intel PT timing information will start to +distort the statistical value of the sampling as the sampling period becomes +smaller. + +To represent software control flow, "branches" samples are produced. By default +a branch sample is synthesized for every single branch. To get an idea what +data is available you can use the 'perf script' tool with no parameters, which +will list all the samples. + + perf record -e intel_pt//u ls + perf script + +An interesting field that is not printed by default is 'flags' which can be +displayed as follows: + + perf script -Fcomm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr,symoff,flags + +The flags are "bcrosyiABEx" which stand for branch, call, return, conditional, +system, asynchronous, interrupt, transaction abort, trace begin, trace end, and +in transaction, respectively. + +While it is possible to create scripts to analyze the data, an alternative +approach is available to export the data to a postgresql database. Refer to +script export-to-postgresql.py for more details, and to script +call-graph-from-postgresql.py for an example of using the database. + +As mentioned above, it is easy to capture too much data. One way to limit the +data captured is to use 'snapshot' mode which is explained further below. +Refer to 'new snapshot option' and 'Intel PT modes of operation' further below. + +Another problem that will be experienced is decoder errors. They can be caused +by inability to access the executed image, self-modified or JIT-ed code, or the +inability to match side-band information (such as context switches and mmaps) +which results in the decoder not knowing what code was executed. + +There is also the problem of perf not being able to copy the data fast enough, +resulting in data lost because the buffer was full. See 'Buffer handling' below +for more details. 
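+ +A quick way to check for such lost data (a sketch; the exact dump format can +vary between perf versions) is to search the raw dump for PERF_RECORD_AUX +records, whose flags indicate whether truncation occurred: + + perf script -D | grep PERF_RECORD_AUX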
+ + +perf record +=========== + +new event +--------- + +The Intel PT kernel driver creates a new PMU for Intel PT. PMU events are +selected by providing the PMU name followed by the "config" separated by slashes. +An enhancement has been made to allow default "config" e.g. the option + + -e intel_pt// + +will use a default config value. Currently that is the same as + + -e intel_pt/tsc,noretcomp=0/ + +which is the same as + + -e intel_pt/tsc=1,noretcomp=0/ + +The config terms are listed in /sys/devices/intel_pt/format. They are bit +fields within the config member of the struct perf_event_attr which is +passed to the kernel by the perf_event_open system call. They correspond to bit +fields in the IA32_RTIT_CTL MSR. Here is a list of them and their definitions: + + $ for f in `ls /sys/devices/intel_pt/format`;do + > echo $f + > cat /sys/devices/intel_pt/format/$f + > done + noretcomp + config:11 + tsc + config:10 + +Note that the default config must be overridden for each term i.e. + + -e intel_pt/noretcomp=0/ + +is the same as: + + -e intel_pt/tsc=1,noretcomp=0/ + +So, to disable TSC packets use: + + -e intel_pt/tsc=0/ + +It is also possible to specify the config value explicitly: + + -e intel_pt/config=0x400/ + +Note that, as with all events, the event is suffixed with event modifiers: + + u userspace + k kernel + h hypervisor + G guest + H host + p precise ip + +'h', 'G' and 'H' are for virtualization which is not supported by Intel PT. +'p' is also not relevant to Intel PT. So only options 'u' and 'k' are +meaningful for Intel PT. + +perf_event_attr is displayed if the -vv option is used e.g. + + ------------------------------------------------------------ + perf_event_attr: + type 6 + size 112 + config 0x400 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + enable_on_exec 1 + sample_id_all 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + + +new snapshot option +------------------- + +To select snapshot mode a new option has been added: + + -S + +Optionally it can be followed by the snapshot size e.g. + + -S0x100000 + +The default snapshot size is the auxtrace mmap size. If neither auxtrace mmap size +nor snapshot size is specified, then the default is 4MiB for privileged users +(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. +If an unprivileged user does not specify mmap pages, the mmap pages will be +reduced as described in the 'new auxtrace mmap size option' section below. + +The snapshot size is displayed if the option -vv is used e.g. + + Intel PT snapshot size: %zu + + +new auxtrace mmap size option +--------------------------- + +Intel PT buffer size is specified by an addition to the -m option e.g. + + -m,16 + +selects a buffer size of 16 pages i.e. 64KiB. + +Note that the existing functionality of -m is unchanged. The auxtrace mmap size +is specified by the optional addition of a comma and the value. + +The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users +(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. 
+If an unprivileged user does not specify mmap pages, the mmap pages will be +reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the +user is likely to get an error as they exceed their mlock limit (Max locked +memory as shown in /proc/self/limits). Note that perf does not count the first +512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu +against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus +their mlock limit (which defaults to 64KiB but is not multiplied by the number +of cpus). + +In full-trace mode, powers of two are allowed for buffer size, with a minimum +size of 2 pages. In snapshot mode, it is the same but the minimum size is +1 page. + +The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g. + + mmap length 528384 + auxtrace mmap length 4198400 + + +Intel PT modes of operation +--------------------------- + +Intel PT can be used in 2 modes: + full-trace mode + snapshot mode + +Full-trace mode traces continuously e.g. + + perf record -e intel_pt//u uname + +Snapshot mode captures the available data when a signal is sent e.g. + + perf record -v -e intel_pt//u -S ./loopy 1000000000 & + [1] 11435 + kill -USR2 11435 + Recording AUX area tracing snapshot + +Note that the signal sent is SIGUSR2. +Note that "Recording AUX area tracing snapshot" is displayed because the -v +option is used. + +The 2 modes cannot be used together. + + +Buffer handling +--------------- + +There may be buffer limitations (i.e. single ToPa entry) which means that actual +buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to +provide other sizes, and in particular an arbitrarily large size, multiple +buffers are logically concatenated. However an interrupt must be used to switch +between buffers. That has two potential problems: + a) the interrupt may not be handled in time so that the current buffer + becomes full and some trace data is lost. + b) the interrupts may slow the system and affect the performance + results. + +If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event +which the tools report as an error. + +In full-trace mode, the driver waits for data to be copied out before allowing +the (logical) buffer to wrap-around. If data is not copied out quickly enough, +again 'truncated' is set in the PERF_RECORD_AUX event. If the driver has to +wait, the intel_pt event gets disabled. Because it is difficult to know when +that happens, perf tools always re-enable the intel_pt event after copying out +data. + + +Intel PT and build ids +---------------------- + +By default "perf record" post-processes the event stream to find all build ids +for executables for all addresses sampled. Deliberately, Intel PT is not +decoded for that purpose (it would take too long). Instead the build ids for +all executables encountered (due to mmap, comm or task events) are included +in the perf.data file. + +To see buildids included in the perf.data file use the command: + + perf buildid-list + +If the perf.data file contains Intel PT data, that is the same as: + + perf buildid-list --with-hits + + +Snapshot mode and event disabling +--------------------------------- + +In order to make a snapshot, the intel_pt event is disabled using an IOCTL, +namely PERF_EVENT_IOC_DISABLE. However doing that can also disable the +collection of side-band information. 
In order to prevent that, a dummy +software event has been introduced that permits tracking events (like mmaps) to +continue to be recorded while intel_pt is disabled. That is important to ensure +there is complete side-band information to allow the decoding of subsequent +snapshots. + +A test has been created for that. To find the test: + + perf test list + ... + 23: Test using a dummy software event to keep tracking + +To run the test: + + perf test 23 + 23: Test using a dummy software event to keep tracking : Ok + + +perf record modes (nothing new here) +------------------------------------ + +perf record essentially operates in one of three modes: + per thread + per cpu + workload only + +"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a +workload). +"per cpu" is selected by -C or -a. +"workload only" mode is selected by not using the other options but providing a +command to run (i.e. the workload). + +In per-thread mode an exact list of threads is traced. There is no inheritance. +Each thread has its own event buffer. + +In per-cpu mode all processes (or processes from the selected cgroup i.e. -G +option, or processes selected with -p or -u) are traced. Each cpu has its own +buffer. Inheritance is allowed. + +In workload-only mode, the workload is traced but with per-cpu buffers. +Inheritance is allowed. Note that you can now trace a workload in per-thread +mode by using the --per-thread option. + + +Privileged vs non-privileged users +---------------------------------- + +Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users +have memory limits imposed upon them. That affects what buffer sizes they can +have as outlined above. + +Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are +not permitted to use tracepoints which means there is insufficient side-band +information to decode Intel PT in per-cpu mode, and potentially workload-only +mode too if the workload creates new processes. + +Note also, that to use tracepoints, read-access to debugfs is required. So if +debugfs is not mounted or the user does not have read-access, it will again not +be possible to decode Intel PT in per-cpu mode. + + +sched_switch tracepoint +----------------------- + +The sched_switch tracepoint is used to provide side-band data for Intel PT +decoding. sched_switch events are automatically added. e.g. 
the second event shown below + + $ perf record -vv -e intel_pt//u uname + ------------------------------------------------------------ + perf_event_attr: + type 6 + size 112 + config 0x400 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + enable_on_exec 1 + sample_id_all 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + perf_event_attr: + type 2 + size 112 + config 0x108 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER + read_format ID + inherit 1 + sample_id_all 1 + exclude_guest 1 + ------------------------------------------------------------ + sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + perf_event_attr: + type 1 + size 112 + config 0x9 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + mmap 1 + comm 1 + enable_on_exec 1 + task 1 + sample_id_all 1 + mmap2 1 + comm_exec 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + mmap size 528384B + AUX area mmap length 4194304 + perf event ring buffer mmapped per cpu + Synthesizing auxtrace information + Linux + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.042 MB perf.data ] + +Note, the sched_switch event is only added if the user is permitted to use it +and only in per-cpu mode. + +Note also, the sched_switch event is only added if TSC packets are requested. +That is because, in the absence of timing information, the sched_switch events +cannot be matched against the Intel PT trace. + + +perf script +=========== + +By default, perf script will decode trace data found in the perf.data file. +This can be further controlled by new option --itrace. + + +New --itrace option +------------------- + +Having no option is the same as + + --itrace + +which, in turn, is the same as + + --itrace=ibxe + +The letters are: + + i synthesize "instructions" events + b synthesize "branches" events + x synthesize "transactions" events + c synthesize branches events (calls only) + r synthesize branches events (returns only) + e synthesize tracing error events + d create a debug log + g synthesize a call chain (use with i or x) + +"Instructions" events look like they were recorded by "perf record -e +instructions". + +"Branches" events look like they were recorded by "perf record -e branches". "c" +and "r" can be combined to get calls and returns. + +"Transactions" events correspond to the start or end of transactions. The +'flags' field can be used in perf script to determine whether the event is a +transaction start, commit or abort. + +Error events are new.
They show where the decoder lost the trace. Error events +are quite important. Users must know if what they are seeing is a complete +picture or not. + +The "d" option will cause the creation of a file "intel_pt.log" containing all +decoded packets and instructions. Note that this option slows down the decoder +and that the resulting file may be very large. + +In addition, the period of the "instructions" event can be specified. e.g. + + --itrace=i10us + +sets the period to 10us i.e. one instruction sample is synthesized for each 10 +microseconds of trace. Alternatives to "us" are "ms" (milliseconds), +"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions). + +"ms", "us" and "ns" are converted to TSC ticks. + +The timing information included with Intel PT does not give the time of every +instruction. Consequently, for the purpose of sampling, the decoder estimates +the time since the last timing packet based on 1 tick per instruction. The time +on the sample is *not* adjusted and reflects the last known value of TSC. + +For Intel PT, the default period is 100us. + +Also the call chain size (default 16, max. 1024) for instructions or +transactions events can be specified. e.g. + + --itrace=ig32 + --itrace=xg32 + +To disable trace decoding entirely, use the option --no-itrace. + + +dump option +----------- + +perf script has an option (-D) to "dump" the events i.e. display the binary +data. + +When -D is used, Intel PT packets are displayed. The packet decoder does not +pay attention to PSB packets, but just decodes the bytes - so the packets seen +by the actual decoder may not be identical in places where the data is corrupt. +One example of that would be when the buffer-switching interrupt has been too +slow, and the buffer has been filled completely. In that case, the last packet +in the buffer might be truncated and immediately followed by a PSB as the trace +continues in the next buffer. + +To disable the display of Intel PT packets, combine the -D option with +--no-itrace. + + +perf report +=========== + +By default, perf report will decode trace data found in the perf.data file. +This can be further controlled by new option --itrace exactly the same as +perf script, with the exception that the default is --itrace=igxe. + + +perf inject +=========== + +perf inject also accepts the --itrace option in which case tracing data is +removed and replaced with the synthesized events. e.g. + + perf inject --itrace -i perf.data -o perf.data.new diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index 1396088..a8be9f9 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -1,5 +1,6 @@ libperf-y += header.o libperf-y += tsc.o +libperf-y += pmu.o libperf-y += kvm-stat.o libperf-$(CONFIG_DWARF) += dwarf-regs.o @@ -7,4 +8,5 @@ libperf-$(CONFIG_DWARF) += dwarf-regs.o libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o +libperf-$(CONFIG_AUXTRACE) += auxtrace.o libperf-$(CONFIG_AUXTRACE) += intel-pt.o diff --git a/tools/perf/arch/x86/util/auxtrace.c b/tools/perf/arch/x86/util/auxtrace.c new file mode 100644 index 0000000..e7654b5 --- /dev/null +++ b/tools/perf/arch/x86/util/auxtrace.c @@ -0,0 +1,38 @@ +/* + * auxtrace.c: AUX area tracing support + * Copyright (c) 2013-2014, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include "../../util/header.h" +#include "../../util/auxtrace.h" +#include "../../util/intel-pt.h" + +struct auxtrace_record *auxtrace_record__init(struct perf_evlist *evlist __maybe_unused, + int *err) +{ + char buffer[64]; + int ret; + + *err = 0; + + ret = get_cpuid(buffer, sizeof(buffer)); + if (ret) { + *err = ret; + return NULL; + } + + if (!strncmp(buffer, "GenuineIntel,", 13)) + return intel_pt_recording_init(err); + + return NULL; +} diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c new file mode 100644 index 0000000..fd11cc3 --- /dev/null +++ b/tools/perf/arch/x86/util/pmu.c @@ -0,0 +1,15 @@ +#include + +#include + +#include "../../util/intel-pt.h" +#include "../../util/pmu.h" + +struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu __maybe_unused) +{ +#ifdef HAVE_AUXTRACE_SUPPORT + if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) + return intel_pt_pmu_default_config(pmu); +#endif + return NULL; +} diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 49dbfbe..0f0b7e1 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -47,6 +47,8 @@ #include "debug.h" #include "parse-options.h" +#include "intel-pt.h" + int auxtrace_mmap__mmap(struct auxtrace_mmap *mm, struct auxtrace_mmap_params *mp, void *userpg, int fd) @@ -876,7 +878,7 @@ static bool auxtrace__dont_decode(struct perf_session *session) int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused, union perf_event *event, - struct perf_session *session __maybe_unused) + struct perf_session *session) { enum auxtrace_type type = event->auxtrace_info.type; @@ -885,6 +887,7 @@ int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused, switch (type) { case PERF_AUXTRACE_INTEL_PT: + return intel_pt_process_auxtrace_info(event, session); case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 84cad05..3c71138 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -462,8 +462,8 @@ static struct perf_pmu *pmu_lookup(const char *name) LIST_HEAD(aliases); __u32 type; - /* No support for intel_bts or intel_pt so disallow them */ - if (!strcmp(name, "intel_bts") || !strcmp(name, "intel_pt")) + /* No support for intel_bts so disallow it */ + if (!strcmp(name, "intel_bts")) return NULL; /* -- cgit v0.10.2 From 82819ffb42fb45197bacf3223191deca31d3eb91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 21 Aug 2015 08:14:46 +0200 Subject: perf/x86/msr: Fix the MSR driver build The new MSR PMU driver made use of rdtsc() which does not exist (yet) in this tree: arch/x86/kernel/cpu/perf_event_msr.c:91:3: error: implicit declaration of function 'rdtsc' Use the old rdtscll() primitive for now. 
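An illustrative aside, not part of the patch: the two primitives differ only in calling convention, which is why the substitution below is mechanical. Assuming the old-style macro semantics:

	u64 now;

	rdtscll(now);	/* old macro: stores the TSC value into its argument */
	now = rdtsc();	/* new helper, not yet in this tree: returns the value */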
Reported-by: kbuild test robot Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c index b0dd2e8..086b12e 100644 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ b/arch/x86/kernel/cpu/perf_event_msr.c @@ -145,7 +145,7 @@ static inline u64 msr_read_counter(struct perf_event *event) if (event->hw.event_base) rdmsrl(event->hw.event_base, now); else - now = rdtsc(); + rdtscll(now); return now; } -- cgit v0.10.2 From 05169df5561363ff04ac04d6aad0be3b45c26ac1 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 20 Aug 2015 11:26:45 +0300 Subject: perf script: Fix segfault using --show-mmap-events Patch "perf script: Don't assume evsel position of tracking events" changed 'perf script' to use 'perf_evlist__id2evsel()'. That results in a segfault if there is more than 1 event and there are synthesized mmap events e.g. $ perf record -e cycles,instructions -p$$ sleep 1 $ perf script --show-mmap-events Segmentation fault (core dumped) That happens because these synthesized events have an 'id' of zero which does not match any 'evsel'. Currently, these synthesized events use the sample type of the first evsel. Change 'perf_evlist__id2evsel()' to reflect that which also makes it consistent with 'perf_evlist__event2evsel()'. Signed-off-by: Adrian Hunter Fixes: 06b234ec26fd ("perf script: Don't assume evsel position of tracking events") Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1440059205-1765-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 373f65b..e9a5d43 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -573,7 +573,7 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id) { struct perf_sample_id *sid; - if (evlist->nr_entries == 1) + if (evlist->nr_entries == 1 || !id) return perf_evlist__first(evlist); sid = perf_evlist__id2sid(evlist, id); -- cgit v0.10.2 From 133de940435493266acd4100b0dae8ba7eaa7c71 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 20 Aug 2015 13:07:40 +0300 Subject: perf tools: /proc/kcore requires CAP_SYS_RAWIO message too noisy The "/proc/kcore requires CAP_SYS_RAWIO" message comes up all the time for 'perf script' if vmlinux is not found and the user isn't root, even when the kernel is not being traced and even though the message is only really relevant for annotation. Change it to pr_debug and instead put a note in the message displayed if annotation is not possible. Also, the file being accessed might not be /proc/kcore. Tools can be directed to a different location using the --kallsyms option in which case kcore is expected to be in the same directory. Adjust the message so it is not misleading in that case. 
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Cc: Li Zhang Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/1440065260-8802-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 8a18347..d1eece7 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1126,6 +1126,7 @@ fallback: dso->annotate_warned = 1; pr_err("Can't annotate %s:\n\n" "No vmlinux file%s\nwas found in the path.\n\n" + "Note that annotation using /proc/kcore requires CAP_SYS_RAWIO capability.\n\n" "Please use:\n\n" " perf buildid-cache -vu vmlinux\n\n" "or:\n\n" diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 725640f..42e98ab 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1138,8 +1138,8 @@ static int dso__load_kcore(struct dso *dso, struct map *map, fd = open(kcore_filename, O_RDONLY); if (fd < 0) { - pr_err("%s requires CAP_SYS_RAWIO capability to access.\n", - kcore_filename); + pr_debug("Failed to open %s. Note /proc/kcore requires CAP_SYS_RAWIO capability to access.\n", + kcore_filename); return -EINVAL; } -- cgit v0.10.2 From 81cd60cc29a9c3e92ad6ca167a4764b2c2d2fc04 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 20 Aug 2015 11:51:32 +0300 Subject: perf tools: Fix Intel PT timestamp handling Events that don't sample the timestamp have a timestamp value of -1. Intel PT processing wasn't taking that into account. This is particularly noticeable with Intel BTS because timestamps are not requested by default. Then, if the conversion of -1 to TSC results in a small number, the processing is unaffected. However if the conversion results in a big number, then the data is processed prematurely before relevant sideband data like mmap events, which in turn results in samples with unknown dsos. Committer note: Since BTS wasn't upstream, I split the patch to fold the BTS part with the patch introducing it, to avoid having this bug in the commit history. PT was already upstream, so this patch contains that part. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1440060692-5585-1-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 2a4a412..a5acd2f 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1450,7 +1450,7 @@ static int intel_pt_process_event(struct perf_session *session, return -EINVAL; } - if (sample->time) + if (sample->time && sample->time != (u64)-1) timestamp = perf_time_to_tsc(sample->time, &pt->tc); else timestamp = 0; -- cgit v0.10.2 From 6f56e9cf581c6cedcaea3eb69444b169867ccf3d Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Thu, 20 Aug 2015 11:16:32 -0400 Subject: tools lib traceevent: Add checks for returned EVENT_ERROR type Running the following perf-stat command on an arm64 system produces the following result... [root@aarch64 ~]# perf stat -e kmem:mm_page_alloc -a sleep 1 Warning: [kmem:mm_page_alloc] function sizeof not defined Warning: Error: expected type 4 but read 0 Segmentation fault [root@aarch64 ~]# The second warning was a result of the first warning not stopping processing after it detected the issue. That is, code that found the issue reported the first problem, but because it did not exit out of the functions smoothly, it caused the other warning to appear and not only that, it later caused the SIGSEGV.
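The shape of the fix, visible in the diff that follows, is to test each recursive parse step for EVENT_ERROR and unwind immediately instead of continuing with an invalid token. A minimal sketch of the pattern:

	type = process_arg(event, left, &token);
	if (type == EVENT_ERROR)
		goto out_free;	/* free partial state and propagate the error */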
Signed-off-by: Dean Nelson Reviewed-by: Steven Rostedt Acked-by: Namhyung Kim Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150820151632.13927.13791.email-sent-by-dnelson@teal Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index fcd8a9e..5c1867a 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -1745,6 +1745,9 @@ process_cond(struct event_format *event, struct print_arg *top, char **tok) type = process_arg(event, left, &token); again: + if (type == EVENT_ERROR) + goto out_free; + /* Handle other operations in the arguments */ if (type == EVENT_OP && strcmp(token, ":") != 0) { type = process_op(event, left, &token); @@ -2004,6 +2007,12 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) goto out_warn_free; type = process_arg_token(event, right, tok, type); + if (type == EVENT_ERROR) { + free_arg(right); + /* token was freed in process_arg_token() via *tok */ + token = NULL; + goto out_free; + } if (right->type == PRINT_OP && get_op_prio(arg->op.op) < get_op_prio(right->op.op)) { -- cgit v0.10.2 From d0170af7004dce9cd90b749842c37e379476cbc8 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:43 +0300 Subject: perf tools: Add Intel BTS support Intel BTS support fits within the new auxtrace infrastructure. Recording is supported by identifying the Intel BTS PMU, parsing options and setting up events. Decoding is supported by queuing up trace data by thread and then decoding synchronously, delivering synthesized event samples into the session processing for tools to consume. Committer note: E.g: [root@felicio ~]# perf record --per-thread -e intel_bts// ls anaconda-ks.cfg apctest.output bin kernel-rt-3.10.0-298.rt56.171.el7.x86_64.rpm libexec lock_page.bpf.c perf.data perf.data.old [ perf record: Woken up 3 times to write data ] [ perf record: Captured and wrote 4.367 MB perf.data ] [root@felicio ~]# perf evlist -v intel_bts//: type: 6, size: 112, { sample_period, sample_freq }: 1, sample_type: IP|TID|IDENTIFIER, read_format: ID, disabled: 1, enable_on_exec: 1, sample_id_all: 1, exclude_guest: 1 dummy:u: type: 1, size: 112, config: 0x9, { sample_period, sample_freq }: 1, sample_type: IP|TID|IDENTIFIER, read_format: ID, disabled: 1, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, mmap2: 1, comm_exec: 1 [root@felicio ~]# perf script # then navigate in the pager to some interesting place: ls 1843 1 branches: ffffffff810a60cb flush_signal_handlers ([kernel.kallsyms]) => ffffffff8121a522 setup_new_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8121a529 setup_new_exec ([kernel.kallsyms]) => ffffffff8122fa30 do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122fa5d do_close_on_exec ([kernel.kallsyms]) => ffffffff81767ae0 _raw_spin_lock ([kernel.kallsyms]) ls 1843 1 branches: ffffffff81767af4 _raw_spin_lock ([kernel.kallsyms]) => ffffffff8122fa62 do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122fa8e do_close_on_exec ([kernel.kallsyms]) => ffffffff8122faf0 do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122faf7 do_close_on_exec ([kernel.kallsyms]) => ffffffff8122fa8b do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122fa8e do_close_on_exec ([kernel.kallsyms]) => ffffffff8122faf0 do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122faf7 do_close_on_exec ([kernel.kallsyms]) =>
ffffffff8122fa8b do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122fa8e do_close_on_exec ([kernel.kallsyms]) => ffffffff8122faf0 do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122faf7 do_close_on_exec ([kernel.kallsyms]) => ffffffff8122fa8b do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122fa8e do_close_on_exec ([kernel.kallsyms]) => ffffffff8122faf0 do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122faf7 do_close_on_exec ([kernel.kallsyms]) => ffffffff8122fa8b do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122fa8e do_close_on_exec ([kernel.kallsyms]) => ffffffff8122faf0 do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122faf7 do_close_on_exec ([kernel.kallsyms]) => ffffffff8122fa8b do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122fa8e do_close_on_exec ([kernel.kallsyms]) => ffffffff8122faf0 do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122faf7 do_close_on_exec ([kernel.kallsyms]) => ffffffff8122fa8b do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122fac9 do_close_on_exec ([kernel.kallsyms]) => ffffffff8122fad2 do_close_on_exec ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8122fadd do_close_on_exec ([kernel.kallsyms]) => ffffffff8120fc80 filp_close ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8120fcaf filp_close ([kernel.kallsyms]) => ffffffff8120fcb6 filp_close ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8120fcc2 filp_close ([kernel.kallsyms]) => ffffffff812547f0 dnotify_flush ([kernel.kallsyms]) ls 1843 1 branches: ffffffff81254823 dnotify_flush ([kernel.kallsyms]) => ffffffff8120fcc7 filp_close ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8120fccd filp_close ([kernel.kallsyms]) => ffffffff81261790 locks_remove_posix ([kernel.kallsyms]) ls 1843 1 branches: ffffffff812617a3 locks_remove_posix ([kernel.kallsyms]) => ffffffff812617b9 locks_remove_posix ([kernel.kallsyms]) ls 1843 1 branches: ffffffff812617b9 locks_remove_posix ([kernel.kallsyms]) => ffffffff8120fcd2 filp_close ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8120fcd5 filp_close ([kernel.kallsyms]) => ffffffff812142c0 fput ([kernel.kallsyms]) ls 1843 1 branches: ffffffff812142d6 fput ([kernel.kallsyms]) => ffffffff812142df fput ([kernel.kallsyms]) ls 1843 1 branches: ffffffff8121430c fput ([kernel.kallsyms]) => ffffffff810b6580 task_work_add ([kernel.kallsyms]) ls 1843 1 branches: ffffffff810b65ad task_work_add ([kernel.kallsyms]) => ffffffff810b65b1 task_work_add ([kernel.kallsyms]) ls 1843 1 branches: ffffffff810b65c1 task_work_add ([kernel.kallsyms]) => ffffffff810bc710 kick_process ([kernel.kallsyms]) ls 1843 1 branches: ffffffff810bc725 kick_process ([kernel.kallsyms]) => ffffffff810bc742 kick_process ([kernel.kallsyms]) ls 1843 1 branches: ffffffff810bc742 kick_process ([kernel.kallsyms]) => ffffffff810b65c6 task_work_add ([kernel.kallsyms]) ls 1843 1 branches: ffffffff810b65c9 task_work_add ([kernel.kallsyms]) => ffffffff81214311 fput ([kernel.kallsyms]) Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-9-git-send-email-adrian.hunter@intel.com [ Merged sample->time fix for bug found after first round of testing on slightly older kernel ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/intel-bts.txt b/tools/perf/Documentation/intel-bts.txt new file mode 100644 index 0000000..8bdc93b --- /dev/null +++ 
b/tools/perf/Documentation/intel-bts.txt @@ -0,0 +1,86 @@ +Intel Branch Trace Store +======================== + +Overview +======== + +Intel BTS could be regarded as a predecessor to Intel PT and has some +similarities because it can also identify every branch a program takes. A +notable difference is that Intel BTS has no timing information and as a +consequence the present implementation is limited to per-thread recording. + +While decoding Intel BTS does not require walking the object code, the object +code is still needed to pair up calls and returns correctly, consequently much +of the Intel PT documentation applies also to Intel BTS. Refer to the Intel PT +documentation and consider that the PMU 'intel_bts' can usually be used in +place of 'intel_pt' in the examples provided, with the proviso that per-thread +recording must also be stipulated i.e. the --per-thread option for +'perf record'. + + +perf record +=========== + +new event +--------- + +The Intel BTS kernel driver creates a new PMU for Intel BTS. The perf record +option is: + + -e intel_bts// + +Currently Intel BTS is limited to per-thread tracing so the --per-thread option +is also needed. + + +snapshot option +--------------- + +The snapshot option is the same as Intel PT (refer Intel PT documentation). + + +auxtrace mmap size option +----------------------- + +The mmap size option is the same as Intel PT (refer Intel PT documentation). + + +perf script +=========== + +By default, perf script will decode trace data found in the perf.data file. +This can be further controlled by option --itrace. The --itrace option is +the same as Intel PT (refer Intel PT documentation) except that neither +"instructions" events nor "transactions" events (and consequently call +chains) are supported. + +To disable trace decoding entirely, use the option --no-itrace. + + +dump option +----------- + +perf script has an option (-D) to "dump" the events i.e. display the binary +data. + +When -D is used, Intel BTS packets are displayed. + +To disable the display of Intel BTS packets, combine the -D option with +--no-itrace. + + +perf report +=========== + +By default, perf report will decode trace data found in the perf.data file. +This can be further controlled by new option --itrace exactly the same as +perf script. + + +perf inject +=========== + +perf inject also accepts the --itrace option in which case tracing data is +removed and replaced with the synthesized events. e.g. 
+ + perf inject --itrace -i perf.data -o perf.data.new diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index a8be9f9..2c55e1b 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -10,3 +10,4 @@ libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o libperf-$(CONFIG_AUXTRACE) += auxtrace.o libperf-$(CONFIG_AUXTRACE) += intel-pt.o +libperf-$(CONFIG_AUXTRACE) += intel-bts.o diff --git a/tools/perf/arch/x86/util/auxtrace.c b/tools/perf/arch/x86/util/auxtrace.c index e7654b5..7a78055 100644 --- a/tools/perf/arch/x86/util/auxtrace.c +++ b/tools/perf/arch/x86/util/auxtrace.c @@ -13,11 +13,56 @@ * */ +#include + #include "../../util/header.h" +#include "../../util/debug.h" +#include "../../util/pmu.h" #include "../../util/auxtrace.h" #include "../../util/intel-pt.h" +#include "../../util/intel-bts.h" +#include "../../util/evlist.h" + +static +struct auxtrace_record *auxtrace_record__init_intel(struct perf_evlist *evlist, + int *err) +{ + struct perf_pmu *intel_pt_pmu; + struct perf_pmu *intel_bts_pmu; + struct perf_evsel *evsel; + bool found_pt = false; + bool found_bts = false; + + intel_pt_pmu = perf_pmu__find(INTEL_PT_PMU_NAME); + intel_bts_pmu = perf_pmu__find(INTEL_BTS_PMU_NAME); + + if (evlist) { + evlist__for_each(evlist, evsel) { + if (intel_pt_pmu && + evsel->attr.type == intel_pt_pmu->type) + found_pt = true; + if (intel_bts_pmu && + evsel->attr.type == intel_bts_pmu->type) + found_bts = true; + } + } + + if (found_pt && found_bts) { + pr_err("intel_pt and intel_bts may not be used together\n"); + *err = -EINVAL; + return NULL; + } + + if (found_pt) + return intel_pt_recording_init(err); + + if (found_bts) + return intel_bts_recording_init(err); -struct auxtrace_record *auxtrace_record__init(struct perf_evlist *evlist __maybe_unused, + return NULL; +} + +struct auxtrace_record *auxtrace_record__init(struct perf_evlist *evlist, + int *err) { char buffer[64]; @@ -32,7 +77,7 @@ struct auxtrace_record *auxtrace_record__init(struct perf_evlist *evlist __maybe } if (!strncmp(buffer, "GenuineIntel,", 13)) - return intel_pt_recording_init(err); + return auxtrace_record__init_intel(evlist, err); return NULL; } diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c new file mode 100644 index 0000000..9b94ce5 --- /dev/null +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -0,0 +1,458 @@ +/* + * intel-bts.c: Intel Branch Trace Store support + * Copyright (c) 2013-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details.
+ * + */ + +#include +#include +#include +#include + +#include "../../util/cpumap.h" +#include "../../util/evsel.h" +#include "../../util/evlist.h" +#include "../../util/session.h" +#include "../../util/util.h" +#include "../../util/pmu.h" +#include "../../util/debug.h" +#include "../../util/tsc.h" +#include "../../util/auxtrace.h" +#include "../../util/intel-bts.h" + +#define KiB(x) ((x) * 1024) +#define MiB(x) ((x) * 1024 * 1024) +#define KiB_MASK(x) (KiB(x) - 1) +#define MiB_MASK(x) (MiB(x) - 1) + +#define INTEL_BTS_DFLT_SAMPLE_SIZE KiB(4) + +#define INTEL_BTS_MAX_SAMPLE_SIZE KiB(60) + +struct intel_bts_snapshot_ref { + void *ref_buf; + size_t ref_offset; + bool wrapped; +}; + +struct intel_bts_recording { + struct auxtrace_record itr; + struct perf_pmu *intel_bts_pmu; + struct perf_evlist *evlist; + bool snapshot_mode; + size_t snapshot_size; + int snapshot_ref_cnt; + struct intel_bts_snapshot_ref *snapshot_refs; +}; + +struct branch { + u64 from; + u64 to; + u64 misc; +}; + +static size_t intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused) +{ + return INTEL_BTS_AUXTRACE_PRIV_SIZE; +} + +static int intel_bts_info_fill(struct auxtrace_record *itr, + struct perf_session *session, + struct auxtrace_info_event *auxtrace_info, + size_t priv_size) +{ + struct intel_bts_recording *btsr = + container_of(itr, struct intel_bts_recording, itr); + struct perf_pmu *intel_bts_pmu = btsr->intel_bts_pmu; + struct perf_event_mmap_page *pc; + struct perf_tsc_conversion tc = { .time_mult = 0, }; + bool cap_user_time_zero = false; + int err; + + if (priv_size != INTEL_BTS_AUXTRACE_PRIV_SIZE) + return -EINVAL; + + if (!session->evlist->nr_mmaps) + return -EINVAL; + + pc = session->evlist->mmap[0].base; + if (pc) { + err = perf_read_tsc_conversion(pc, &tc); + if (err) { + if (err != -EOPNOTSUPP) + return err; + } else { + cap_user_time_zero = tc.time_mult != 0; + } + if (!cap_user_time_zero) + ui__warning("Intel BTS: TSC not available\n"); + } + + auxtrace_info->type = PERF_AUXTRACE_INTEL_BTS; + auxtrace_info->priv[INTEL_BTS_PMU_TYPE] = intel_bts_pmu->type; + auxtrace_info->priv[INTEL_BTS_TIME_SHIFT] = tc.time_shift; + auxtrace_info->priv[INTEL_BTS_TIME_MULT] = tc.time_mult; + auxtrace_info->priv[INTEL_BTS_TIME_ZERO] = tc.time_zero; + auxtrace_info->priv[INTEL_BTS_CAP_USER_TIME_ZERO] = cap_user_time_zero; + auxtrace_info->priv[INTEL_BTS_SNAPSHOT_MODE] = btsr->snapshot_mode; + + return 0; +} + +static int intel_bts_recording_options(struct auxtrace_record *itr, + struct perf_evlist *evlist, + struct record_opts *opts) +{ + struct intel_bts_recording *btsr = + container_of(itr, struct intel_bts_recording, itr); + struct perf_pmu *intel_bts_pmu = btsr->intel_bts_pmu; + struct perf_evsel *evsel, *intel_bts_evsel = NULL; + const struct cpu_map *cpus = evlist->cpus; + bool privileged = geteuid() == 0 || perf_event_paranoid() < 0; + + btsr->evlist = evlist; + btsr->snapshot_mode = opts->auxtrace_snapshot_mode; + + evlist__for_each(evlist, evsel) { + if (evsel->attr.type == intel_bts_pmu->type) { + if (intel_bts_evsel) { + pr_err("There may be only one " INTEL_BTS_PMU_NAME " event\n"); + return -EINVAL; + } + evsel->attr.freq = 0; + evsel->attr.sample_period = 1; + intel_bts_evsel = evsel; + opts->full_auxtrace = true; + } + } + + if (opts->auxtrace_snapshot_mode && !opts->full_auxtrace) { + pr_err("Snapshot mode (-S option) requires " INTEL_BTS_PMU_NAME " PMU event (-e " INTEL_BTS_PMU_NAME ")\n"); + return -EINVAL; + } + + if (!opts->full_auxtrace) + return 0; + + if (opts->full_auxtrace && 
!cpu_map__empty(cpus)) { + pr_err(INTEL_BTS_PMU_NAME " does not support per-cpu recording\n"); + return -EINVAL; + } + + /* Set default sizes for snapshot mode */ + if (opts->auxtrace_snapshot_mode) { + if (!opts->auxtrace_snapshot_size && !opts->auxtrace_mmap_pages) { + if (privileged) { + opts->auxtrace_mmap_pages = MiB(4) / page_size; + } else { + opts->auxtrace_mmap_pages = KiB(128) / page_size; + if (opts->mmap_pages == UINT_MAX) + opts->mmap_pages = KiB(256) / page_size; + } + } else if (!opts->auxtrace_mmap_pages && !privileged && + opts->mmap_pages == UINT_MAX) { + opts->mmap_pages = KiB(256) / page_size; + } + if (!opts->auxtrace_snapshot_size) + opts->auxtrace_snapshot_size = + opts->auxtrace_mmap_pages * (size_t)page_size; + if (!opts->auxtrace_mmap_pages) { + size_t sz = opts->auxtrace_snapshot_size; + + sz = round_up(sz, page_size) / page_size; + opts->auxtrace_mmap_pages = roundup_pow_of_two(sz); + } + if (opts->auxtrace_snapshot_size > + opts->auxtrace_mmap_pages * (size_t)page_size) { + pr_err("Snapshot size %zu must not be greater than AUX area tracing mmap size %zu\n", + opts->auxtrace_snapshot_size, + opts->auxtrace_mmap_pages * (size_t)page_size); + return -EINVAL; + } + if (!opts->auxtrace_snapshot_size || !opts->auxtrace_mmap_pages) { + pr_err("Failed to calculate default snapshot size and/or AUX area tracing mmap pages\n"); + return -EINVAL; + } + pr_debug2("Intel BTS snapshot size: %zu\n", + opts->auxtrace_snapshot_size); + } + + /* Set default sizes for full trace mode */ + if (opts->full_auxtrace && !opts->auxtrace_mmap_pages) { + if (privileged) { + opts->auxtrace_mmap_pages = MiB(4) / page_size; + } else { + opts->auxtrace_mmap_pages = KiB(128) / page_size; + if (opts->mmap_pages == UINT_MAX) + opts->mmap_pages = KiB(256) / page_size; + } + } + + /* Validate auxtrace_mmap_pages */ + if (opts->auxtrace_mmap_pages) { + size_t sz = opts->auxtrace_mmap_pages * (size_t)page_size; + size_t min_sz; + + if (opts->auxtrace_snapshot_mode) + min_sz = KiB(4); + else + min_sz = KiB(8); + + if (sz < min_sz || !is_power_of_2(sz)) { + pr_err("Invalid mmap size for Intel BTS: must be at least %zuKiB and a power of 2\n", + min_sz / 1024); + return -EINVAL; + } + } + + if (intel_bts_evsel) { + /* + * To obtain the auxtrace buffer file descriptor, the auxtrace event + * must come first. + */ + perf_evlist__to_front(evlist, intel_bts_evsel); + /* + * In the case of per-cpu mmaps, we need the CPU on the + * AUX event. 
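+		 * Recording the CPU with each sample lets the AUX trace data be related back to the CPU it was recorded on.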
+ */ + if (!cpu_map__empty(cpus)) + perf_evsel__set_sample_bit(intel_bts_evsel, CPU); + } + + /* Add dummy event to keep tracking */ + if (opts->full_auxtrace) { + struct perf_evsel *tracking_evsel; + int err; + + err = parse_events(evlist, "dummy:u", NULL); + if (err) + return err; + + tracking_evsel = perf_evlist__last(evlist); + + perf_evlist__set_tracking_event(evlist, tracking_evsel); + + tracking_evsel->attr.freq = 0; + tracking_evsel->attr.sample_period = 1; + } + + return 0; +} + +static int intel_bts_parse_snapshot_options(struct auxtrace_record *itr, + struct record_opts *opts, + const char *str) +{ + struct intel_bts_recording *btsr = + container_of(itr, struct intel_bts_recording, itr); + unsigned long long snapshot_size = 0; + char *endptr; + + if (str) { + snapshot_size = strtoull(str, &endptr, 0); + if (*endptr || snapshot_size > SIZE_MAX) + return -1; + } + + opts->auxtrace_snapshot_mode = true; + opts->auxtrace_snapshot_size = snapshot_size; + + btsr->snapshot_size = snapshot_size; + + return 0; +} + +static u64 intel_bts_reference(struct auxtrace_record *itr __maybe_unused) +{ + return rdtsc(); +} + +static int intel_bts_alloc_snapshot_refs(struct intel_bts_recording *btsr, + int idx) +{ + const size_t sz = sizeof(struct intel_bts_snapshot_ref); + int cnt = btsr->snapshot_ref_cnt, new_cnt = cnt * 2; + struct intel_bts_snapshot_ref *refs; + + if (!new_cnt) + new_cnt = 16; + + while (new_cnt <= idx) + new_cnt *= 2; + + refs = calloc(new_cnt, sz); + if (!refs) + return -ENOMEM; + + memcpy(refs, btsr->snapshot_refs, cnt * sz); + + btsr->snapshot_refs = refs; + btsr->snapshot_ref_cnt = new_cnt; + + return 0; +} + +static void intel_bts_free_snapshot_refs(struct intel_bts_recording *btsr) +{ + int i; + + for (i = 0; i < btsr->snapshot_ref_cnt; i++) + zfree(&btsr->snapshot_refs[i].ref_buf); + zfree(&btsr->snapshot_refs); +} + +static void intel_bts_recording_free(struct auxtrace_record *itr) +{ + struct intel_bts_recording *btsr = + container_of(itr, struct intel_bts_recording, itr); + + intel_bts_free_snapshot_refs(btsr); + free(btsr); +} + +static int intel_bts_snapshot_start(struct auxtrace_record *itr) +{ + struct intel_bts_recording *btsr = + container_of(itr, struct intel_bts_recording, itr); + struct perf_evsel *evsel; + + evlist__for_each(btsr->evlist, evsel) { + if (evsel->attr.type == btsr->intel_bts_pmu->type) + return perf_evlist__disable_event(btsr->evlist, evsel); + } + return -EINVAL; +} + +static int intel_bts_snapshot_finish(struct auxtrace_record *itr) +{ + struct intel_bts_recording *btsr = + container_of(itr, struct intel_bts_recording, itr); + struct perf_evsel *evsel; + + evlist__for_each(btsr->evlist, evsel) { + if (evsel->attr.type == btsr->intel_bts_pmu->type) + return perf_evlist__enable_event(btsr->evlist, evsel); + } + return -EINVAL; +} + +static bool intel_bts_first_wrap(u64 *data, size_t buf_size) +{ + int i, a, b; + + b = buf_size >> 3; + a = b - 512; + if (a < 0) + a = 0; + + for (i = a; i < b; i++) { + if (data[i]) + return true; + } + + return false; +} + +static int intel_bts_find_snapshot(struct auxtrace_record *itr, int idx, + struct auxtrace_mmap *mm, unsigned char *data, + u64 *head, u64 *old) +{ + struct intel_bts_recording *btsr = + container_of(itr, struct intel_bts_recording, itr); + bool wrapped; + int err; + + pr_debug3("%s: mmap index %d old head %zu new head %zu\n", + __func__, idx, (size_t)*old, (size_t)*head); + + if (idx >= btsr->snapshot_ref_cnt) { + err = intel_bts_alloc_snapshot_refs(btsr, idx); + if (err) + goto out_err; + } 
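+	/* Note that the wrapped state is sticky: once a buffer has been seen to wrap, it is treated as wrapped from then on. */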
+ + wrapped = btsr->snapshot_refs[idx].wrapped; + if (!wrapped && intel_bts_first_wrap((u64 *)data, mm->len)) { + btsr->snapshot_refs[idx].wrapped = true; + wrapped = true; + } + + /* + * In full trace mode 'head' continually increases. However in snapshot + * mode 'head' is an offset within the buffer. Here 'old' and 'head' + * are adjusted to match the full trace case which expects that 'old' is + * always less than 'head'. + */ + if (wrapped) { + *old = *head; + *head += mm->len; + } else { + if (mm->mask) + *old &= mm->mask; + else + *old %= mm->len; + if (*old > *head) + *head += mm->len; + } + + pr_debug3("%s: wrap-around %sdetected, adjusted old head %zu adjusted new head %zu\n", + __func__, wrapped ? "" : "not ", (size_t)*old, (size_t)*head); + + return 0; + +out_err: + pr_err("%s: failed, error %d\n", __func__, err); + return err; +} + +static int intel_bts_read_finish(struct auxtrace_record *itr, int idx) +{ + struct intel_bts_recording *btsr = + container_of(itr, struct intel_bts_recording, itr); + struct perf_evsel *evsel; + + evlist__for_each(btsr->evlist, evsel) { + if (evsel->attr.type == btsr->intel_bts_pmu->type) + return perf_evlist__enable_event_idx(btsr->evlist, + evsel, idx); + } + return -EINVAL; +} + +struct auxtrace_record *intel_bts_recording_init(int *err) +{ + struct perf_pmu *intel_bts_pmu = perf_pmu__find(INTEL_BTS_PMU_NAME); + struct intel_bts_recording *btsr; + + if (!intel_bts_pmu) + return NULL; + + btsr = zalloc(sizeof(struct intel_bts_recording)); + if (!btsr) { + *err = -ENOMEM; + return NULL; + } + + btsr->intel_bts_pmu = intel_bts_pmu; + btsr->itr.recording_options = intel_bts_recording_options; + btsr->itr.info_priv_size = intel_bts_info_priv_size; + btsr->itr.info_fill = intel_bts_info_fill; + btsr->itr.free = intel_bts_recording_free; + btsr->itr.snapshot_start = intel_bts_snapshot_start; + btsr->itr.snapshot_finish = intel_bts_snapshot_finish; + btsr->itr.find_snapshot = intel_bts_find_snapshot; + btsr->itr.parse_snapshot_options = intel_bts_parse_snapshot_options; + btsr->itr.reference = intel_bts_reference; + btsr->itr.read_finish = intel_bts_read_finish; + btsr->itr.alignment = sizeof(struct branch); + return &btsr->itr; +} diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c index fd11cc3..79fe071 100644 --- a/tools/perf/arch/x86/util/pmu.c +++ b/tools/perf/arch/x86/util/pmu.c @@ -3,6 +3,7 @@ #include #include "../../util/intel-pt.h" +#include "../../util/intel-bts.h" #include "../../util/pmu.h" struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu __maybe_unused) @@ -10,6 +11,8 @@ struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu __mayb #ifdef HAVE_AUXTRACE_SUPPORT if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) return intel_pt_pmu_default_config(pmu); + if (!strcmp(pmu->name, INTEL_BTS_PMU_NAME)) + pmu->selectable = true; #endif return NULL; } diff --git a/tools/perf/util/Build b/tools/perf/util/Build index c20473d..e912856 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -80,6 +80,7 @@ libperf-y += thread-stack.o libperf-$(CONFIG_AUXTRACE) += auxtrace.o libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ libperf-$(CONFIG_AUXTRACE) += intel-pt.o +libperf-$(CONFIG_AUXTRACE) += intel-bts.o libperf-y += parse-branch-options.o libperf-$(CONFIG_LIBELF) += symbol-elf.o diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 0f0b7e1..a980e7c 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -48,6 +48,7 @@ #include 
"parse-options.h" #include "intel-pt.h" +#include "intel-bts.h" int auxtrace_mmap__mmap(struct auxtrace_mmap *mm, struct auxtrace_mmap_params *mp, @@ -888,6 +889,8 @@ int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused, switch (type) { case PERF_AUXTRACE_INTEL_PT: return intel_pt_process_auxtrace_info(event, session); + case PERF_AUXTRACE_INTEL_BTS: + return intel_bts_process_auxtrace_info(event, session); case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 7d12f33..bf72b77 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -40,6 +40,7 @@ struct events_stats; enum auxtrace_type { PERF_AUXTRACE_UNKNOWN, PERF_AUXTRACE_INTEL_PT, + PERF_AUXTRACE_INTEL_BTS, }; enum itrace_period_type { diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c new file mode 100644 index 0000000..ea76862 --- /dev/null +++ b/tools/perf/util/intel-bts.c @@ -0,0 +1,933 @@ +/* + * intel-bts.c: Intel Processor Trace support + * Copyright (c) 2013-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "cpumap.h" +#include "color.h" +#include "evsel.h" +#include "evlist.h" +#include "machine.h" +#include "session.h" +#include "util.h" +#include "thread.h" +#include "thread-stack.h" +#include "debug.h" +#include "tsc.h" +#include "auxtrace.h" +#include "intel-pt-decoder/intel-pt-insn-decoder.h" +#include "intel-bts.h" + +#define MAX_TIMESTAMP (~0ULL) + +#define INTEL_BTS_ERR_NOINSN 5 +#define INTEL_BTS_ERR_LOST 9 + +#if __BYTE_ORDER == __BIG_ENDIAN +#define le64_to_cpu bswap_64 +#else +#define le64_to_cpu +#endif + +struct intel_bts { + struct auxtrace auxtrace; + struct auxtrace_queues queues; + struct auxtrace_heap heap; + u32 auxtrace_type; + struct perf_session *session; + struct machine *machine; + bool sampling_mode; + bool snapshot_mode; + bool data_queued; + u32 pmu_type; + struct perf_tsc_conversion tc; + bool cap_user_time_zero; + struct itrace_synth_opts synth_opts; + bool sample_branches; + u32 branches_filter; + u64 branches_sample_type; + u64 branches_id; + size_t branches_event_size; + bool synth_needs_swap; +}; + +struct intel_bts_queue { + struct intel_bts *bts; + unsigned int queue_nr; + struct auxtrace_buffer *buffer; + bool on_heap; + bool done; + pid_t pid; + pid_t tid; + int cpu; + u64 time; + struct intel_pt_insn intel_pt_insn; + u32 sample_flags; +}; + +struct branch { + u64 from; + u64 to; + u64 misc; +}; + +static void intel_bts_dump(struct intel_bts *bts __maybe_unused, + unsigned char *buf, size_t len) +{ + struct branch *branch; + size_t i, pos = 0, br_sz = sizeof(struct branch), sz; + const char *color = PERF_COLOR_BLUE; + + color_fprintf(stdout, color, + ". ... 
Intel BTS data: size %zu bytes\n", + len); + + while (len) { + if (len >= br_sz) + sz = br_sz; + else + sz = len; + printf("."); + color_fprintf(stdout, color, " %08x: ", pos); + for (i = 0; i < sz; i++) + color_fprintf(stdout, color, " %02x", buf[i]); + for (; i < br_sz; i++) + color_fprintf(stdout, color, " "); + if (len >= br_sz) { + branch = (struct branch *)buf; + color_fprintf(stdout, color, " %"PRIx64" -> %"PRIx64" %s\n", + le64_to_cpu(branch->from), + le64_to_cpu(branch->to), + le64_to_cpu(branch->misc) & 0x10 ? + "pred" : "miss"); + } else { + color_fprintf(stdout, color, " Bad record!\n"); + } + pos += sz; + buf += sz; + len -= sz; + } +} + +static void intel_bts_dump_event(struct intel_bts *bts, unsigned char *buf, + size_t len) +{ + printf(".\n"); + intel_bts_dump(bts, buf, len); +} + +static int intel_bts_lost(struct intel_bts *bts, struct perf_sample *sample) +{ + union perf_event event; + int err; + + auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE, + INTEL_BTS_ERR_LOST, sample->cpu, sample->pid, + sample->tid, 0, "Lost trace data"); + + err = perf_session__deliver_synth_event(bts->session, &event, NULL); + if (err) + pr_err("Intel BTS: failed to deliver error event, error %d\n", + err); + + return err; +} + +static struct intel_bts_queue *intel_bts_alloc_queue(struct intel_bts *bts, + unsigned int queue_nr) +{ + struct intel_bts_queue *btsq; + + btsq = zalloc(sizeof(struct intel_bts_queue)); + if (!btsq) + return NULL; + + btsq->bts = bts; + btsq->queue_nr = queue_nr; + btsq->pid = -1; + btsq->tid = -1; + btsq->cpu = -1; + + return btsq; +} + +static int intel_bts_setup_queue(struct intel_bts *bts, + struct auxtrace_queue *queue, + unsigned int queue_nr) +{ + struct intel_bts_queue *btsq = queue->priv; + + if (list_empty(&queue->head)) + return 0; + + if (!btsq) { + btsq = intel_bts_alloc_queue(bts, queue_nr); + if (!btsq) + return -ENOMEM; + queue->priv = btsq; + + if (queue->cpu != -1) + btsq->cpu = queue->cpu; + btsq->tid = queue->tid; + } + + if (bts->sampling_mode) + return 0; + + if (!btsq->on_heap && !btsq->buffer) { + int ret; + + btsq->buffer = auxtrace_buffer__next(queue, NULL); + if (!btsq->buffer) + return 0; + + ret = auxtrace_heap__add(&bts->heap, queue_nr, + btsq->buffer->reference); + if (ret) + return ret; + btsq->on_heap = true; + } + + return 0; +} + +static int intel_bts_setup_queues(struct intel_bts *bts) +{ + unsigned int i; + int ret; + + for (i = 0; i < bts->queues.nr_queues; i++) { + ret = intel_bts_setup_queue(bts, &bts->queues.queue_array[i], + i); + if (ret) + return ret; + } + return 0; +} + +static inline int intel_bts_update_queues(struct intel_bts *bts) +{ + if (bts->queues.new_data) { + bts->queues.new_data = false; + return intel_bts_setup_queues(bts); + } + return 0; +} + +static unsigned char *intel_bts_find_overlap(unsigned char *buf_a, size_t len_a, + unsigned char *buf_b, size_t len_b) +{ + size_t offs, len; + + if (len_a > len_b) + offs = len_a - len_b; + else + offs = 0; + + for (; offs < len_a; offs += sizeof(struct branch)) { + len = len_a - offs; + if (!memcmp(buf_a + offs, buf_b, len)) + return buf_b + len; + } + + return buf_b; +} + +static int intel_bts_do_fix_overlap(struct auxtrace_queue *queue, + struct auxtrace_buffer *b) +{ + struct auxtrace_buffer *a; + void *start; + + if (b->list.prev == &queue->head) + return 0; + a = list_entry(b->list.prev, struct auxtrace_buffer, list); + start = intel_bts_find_overlap(a->data, a->size, b->data, b->size); + if (!start) + return -EINVAL; + b->use_size = b->data 
+ b->size - start; + b->use_data = start; + return 0; +} + +static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, + struct branch *branch) +{ + int ret; + struct intel_bts *bts = btsq->bts; + union perf_event event; + struct perf_sample sample = { .ip = 0, }; + + event.sample.header.type = PERF_RECORD_SAMPLE; + event.sample.header.misc = PERF_RECORD_MISC_USER; + event.sample.header.size = sizeof(struct perf_event_header); + + sample.ip = le64_to_cpu(branch->from); + sample.pid = btsq->pid; + sample.tid = btsq->tid; + sample.addr = le64_to_cpu(branch->to); + sample.id = btsq->bts->branches_id; + sample.stream_id = btsq->bts->branches_id; + sample.period = 1; + sample.cpu = btsq->cpu; + sample.flags = btsq->sample_flags; + sample.insn_len = btsq->intel_pt_insn.length; + + if (bts->synth_opts.inject) { + event.sample.header.size = bts->branches_event_size; + ret = perf_event__synthesize_sample(&event, + bts->branches_sample_type, + 0, &sample, + bts->synth_needs_swap); + if (ret) + return ret; + } + + ret = perf_session__deliver_synth_event(bts->session, &event, &sample); + if (ret) + pr_err("Intel BTS: failed to deliver branch event, error %d\n", + ret); + + return ret; +} + +static int intel_bts_get_next_insn(struct intel_bts_queue *btsq, u64 ip) +{ + struct machine *machine = btsq->bts->machine; + struct thread *thread; + struct addr_location al; + unsigned char buf[1024]; + size_t bufsz; + ssize_t len; + int x86_64; + uint8_t cpumode; + int err = -1; + + bufsz = intel_pt_insn_max_size(); + + if (machine__kernel_ip(machine, ip)) + cpumode = PERF_RECORD_MISC_KERNEL; + else + cpumode = PERF_RECORD_MISC_USER; + + thread = machine__find_thread(machine, -1, btsq->tid); + if (!thread) + return -1; + + thread__find_addr_map(thread, cpumode, MAP__FUNCTION, ip, &al); + if (!al.map || !al.map->dso) + goto out_put; + + len = dso__data_read_addr(al.map->dso, al.map, machine, ip, buf, bufsz); + if (len <= 0) + goto out_put; + + /* Load maps to ensure dso->is_64_bit has been updated */ + map__load(al.map, machine->symbol_filter); + + x86_64 = al.map->dso->is_64_bit; + + if (intel_pt_get_insn(buf, len, x86_64, &btsq->intel_pt_insn)) + goto out_put; + + err = 0; +out_put: + thread__put(thread); + return err; +} + +static int intel_bts_synth_error(struct intel_bts *bts, int cpu, pid_t pid, + pid_t tid, u64 ip) +{ + union perf_event event; + int err; + + auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE, + INTEL_BTS_ERR_NOINSN, cpu, pid, tid, ip, + "Failed to get instruction"); + + err = perf_session__deliver_synth_event(bts->session, &event, NULL); + if (err) + pr_err("Intel BTS: failed to deliver error event, error %d\n", + err); + + return err; +} + +static int intel_bts_get_branch_type(struct intel_bts_queue *btsq, + struct branch *branch) +{ + int err; + + if (!branch->from) { + if (branch->to) + btsq->sample_flags = PERF_IP_FLAG_BRANCH | + PERF_IP_FLAG_TRACE_BEGIN; + else + btsq->sample_flags = 0; + btsq->intel_pt_insn.length = 0; + } else if (!branch->to) { + btsq->sample_flags = PERF_IP_FLAG_BRANCH | + PERF_IP_FLAG_TRACE_END; + btsq->intel_pt_insn.length = 0; + } else { + err = intel_bts_get_next_insn(btsq, branch->from); + if (err) { + btsq->sample_flags = 0; + btsq->intel_pt_insn.length = 0; + if (!btsq->bts->synth_opts.errors) + return 0; + err = intel_bts_synth_error(btsq->bts, btsq->cpu, + btsq->pid, btsq->tid, + branch->from); + return err; + } + btsq->sample_flags = intel_pt_insn_type(btsq->intel_pt_insn.op); + /* Check for an async branch into the kernel 
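(a branch from user space into kernel space that is not a syscall must be an asynchronous event such as an interrupt)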
*/ + if (!machine__kernel_ip(btsq->bts->machine, branch->from) && + machine__kernel_ip(btsq->bts->machine, branch->to) && + btsq->sample_flags != (PERF_IP_FLAG_BRANCH | + PERF_IP_FLAG_CALL | + PERF_IP_FLAG_SYSCALLRET)) + btsq->sample_flags = PERF_IP_FLAG_BRANCH | + PERF_IP_FLAG_CALL | + PERF_IP_FLAG_ASYNC | + PERF_IP_FLAG_INTERRUPT; + } + + return 0; +} + +static int intel_bts_process_buffer(struct intel_bts_queue *btsq, + struct auxtrace_buffer *buffer) +{ + struct branch *branch; + size_t sz, bsz = sizeof(struct branch); + u32 filter = btsq->bts->branches_filter; + int err = 0; + + if (buffer->use_data) { + sz = buffer->use_size; + branch = buffer->use_data; + } else { + sz = buffer->size; + branch = buffer->data; + } + + if (!btsq->bts->sample_branches) + return 0; + + for (; sz > bsz; branch += 1, sz -= bsz) { + if (!branch->from && !branch->to) + continue; + intel_bts_get_branch_type(btsq, branch); + if (filter && !(filter & btsq->sample_flags)) + continue; + err = intel_bts_synth_branch_sample(btsq, branch); + if (err) + break; + } + return err; +} + +static int intel_bts_process_queue(struct intel_bts_queue *btsq, u64 *timestamp) +{ + struct auxtrace_buffer *buffer = btsq->buffer, *old_buffer = buffer; + struct auxtrace_queue *queue; + struct thread *thread; + int err; + + if (btsq->done) + return 1; + + if (btsq->pid == -1) { + thread = machine__find_thread(btsq->bts->machine, -1, + btsq->tid); + if (thread) + btsq->pid = thread->pid_; + } else { + thread = machine__findnew_thread(btsq->bts->machine, btsq->pid, + btsq->tid); + } + + queue = &btsq->bts->queues.queue_array[btsq->queue_nr]; + + if (!buffer) + buffer = auxtrace_buffer__next(queue, NULL); + + if (!buffer) { + if (!btsq->bts->sampling_mode) + btsq->done = 1; + err = 1; + goto out_put; + } + + /* Currently there is no support for split buffers */ + if (buffer->consecutive) { + err = -EINVAL; + goto out_put; + } + + if (!buffer->data) { + int fd = perf_data_file__fd(btsq->bts->session->file); + + buffer->data = auxtrace_buffer__get_data(buffer, fd); + if (!buffer->data) { + err = -ENOMEM; + goto out_put; + } + } + + if (btsq->bts->snapshot_mode && !buffer->consecutive && + intel_bts_do_fix_overlap(queue, buffer)) { + err = -ENOMEM; + goto out_put; + } + + if (!btsq->bts->synth_opts.callchain && thread && + (!old_buffer || btsq->bts->sampling_mode || + (btsq->bts->snapshot_mode && !buffer->consecutive))) + thread_stack__set_trace_nr(thread, buffer->buffer_nr + 1); + + err = intel_bts_process_buffer(btsq, buffer); + + auxtrace_buffer__drop_data(buffer); + + btsq->buffer = auxtrace_buffer__next(queue, buffer); + if (btsq->buffer) { + if (timestamp) + *timestamp = btsq->buffer->reference; + } else { + if (!btsq->bts->sampling_mode) + btsq->done = 1; + } +out_put: + thread__put(thread); + return err; +} + +static int intel_bts_flush_queue(struct intel_bts_queue *btsq) +{ + u64 ts = 0; + int ret; + + while (1) { + ret = intel_bts_process_queue(btsq, &ts); + if (ret < 0) + return ret; + if (ret) + break; + } + return 0; +} + +static int intel_bts_process_tid_exit(struct intel_bts *bts, pid_t tid) +{ + struct auxtrace_queues *queues = &bts->queues; + unsigned int i; + + for (i = 0; i < queues->nr_queues; i++) { + struct auxtrace_queue *queue = &bts->queues.queue_array[i]; + struct intel_bts_queue *btsq = queue->priv; + + if (btsq && btsq->tid == tid) + return intel_bts_flush_queue(btsq); + } + return 0; +} + +static int intel_bts_process_queues(struct intel_bts *bts, u64 timestamp) +{ + while (1) { + unsigned int queue_nr; + 
struct auxtrace_queue *queue; + struct intel_bts_queue *btsq; + u64 ts = 0; + int ret; + + if (!bts->heap.heap_cnt) + return 0; + + if (bts->heap.heap_array[0].ordinal > timestamp) + return 0; + + queue_nr = bts->heap.heap_array[0].queue_nr; + queue = &bts->queues.queue_array[queue_nr]; + btsq = queue->priv; + + auxtrace_heap__pop(&bts->heap); + + ret = intel_bts_process_queue(btsq, &ts); + if (ret < 0) { + auxtrace_heap__add(&bts->heap, queue_nr, ts); + return ret; + } + + if (!ret) { + ret = auxtrace_heap__add(&bts->heap, queue_nr, ts); + if (ret < 0) + return ret; + } else { + btsq->on_heap = false; + } + } + + return 0; +} + +static int intel_bts_process_event(struct perf_session *session, + union perf_event *event, + struct perf_sample *sample, + struct perf_tool *tool) +{ + struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, + auxtrace); + u64 timestamp; + int err; + + if (dump_trace) + return 0; + + if (!tool->ordered_events) { + pr_err("Intel BTS requires ordered events\n"); + return -EINVAL; + } + + if (sample->time && sample->time != (u64)-1) + timestamp = perf_time_to_tsc(sample->time, &bts->tc); + else + timestamp = 0; + + err = intel_bts_update_queues(bts); + if (err) + return err; + + err = intel_bts_process_queues(bts, timestamp); + if (err) + return err; + if (event->header.type == PERF_RECORD_EXIT) { + err = intel_bts_process_tid_exit(bts, event->comm.tid); + if (err) + return err; + } + + if (event->header.type == PERF_RECORD_AUX && + (event->aux.flags & PERF_AUX_FLAG_TRUNCATED) && + bts->synth_opts.errors) + err = intel_bts_lost(bts, sample); + + return err; +} + +static int intel_bts_process_auxtrace_event(struct perf_session *session, + union perf_event *event, + struct perf_tool *tool __maybe_unused) +{ + struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, + auxtrace); + + if (bts->sampling_mode) + return 0; + + if (!bts->data_queued) { + struct auxtrace_buffer *buffer; + off_t data_offset; + int fd = perf_data_file__fd(session->file); + int err; + + if (perf_data_file__is_pipe(session->file)) { + data_offset = 0; + } else { + data_offset = lseek(fd, 0, SEEK_CUR); + if (data_offset == -1) + return -errno; + } + + err = auxtrace_queues__add_event(&bts->queues, session, event, + data_offset, &buffer); + if (err) + return err; + + /* Dump here now we have copied a piped trace out of the pipe */ + if (dump_trace) { + if (auxtrace_buffer__get_data(buffer, fd)) { + intel_bts_dump_event(bts, buffer->data, + buffer->size); + auxtrace_buffer__put_data(buffer); + } + } + } + + return 0; +} + +static int intel_bts_flush(struct perf_session *session __maybe_unused, + struct perf_tool *tool __maybe_unused) +{ + struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, + auxtrace); + int ret; + + if (dump_trace || bts->sampling_mode) + return 0; + + if (!tool->ordered_events) + return -EINVAL; + + ret = intel_bts_update_queues(bts); + if (ret < 0) + return ret; + + return intel_bts_process_queues(bts, MAX_TIMESTAMP); +} + +static void intel_bts_free_queue(void *priv) +{ + struct intel_bts_queue *btsq = priv; + + if (!btsq) + return; + free(btsq); +} + +static void intel_bts_free_events(struct perf_session *session) +{ + struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, + auxtrace); + struct auxtrace_queues *queues = &bts->queues; + unsigned int i; + + for (i = 0; i < queues->nr_queues; i++) { + intel_bts_free_queue(queues->queue_array[i].priv); + queues->queue_array[i].priv = NULL; + } + 
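/* all the per-queue private data has been freed above, so the queue array itself can go now */ +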
auxtrace_queues__free(queues); +} + +static void intel_bts_free(struct perf_session *session) +{ + struct intel_bts *bts = container_of(session->auxtrace, struct intel_bts, + auxtrace); + + auxtrace_heap__free(&bts->heap); + intel_bts_free_events(session); + session->auxtrace = NULL; + free(bts); +} + +struct intel_bts_synth { + struct perf_tool dummy_tool; + struct perf_session *session; +}; + +static int intel_bts_event_synth(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct intel_bts_synth *intel_bts_synth = + container_of(tool, struct intel_bts_synth, dummy_tool); + + return perf_session__deliver_synth_event(intel_bts_synth->session, + event, NULL); +} + +static int intel_bts_synth_event(struct perf_session *session, + struct perf_event_attr *attr, u64 id) +{ + struct intel_bts_synth intel_bts_synth; + + memset(&intel_bts_synth, 0, sizeof(struct intel_bts_synth)); + intel_bts_synth.session = session; + + return perf_event__synthesize_attr(&intel_bts_synth.dummy_tool, attr, 1, + &id, intel_bts_event_synth); +} + +static int intel_bts_synth_events(struct intel_bts *bts, + struct perf_session *session) +{ + struct perf_evlist *evlist = session->evlist; + struct perf_evsel *evsel; + struct perf_event_attr attr; + bool found = false; + u64 id; + int err; + + evlist__for_each(evlist, evsel) { + if (evsel->attr.type == bts->pmu_type && evsel->ids) { + found = true; + break; + } + } + + if (!found) { + pr_debug("There are no selected events with Intel BTS data\n"); + return 0; + } + + memset(&attr, 0, sizeof(struct perf_event_attr)); + attr.size = sizeof(struct perf_event_attr); + attr.type = PERF_TYPE_HARDWARE; + attr.sample_type = evsel->attr.sample_type & PERF_SAMPLE_MASK; + attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID | + PERF_SAMPLE_PERIOD; + attr.sample_type &= ~(u64)PERF_SAMPLE_TIME; + attr.sample_type &= ~(u64)PERF_SAMPLE_CPU; + attr.exclude_user = evsel->attr.exclude_user; + attr.exclude_kernel = evsel->attr.exclude_kernel; + attr.exclude_hv = evsel->attr.exclude_hv; + attr.exclude_host = evsel->attr.exclude_host; + attr.exclude_guest = evsel->attr.exclude_guest; + attr.sample_id_all = evsel->attr.sample_id_all; + attr.read_format = evsel->attr.read_format; + + id = evsel->id[0] + 1000000000; + if (!id) + id = 1; + + if (bts->synth_opts.branches) { + attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS; + attr.sample_period = 1; + attr.sample_type |= PERF_SAMPLE_ADDR; + pr_debug("Synthesizing 'branches' event with id %" PRIu64 " sample type %#" PRIx64 "\n", + id, (u64)attr.sample_type); + err = intel_bts_synth_event(session, &attr, id); + if (err) { + pr_err("%s: failed to synthesize 'branches' event type\n", + __func__); + return err; + } + bts->sample_branches = true; + bts->branches_sample_type = attr.sample_type; + bts->branches_id = id; + /* + * We only use sample types from PERF_SAMPLE_MASK so we can use + * __perf_evsel__sample_size() here. 
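+		 * The size computed here is later used to pre-set the sample header size when branch samples are re-synthesized for 'perf inject'.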
+ */ + bts->branches_event_size = sizeof(struct sample_event) + + __perf_evsel__sample_size(attr.sample_type); + } + + bts->synth_needs_swap = evsel->needs_swap; + + return 0; +} + +static const char * const intel_bts_info_fmts[] = { + [INTEL_BTS_PMU_TYPE] = " PMU Type %"PRId64"\n", + [INTEL_BTS_TIME_SHIFT] = " Time Shift %"PRIu64"\n", + [INTEL_BTS_TIME_MULT] = " Time Multiplier %"PRIu64"\n", + [INTEL_BTS_TIME_ZERO] = " Time Zero %"PRIu64"\n", + [INTEL_BTS_CAP_USER_TIME_ZERO] = " Cap Time Zero %"PRId64"\n", + [INTEL_BTS_SNAPSHOT_MODE] = " Snapshot mode %"PRId64"\n", +}; + +static void intel_bts_print_info(u64 *arr, int start, int finish) +{ + int i; + + if (!dump_trace) + return; + + for (i = start; i <= finish; i++) + fprintf(stdout, intel_bts_info_fmts[i], arr[i]); +} + +u64 intel_bts_auxtrace_info_priv[INTEL_BTS_AUXTRACE_PRIV_SIZE]; + +int intel_bts_process_auxtrace_info(union perf_event *event, + struct perf_session *session) +{ + struct auxtrace_info_event *auxtrace_info = &event->auxtrace_info; + size_t min_sz = sizeof(u64) * INTEL_BTS_SNAPSHOT_MODE; + struct intel_bts *bts; + int err; + + if (auxtrace_info->header.size < sizeof(struct auxtrace_info_event) + + min_sz) + return -EINVAL; + + bts = zalloc(sizeof(struct intel_bts)); + if (!bts) + return -ENOMEM; + + err = auxtrace_queues__init(&bts->queues); + if (err) + goto err_free; + + bts->session = session; + bts->machine = &session->machines.host; /* No kvm support */ + bts->auxtrace_type = auxtrace_info->type; + bts->pmu_type = auxtrace_info->priv[INTEL_BTS_PMU_TYPE]; + bts->tc.time_shift = auxtrace_info->priv[INTEL_BTS_TIME_SHIFT]; + bts->tc.time_mult = auxtrace_info->priv[INTEL_BTS_TIME_MULT]; + bts->tc.time_zero = auxtrace_info->priv[INTEL_BTS_TIME_ZERO]; + bts->cap_user_time_zero = + auxtrace_info->priv[INTEL_BTS_CAP_USER_TIME_ZERO]; + bts->snapshot_mode = auxtrace_info->priv[INTEL_BTS_SNAPSHOT_MODE]; + + bts->sampling_mode = false; + + bts->auxtrace.process_event = intel_bts_process_event; + bts->auxtrace.process_auxtrace_event = intel_bts_process_auxtrace_event; + bts->auxtrace.flush_events = intel_bts_flush; + bts->auxtrace.free_events = intel_bts_free_events; + bts->auxtrace.free = intel_bts_free; + session->auxtrace = &bts->auxtrace; + + intel_bts_print_info(&auxtrace_info->priv[0], INTEL_BTS_PMU_TYPE, + INTEL_BTS_SNAPSHOT_MODE); + + if (dump_trace) + return 0; + + if (session->itrace_synth_opts && session->itrace_synth_opts->set) + bts->synth_opts = *session->itrace_synth_opts; + else + itrace_synth_opts__set_default(&bts->synth_opts); + + if (bts->synth_opts.calls) + bts->branches_filter |= PERF_IP_FLAG_CALL | PERF_IP_FLAG_ASYNC | + PERF_IP_FLAG_TRACE_END; + if (bts->synth_opts.returns) + bts->branches_filter |= PERF_IP_FLAG_RETURN | + PERF_IP_FLAG_TRACE_BEGIN; + + err = intel_bts_synth_events(bts, session); + if (err) + goto err_free_queues; + + err = auxtrace_queues__process_index(&bts->queues, session); + if (err) + goto err_free_queues; + + if (bts->queues.populated) + bts->data_queued = true; + + return 0; + +err_free_queues: + auxtrace_queues__free(&bts->queues); + session->auxtrace = NULL; +err_free: + free(bts); + return err; +} diff --git a/tools/perf/util/intel-bts.h b/tools/perf/util/intel-bts.h new file mode 100644 index 0000000..ca65e21 --- /dev/null +++ b/tools/perf/util/intel-bts.h @@ -0,0 +1,43 @@ +/* + * intel-bts.h: Intel Branch Trace Store support + * Copyright (c) 2013-2014, Intel Corporation.
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef INCLUDE__PERF_INTEL_BTS_H__ +#define INCLUDE__PERF_INTEL_BTS_H__ + +#define INTEL_BTS_PMU_NAME "intel_bts" + +enum { + INTEL_BTS_PMU_TYPE, + INTEL_BTS_TIME_SHIFT, + INTEL_BTS_TIME_MULT, + INTEL_BTS_TIME_ZERO, + INTEL_BTS_CAP_USER_TIME_ZERO, + INTEL_BTS_SNAPSHOT_MODE, + INTEL_BTS_AUXTRACE_PRIV_MAX, +}; + +#define INTEL_BTS_AUXTRACE_PRIV_SIZE (INTEL_BTS_AUXTRACE_PRIV_MAX * sizeof(u64)) + +struct auxtrace_record; +struct perf_tool; +union perf_event; +struct perf_session; + +struct auxtrace_record *intel_bts_recording_init(int *err); + +int intel_bts_process_auxtrace_info(union perf_event *event, + struct perf_session *session); + +#endif diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 3c71138..89c91a1 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -462,10 +462,6 @@ static struct perf_pmu *pmu_lookup(const char *name) LIST_HEAD(aliases); __u32 type; - /* No support for intel_bts so disallow it */ - if (!strcmp(name, "intel_bts")) - return NULL; - /* * The pmu data we store & need consists of the pmu * type value and format definitions. Load both right -- cgit v0.10.2 From 60b88d8743892218f82048a3df624f5fc5460843 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:44 +0300 Subject: perf tools: Put itrace options into an asciidoc include perf script, report and inject all have the same itrace options. Put them into an asciidoc include file. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-10-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt new file mode 100644 index 0000000..2ff9466 --- /dev/null +++ b/tools/perf/Documentation/itrace.txt @@ -0,0 +1,22 @@ + i synthesize instructions events + b synthesize branches events + c synthesize branches events (calls only) + r synthesize branches events (returns only) + x synthesize transactions events + e synthesize error events + d create a debug log + g synthesize a call chain (use with i or x) + + The default is all events i.e. the same as --itrace=ibxe + + In addition, the period (default 100000) for instructions events + can be specified in units of: + + i instructions + t ticks + ms milliseconds + us microseconds + ns nanoseconds (default) + + Also the call chain size (default 16, max. 1024) for instructions or + transactions events can be specified. diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index b876ae3..0c721c3 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -48,28 +48,7 @@ OPTIONS Decode Instruction Tracing data, replacing it with synthesized events. 
Options are: - i synthesize instructions events - b synthesize branches events - c synthesize branches events (calls only) - r synthesize branches events (returns only) - x synthesize transactions events - e synthesize error events - d create a debug log - g synthesize a call chain (use with i or x) - - The default is all events i.e. the same as --itrace=ibxe - - In addition, the period (default 100000) for instructions events - can be specified in units of: - - i instructions - t ticks - ms milliseconds - us microseconds - ns nanoseconds (default) - - Also the call chain size (default 16, max. 1024) for instructions or - transactions events can be specified. +include::itrace.txt[] SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index a18ba75..9c7981b 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -331,28 +331,7 @@ OPTIONS --itrace:: Options for decoding instruction tracing data. The options are: - i synthesize instructions events - b synthesize branches events - c synthesize branches events (calls only) - r synthesize branches events (returns only) - x synthesize transactions events - e synthesize error events - d create a debug log - g synthesize a call chain (use with i or x) - - The default is all events i.e. the same as --itrace=ibxe - - In addition, the period (default 100000) for instructions events - can be specified in units of: - - i instructions - t ticks - ms milliseconds - us microseconds - ns nanoseconds (default) - - Also the call chain size (default 16, max. 1024) for instructions or - transactions events can be specified. +include::itrace.txt[] To disable decoding entirely, use --no-itrace. diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 8e9be1f..c0d2479 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -235,28 +235,7 @@ OPTIONS --itrace:: Options for decoding instruction tracing data. The options are: - i synthesize instructions events - b synthesize branches events - c synthesize branches events (calls only) - r synthesize branches events (returns only) - x synthesize transactions events - e synthesize error events - d create a debug log - g synthesize a call chain (use with i or x) - - The default is all events i.e. the same as --itrace=ibxe - - In addition, the period (default 100000) for instructions events - can be specified in units of: - - i instructions - t ticks - ms milliseconds - us microseconds - ns nanoseconds (default) - - Also the call chain size (default 16, max. 1024) for instructions or - transactions events can be specified. +include::itrace.txt[] To disable decoding entirely, use --no-itrace. -- cgit v0.10.2 From 4b715d24f4f14731c7b553cbb8604fe865cb8d3c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:45 +0300 Subject: perf tools: Add example call-graph script Add a script to produce a call-graph from data exported to a postgresql database and derived from a processor trace event like intel_pt or intel_bts. Refer to comments in the scripts call-graph-from-postgresql.py and export-to-postgresql.py for more details on how to set up the environment, install the required packages, etc. 
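For readers unfamiliar with Qt's SQL bindings, the pattern the new script is built around is small: open a QPSQL connection and iterate over a query's rows. The following is a minimal sketch of just that pattern, assuming a local 'pt_example' database like the one created in the example below; the 'comms' query is the same one the script's root tree level uses:

    #!/usr/bin/python2
    # Minimal sketch of the PySide pattern used by call-graph-from-postgresql.py:
    # open the exported database and list the recorded commands.
    from PySide.QtSql import QSqlDatabase, QSqlQuery

    db = QSqlDatabase.addDatabase('QPSQL')
    db.setDatabaseName('pt_example')  # database created by export-to-postgresql.py
    if not db.open():
        raise Exception("Failed to open database: " + db.lastError().text())

    query = QSqlQuery(db)
    if not query.exec_('SELECT id, comm FROM comms'):
        raise Exception("Query failed: " + query.lastError().text())
    while query.next():
        print query.value(0), query.value(1)  # e.g. "1 ls"

    db.close()

The full script wraps this same pattern in a QAbstractItemModel, so a node's children are only queried from the database when that part of the tree is first expanded.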
Committer note: From the scripts, for convenience while reading 'git log': An example of using this script with Intel PT: $ perf record -e intel_pt//u ls $ perf script -s ~/libexec/perf-core/scripts/python/export-to-postgresql.py pt_example branches calls 2015-05-29 12:49:23.464364 Creating database... 2015-05-29 12:49:26.281717 Writing to intermediate files... 2015-05-29 12:49:27.190383 Copying to database... 2015-05-29 12:49:28.140451 Removing intermediate files... 2015-05-29 12:49:28.147451 Adding primary keys 2015-05-29 12:49:28.655683 Adding foreign keys 2015-05-29 12:49:29.365350 Done $ python tools/perf/scripts/python/call-graph-from-postgresql.py pt_example # The result is a GUI window with a tree representing a context-sensitive # call-graph. Expanding a couple of levels of the tree and adjusting column # widths to suit will display something like: Call Graph: pt_example Call Path |Object |Count|Time(ns)|Time(%)|Branch Count|Branch Count(%) v- ls v- 2638:2638 v- _start ld-2.19.so 1 10074071 100.0 211135 100.0 |- unknown unknown 1 13198 0.1 1 0.0 >- _dl_start ld-2.19.so 1 1400980 13.9 19637 9.3 >- _d_linit_internal ld-2.19.so 1 448152 4.4 11094 5.3 v-__libc_start_main@plt ls 1 8211741 81.5 180397 85.4 >- _dl_fixup ld-2.19.so 1 7607 0.1 108 0.1 >- __cxa_atexit libc-2.19.so 1 11737 0.1 10 0.0 >- __libc_csu_init ls 1 10354 0.1 10 0.0 |- _setjmp libc-2.19.so 1 0 0.0 4 0.0 v- main ls 1 8182043 99.6 180254 99.9 Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-11-git-send-email-adrian.hunter@intel.com [ Added 'python-pyside qt-postgresql' to the yum cmdline installing required packages ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/scripts/python/call-graph-from-postgresql.py b/tools/perf/scripts/python/call-graph-from-postgresql.py new file mode 100644 index 0000000..e78fdc2 --- /dev/null +++ b/tools/perf/scripts/python/call-graph-from-postgresql.py @@ -0,0 +1,327 @@ +#!/usr/bin/python2 +# call-graph-from-postgresql.py: create call-graph from postgresql database +# Copyright (c) 2014, Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms and conditions of the GNU General Public License, +# version 2, as published by the Free Software Foundation. +# +# This program is distributed in the hope it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. + +# To use this script you will need to have exported data using the +# export-to-postgresql.py script. Refer to that script for details. +# +# Following on from the example in the export-to-postgresql.py script, a +# call-graph can be displayed for the pt_example database like this: +# +# python tools/perf/scripts/python/call-graph-from-postgresql.py pt_example +# +# Note this script supports connecting to remote databases by setting hostname, +# port, username, password, and dbname e.g. +# +# python tools/perf/scripts/python/call-graph-from-postgresql.py "hostname=myhost username=myuser password=mypassword dbname=pt_example" +# +# The result is a GUI window with a tree representing a context-sensitive +# call-graph. 
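Because nodes are keyed by call path rather than by function, the same function appears separately under each of its callers.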
Expanding a couple of levels of the tree and adjusting column +# widths to suit will display something like: +# +# Call Graph: pt_example +# Call Path Object Count Time(ns) Time(%) Branch Count Branch Count(%) +# v- ls +# v- 2638:2638 +# v- _start ld-2.19.so 1 10074071 100.0 211135 100.0 +# |- unknown unknown 1 13198 0.1 1 0.0 +# >- _dl_start ld-2.19.so 1 1400980 13.9 19637 9.3 +# >- _d_linit_internal ld-2.19.so 1 448152 4.4 11094 5.3 +# v-__libc_start_main@plt ls 1 8211741 81.5 180397 85.4 +# >- _dl_fixup ld-2.19.so 1 7607 0.1 108 0.1 +# >- __cxa_atexit libc-2.19.so 1 11737 0.1 10 0.0 +# >- __libc_csu_init ls 1 10354 0.1 10 0.0 +# |- _setjmp libc-2.19.so 1 0 0.0 4 0.0 +# v- main ls 1 8182043 99.6 180254 99.9 +# +# Points to note: +# The top level is a command name (comm) +# The next level is a thread (pid:tid) +# Subsequent levels are functions +# 'Count' is the number of calls +# 'Time' is the elapsed time until the function returns +# Percentages are relative to the level above +# 'Branch Count' is the total number of branches for that function and all +# functions that it calls + +import sys +from PySide.QtCore import * +from PySide.QtGui import * +from PySide.QtSql import * +from decimal import * + +class TreeItem(): + + def __init__(self, db, row, parent_item): + self.db = db + self.row = row + self.parent_item = parent_item + self.query_done = False; + self.child_count = 0 + self.child_items = [] + self.data = ["", "", "", "", "", "", ""] + self.comm_id = 0 + self.thread_id = 0 + self.call_path_id = 1 + self.branch_count = 0 + self.time = 0 + if not parent_item: + self.setUpRoot() + + def setUpRoot(self): + self.query_done = True + query = QSqlQuery(self.db) + ret = query.exec_('SELECT id, comm FROM comms') + if not ret: + raise Exception("Query failed: " + query.lastError().text()) + while query.next(): + if not query.value(0): + continue + child_item = TreeItem(self.db, self.child_count, self) + self.child_items.append(child_item) + self.child_count += 1 + child_item.setUpLevel1(query.value(0), query.value(1)) + + def setUpLevel1(self, comm_id, comm): + self.query_done = True; + self.comm_id = comm_id + self.data[0] = comm + self.child_items = [] + self.child_count = 0 + query = QSqlQuery(self.db) + ret = query.exec_('SELECT thread_id, ( SELECT pid FROM threads WHERE id = thread_id ), ( SELECT tid FROM threads WHERE id = thread_id ) FROM comm_threads WHERE comm_id = ' + str(comm_id)) + if not ret: + raise Exception("Query failed: " + query.lastError().text()) + while query.next(): + child_item = TreeItem(self.db, self.child_count, self) + self.child_items.append(child_item) + self.child_count += 1 + child_item.setUpLevel2(comm_id, query.value(0), query.value(1), query.value(2)) + + def setUpLevel2(self, comm_id, thread_id, pid, tid): + self.comm_id = comm_id + self.thread_id = thread_id + self.data[0] = str(pid) + ":" + str(tid) + + def getChildItem(self, row): + return self.child_items[row] + + def getParentItem(self): + return self.parent_item + + def getRow(self): + return self.row + + def timePercent(self, b): + if not self.time: + return "0.0" + x = (b * Decimal(100)) / self.time + return str(x.quantize(Decimal('.1'), rounding=ROUND_HALF_UP)) + + def branchPercent(self, b): + if not self.branch_count: + return "0.0" + x = (b * Decimal(100)) / self.branch_count + return str(x.quantize(Decimal('.1'), rounding=ROUND_HALF_UP)) + + def addChild(self, call_path_id, name, dso, count, time, branch_count): + child_item = TreeItem(self.db, self.child_count, self) + child_item.comm_id = 
self.comm_id + child_item.thread_id = self.thread_id + child_item.call_path_id = call_path_id + child_item.branch_count = branch_count + child_item.time = time + child_item.data[0] = name + if dso == "[kernel.kallsyms]": + dso = "[kernel]" + child_item.data[1] = dso + child_item.data[2] = str(count) + child_item.data[3] = str(time) + child_item.data[4] = self.timePercent(time) + child_item.data[5] = str(branch_count) + child_item.data[6] = self.branchPercent(branch_count) + self.child_items.append(child_item) + self.child_count += 1 + + def selectCalls(self): + self.query_done = True; + query = QSqlQuery(self.db) + ret = query.exec_('SELECT id, call_path_id, branch_count, call_time, return_time, ' + '( SELECT name FROM symbols WHERE id = ( SELECT symbol_id FROM call_paths WHERE id = call_path_id ) ), ' + '( SELECT short_name FROM dsos WHERE id = ( SELECT dso_id FROM symbols WHERE id = ( SELECT symbol_id FROM call_paths WHERE id = call_path_id ) ) ), ' + '( SELECT ip FROM call_paths where id = call_path_id ) ' + 'FROM calls WHERE parent_call_path_id = ' + str(self.call_path_id) + ' AND comm_id = ' + str(self.comm_id) + ' AND thread_id = ' + str(self.thread_id) + + 'ORDER BY call_path_id') + if not ret: + raise Exception("Query failed: " + query.lastError().text()) + last_call_path_id = 0 + name = "" + dso = "" + count = 0 + branch_count = 0 + total_branch_count = 0 + time = 0 + total_time = 0 + while query.next(): + if query.value(1) == last_call_path_id: + count += 1 + branch_count += query.value(2) + time += query.value(4) - query.value(3) + else: + if count: + self.addChild(last_call_path_id, name, dso, count, time, branch_count) + last_call_path_id = query.value(1) + name = query.value(5) + dso = query.value(6) + count = 1 + total_branch_count += branch_count + total_time += time + branch_count = query.value(2) + time = query.value(4) - query.value(3) + if count: + self.addChild(last_call_path_id, name, dso, count, time, branch_count) + total_branch_count += branch_count + total_time += time + # Top level does not have time or branch count, so fix that here + if total_branch_count > self.branch_count: + self.branch_count = total_branch_count + if self.branch_count: + for child_item in self.child_items: + child_item.data[6] = self.branchPercent(child_item.branch_count) + if total_time > self.time: + self.time = total_time + if self.time: + for child_item in self.child_items: + child_item.data[4] = self.timePercent(child_item.time) + + def childCount(self): + if not self.query_done: + self.selectCalls() + return self.child_count + + def columnCount(self): + return 7 + + def columnHeader(self, column): + headers = ["Call Path", "Object", "Count ", "Time (ns) ", "Time (%) ", "Branch Count ", "Branch Count (%) "] + return headers[column] + + def getData(self, column): + return self.data[column] + +class TreeModel(QAbstractItemModel): + + def __init__(self, db, parent=None): + super(TreeModel, self).__init__(parent) + self.db = db + self.root = TreeItem(db, 0, None) + + def columnCount(self, parent): + return self.root.columnCount() + + def rowCount(self, parent): + if parent.isValid(): + parent_item = parent.internalPointer() + else: + parent_item = self.root + return parent_item.childCount() + + def headerData(self, section, orientation, role): + if role == Qt.TextAlignmentRole: + if section > 1: + return Qt.AlignRight + if role != Qt.DisplayRole: + return None + if orientation != Qt.Horizontal: + return None + return self.root.columnHeader(section) + + def parent(self, child): + child_item 
= child.internalPointer() + if child_item is self.root: + return QModelIndex() + parent_item = child_item.getParentItem() + return self.createIndex(parent_item.getRow(), 0, parent_item) + + def index(self, row, column, parent): + if parent.isValid(): + parent_item = parent.internalPointer() + else: + parent_item = self.root + child_item = parent_item.getChildItem(row) + return self.createIndex(row, column, child_item) + + def data(self, index, role): + if role == Qt.TextAlignmentRole: + if index.column() > 1: + return Qt.AlignRight + if role != Qt.DisplayRole: + return None + index_item = index.internalPointer() + return index_item.getData(index.column()) + +class MainWindow(QMainWindow): + + def __init__(self, db, dbname, parent=None): + super(MainWindow, self).__init__(parent) + + self.setObjectName("MainWindow") + self.setWindowTitle("Call Graph: " + dbname) + self.move(100, 100) + self.resize(800, 600) + style = self.style() + icon = style.standardIcon(QStyle.SP_MessageBoxInformation) + self.setWindowIcon(icon); + + self.model = TreeModel(db) + + self.view = QTreeView() + self.view.setModel(self.model) + + self.setCentralWidget(self.view) + +if __name__ == '__main__': + if (len(sys.argv) < 2): + print >> sys.stderr, "Usage is: call-graph-from-postgresql.py " + raise Exception("Too few arguments") + + dbname = sys.argv[1] + + db = QSqlDatabase.addDatabase('QPSQL') + + opts = dbname.split() + for opt in opts: + if '=' in opt: + opt = opt.split('=') + if opt[0] == 'hostname': + db.setHostName(opt[1]) + elif opt[0] == 'port': + db.setPort(int(opt[1])) + elif opt[0] == 'username': + db.setUserName(opt[1]) + elif opt[0] == 'password': + db.setPassword(opt[1]) + elif opt[0] == 'dbname': + dbname = opt[1] + else: + dbname = opt + + db.setDatabaseName(dbname) + if not db.open(): + raise Exception("Failed to open database " + dbname + " error: " + db.lastError().text()) + + app = QApplication(sys.argv) + window = MainWindow(db, dbname) + window.show() + err = app.exec_() + db.close() + sys.exit(err) diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 4cdafd8..84a3203 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -15,6 +15,53 @@ import sys import struct import datetime +# To use this script you will need to have installed package python-pyside which +# provides LGPL-licensed Python bindings for Qt. You will also need the package +# libqt4-sql-psql for Qt postgresql support. +# +# The script assumes postgresql is running on the local machine and that the +# user has postgresql permissions to create databases. Examples of installing +# postgresql and adding such a user are: +# +# fedora: +# +# $ sudo yum install postgresql postgresql-server python-pyside qt-postgresql +# $ sudo su - postgres -c initdb +# $ sudo service postgresql start +# $ sudo su - postgres +# $ createuser +# Shall the new role be a superuser? (y/n) y +# +# ubuntu: +# +# $ sudo apt-get install postgresql +# $ sudo su - postgres +# $ createuser +# Shall the new role be a superuser? (y/n) y +# +# An example of using this script with Intel PT: +# +# $ perf record -e intel_pt//u ls +# $ perf script -s ~/libexec/perf-core/scripts/python/export-to-postgresql.py pt_example branches calls +# 2015-05-29 12:49:23.464364 Creating database... +# 2015-05-29 12:49:26.281717 Writing to intermediate files... +# 2015-05-29 12:49:27.190383 Copying to database... 
+# 2015-05-29 12:49:28.140451 Removing intermediate files... +# 2015-05-29 12:49:28.147451 Adding primary keys +# 2015-05-29 12:49:28.655683 Adding foreign keys +# 2015-05-29 12:49:29.365350 Done +# +# To browse the database, psql can be used e.g. +# +# $ psql pt_example +# pt_example=# select * from samples_view where id < 100; +# pt_example=# \d+ +# pt_example=# \d+ samples_view +# pt_example=# \q +# +# An example of using the database is provided by the script +# call-graph-from-postgresql.py. Refer to that script for details. + from PySide.QtSql import * # Need to access PostgreSQL C library directly to use COPY FROM STDIN -- cgit v0.10.2 From 66671d001aeb525b9101e0ccb9062627539de555 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 18 Aug 2015 15:19:50 -0300 Subject: perf tools: Initialize reference counts in map__clone() Map clone was written before we introduced reference counts for maps and dsos, so all that was needed was just a copy and then we would insert it into the new map_groups instance. Fix it by, after copying, initializing the map->refcnt, grabbing a struct dso refcount and resetting pointers that may be used to determine if a map, when deleted, is in an rb_tree. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-pd4mr80o5b9gvk50iineacec@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index ce37e95..b1c475d 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -348,9 +348,18 @@ struct symbol *map__find_symbol_by_name(struct map *map, const char *name, return dso__find_symbol_by_name(map->dso, map->type, name); } -struct map *map__clone(struct map *map) +struct map *map__clone(struct map *from) { - return memdup(map, sizeof(*map)); + struct map *map = memdup(from, sizeof(*map)); + + if (map != NULL) { + atomic_set(&map->refcnt, 1); + RB_CLEAR_NODE(&map->rb_node); + dso__get(map->dso); + map->groups = NULL; + } + + return map; } int map__overlap(struct map *l, struct map *r) -- cgit v0.10.2 From 1c0bd0e891aaed0219010bfe79b32e1b0b82d662 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 21 Aug 2015 10:09:02 +0000 Subject: perf probe: Try to use symbol table if searching debug info failed A problem can occur in a statically linked perf when vmlinux can be found: # perf probe --add sys_epoll_pwait probe-definition(0): sys_epoll_pwait symbol:sys_epoll_pwait file:(null) line:0 offset:0 return:0 lazy:(null) 0 arguments Looking at the vmlinux_path (7 entries long) Using /lib/modules/4.2.0-rc1+/build/vmlinux for symbols Open Debuginfo file: /lib/modules/4.2.0-rc1+/build/vmlinux Try to find probe point from debuginfo. Symbol sys_epoll_pwait address found : ffffffff8122bd40 Matched function: SyS_epoll_pwait Failed to get call frame on 0xffffffff8122bd40 An error occurred in debuginfo analysis (-2). Error: Failed to add events. Reason: No such file or directory (Code: -2) The reason is that, if libdw is statically linked, it can't load libebl_{arch}.so reliably. In this case it is still possible to get the address from /proc/kallsyms. However, perf tries that only when libdw returns -EBADF. This patch gives it another chance to utilize the symbol table, even if libdw returns an error code other than -EBADF.
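To make the fallback concrete, here is a minimal standalone sketch of the intended control flow; the helper names and the specific error value are illustrative stand-ins, not the actual perf-probe internals:

    #include <stdio.h>
    #include <errno.h>

    /* Hypothetical stand-ins for the two probe-point lookup paths. */
    static int find_probe_via_debuginfo(void)
    {
            return -ENOENT; /* e.g. "Failed to get call frame" */
    }

    static int find_probe_via_symbol_table(void)
    {
            printf("Trying to use symbols.\n");
            return 1; /* one trace event found */
    }

    /*
     * Any negative debuginfo result may now fall back to the symbol
     * table; only -EBADF additionally warns about missing dwarf info.
     */
    static int try_to_find_probe(int need_dwarf)
    {
            int ntevs = find_probe_via_debuginfo();

            if (ntevs > 0)
                    return ntevs;
            if (ntevs == -EBADF)
                    fprintf(stderr, "Warning: no dwarf info in the vmlinux\n");
            if (!need_dwarf)
                    return find_probe_via_symbol_table();
            return ntevs;
    }

    int main(void)
    {
            return try_to_find_probe(0) > 0 ? 0 : 1;
    }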
After applying this patch: # perf probe -nv --add sys_epoll_pwait probe-definition(0): sys_epoll_pwait symbol:sys_epoll_pwait file:(null) line:0 offset:0 return:0 lazy:(null) 0 arguments Looking at the vmlinux_path (7 entries long) Using /lib/modules/4.2.0-rc1+/build/vmlinux for symbols Open Debuginfo file: /lib/modules/4.2.0-rc1+/build/vmlinux Try to find probe point from debuginfo. Symbol sys_epoll_pwait address found : ffffffff8122bd40 Matched function: SyS_epoll_pwait Failed to get call frame on 0xffffffff8122bd40 An error occurred in debuginfo analysis (-2). Trying to use symbols. Opening /sys/kernel/debug/tracing/kprobe_events write=1 Added new event: Writing event: p:probe/sys_epoll_pwait _text+2276672 probe:sys_epoll_pwait (on sys_epoll_pwait) You can now use it in all perf tools, such as: perf record -e probe:sys_epoll_pwait -aR sleep 1 Although libdw returns an error (Failed to get call frame), perf tries symbol table and finally gets correct address. Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440151770-129878-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index fe4941a..f07374b 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -705,9 +705,10 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev, } /* Error path : ntevs < 0 */ pr_debug("An error occurred in debuginfo analysis (%d).\n", ntevs); - if (ntevs == -EBADF) { - pr_warning("Warning: No dwarf info found in the vmlinux - " - "please rebuild kernel with CONFIG_DEBUG_INFO=y.\n"); + if (ntevs < 0) { + if (ntevs == -EBADF) + pr_warning("Warning: No dwarf info found in the vmlinux - " + "please rebuild kernel with CONFIG_DEBUG_INFO=y.\n"); if (!need_dwarf) { pr_debug("Trying to use symbols.\n"); return 0; -- cgit v0.10.2 From 5839a5506de30c23d4cfaf49755cd294f5c48368 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 21 Aug 2015 22:05:58 +0300 Subject: perf tools: Fix tarball build broken by pt/bts Fix some include paths and add missing inat_types.h. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/55D77696.60102@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/intel-pt-decoder/inat.c b/tools/perf/util/intel-pt-decoder/inat.c index feeaa50..906d94a 100644 --- a/tools/perf/util/intel-pt-decoder/inat.c +++ b/tools/perf/util/intel-pt-decoder/inat.c @@ -18,7 +18,7 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ -#include +#include "insn.h" /* Attribute tables are generated from opcode map */ #include "inat-tables.c" diff --git a/tools/perf/util/intel-pt-decoder/inat.h b/tools/perf/util/intel-pt-decoder/inat.h index 74a2e31..611645e 100644 --- a/tools/perf/util/intel-pt-decoder/inat.h +++ b/tools/perf/util/intel-pt-decoder/inat.h @@ -20,7 +20,7 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ -#include +#include "inat_types.h" /* * Internal bits. 
Don't use bitmasks directly, because these bits are diff --git a/tools/perf/util/intel-pt-decoder/inat_types.h b/tools/perf/util/intel-pt-decoder/inat_types.h new file mode 100644 index 0000000..cb3c20c --- /dev/null +++ b/tools/perf/util/intel-pt-decoder/inat_types.h @@ -0,0 +1,29 @@ +#ifndef _ASM_X86_INAT_TYPES_H +#define _ASM_X86_INAT_TYPES_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +/* Instruction attributes */ +typedef unsigned int insn_attr_t; +typedef unsigned char insn_byte_t; +typedef signed int insn_value_t; + +#endif diff --git a/tools/perf/util/intel-pt-decoder/insn.c b/tools/perf/util/intel-pt-decoder/insn.c index 8f72b33..47314a6 100644 --- a/tools/perf/util/intel-pt-decoder/insn.c +++ b/tools/perf/util/intel-pt-decoder/insn.c @@ -23,8 +23,8 @@ #else #include #endif -#include -#include +#include "inat.h" +#include "insn.h" /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ diff --git a/tools/perf/util/intel-pt-decoder/insn.h b/tools/perf/util/intel-pt-decoder/insn.h index e7814b7..dd12da0 100644 --- a/tools/perf/util/intel-pt-decoder/insn.h +++ b/tools/perf/util/intel-pt-decoder/insn.h @@ -21,7 +21,7 @@ */ /* insn_attr_t is defined in inat.h */ -#include +#include "inat.h" struct insn_field { union { diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 46980fc..9e4eb8f 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -20,7 +20,7 @@ #include "event.h" -#include +#include "insn.h" #include "inat.c" #include "insn.c" -- cgit v0.10.2 From c0b4dffbc529244d3e4e3bd392f2bffa2d8531a7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 24 Aug 2015 13:33:14 -0300 Subject: perf annotate: Reset the dso find_symbol cache when removing symbols The 'annotate' tool does some filtering on the entries in a DSO but forgot to reset the cache done in dso__find_symbol(), causing a SEGV: [root@zoo ~]# perf annotate netlink_poll perf: Segmentation fault -------- backtrace -------- perf[0x526ceb] /lib64/libc.so.6(+0x34960)[0x7faedfbe0960] perf(rb_erase+0x223)[0x499d63] perf[0x4213e9] perf[0x4bc123] perf[0x4bc621] perf[0x4bf26b] perf[0x4bc855] perf(perf_session__process_events+0x340)[0x4bddc0] perf(cmd_annotate+0x6bb)[0x421b5b] perf[0x479063] perf(main+0x60a)[0x42098a] /lib64/libc.so.6(__libc_start_main+0xf0)[0x7faedfbcbfe0] perf[0x420aa9] [0x0] [root@zoo ~]# Fix it by resetting the find cache when removing symbols.
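The underlying pattern is generic: a memoized last-result pointer that outlives the object it points to. A self-contained sketch of such a one-entry lookup cache, and of the reset that symbol removal must perform (types simplified, not the real struct dso layout):

    #include <stdlib.h>

    struct symbol { unsigned long start; };

    /* One-entry front end cache, like the per-map-type one kept for
     * dso__find_symbol(). */
    struct find_cache {
            unsigned long addr;
            struct symbol *symbol;  /* dangles if the symbol is freed */
    };

    static struct symbol *slow_find(unsigned long addr)
    {
            struct symbol *sym = malloc(sizeof(*sym));

            if (sym)
                    sym->start = addr;
            return sym;
    }

    static struct symbol *cached_find(struct find_cache *c, unsigned long addr)
    {
            if (c->symbol && c->addr == addr)
                    return c->symbol;  /* fast path: may be stale */
            c->addr = addr;
            c->symbol = slow_find(addr);
            return c->symbol;
    }

    /* Whoever deletes symbols must also invalidate the cache, or the
     * next cached_find() for the same address returns freed memory. */
    static void cache_reset(struct find_cache *c)
    {
            c->addr = 0;
            c->symbol = NULL;
    }

    int main(void)
    {
            struct find_cache c = { 0, NULL };
            struct symbol *sym = cached_find(&c, 0x1234);

            free(sym);          /* symbol removed, as builtin-annotate does */
            cache_reset(&c);    /* without this, the next lookup is stale */
            sym = cached_find(&c, 0x1234);  /* fresh allocation, not garbage */
            free(sym);
            return 0;
    }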
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Fixes: b685ac22b436 ("perf symbols: Add front end cache for DSO symbol lookup") Link: http://lkml.kernel.org/n/tip-b2y9x46y0t8yem1ive41zqyp@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index a32a64e..8edc205 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -67,6 +67,7 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel, rb_erase(&al->sym->rb_node, &al->map->dso->symbols[al->map->type]); symbol__delete(al->sym); + dso__reset_find_symbol_cache(al->map->dso); } return 0; } diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index c73276d..fc8db9c 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -324,6 +324,8 @@ struct dso *__dsos__findnew(struct dsos *dsos, const char *name); struct dso *dsos__findnew(struct dsos *dsos, const char *name); bool __dsos__read_build_ids(struct list_head *head, bool with_hits); +void dso__reset_find_symbol_cache(struct dso *dso); + size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, bool (skip)(struct dso *dso, int parm), int parm); size_t __dsos__fprintf(struct list_head *head, FILE *fp); diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 42e98ab..46ae053 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -441,6 +441,16 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols, return &s->sym; } +void dso__reset_find_symbol_cache(struct dso *dso) +{ + enum map_type type; + + for (type = MAP__FUNCTION; type <= MAP__VARIABLE; ++type) { + dso->last_find_result[type].addr = 0; + dso->last_find_result[type].symbol = NULL; + } +} + struct symbol *dso__find_symbol(struct dso *dso, enum map_type type, u64 addr) { -- cgit v0.10.2 From 1e259ad4a25e37eb298bb5772b9a51e70bed7782 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 24 Aug 2015 16:18:26 -0300 Subject: perf ui tui progress: Implement the ui_progress_ops->finish() method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So that we can erase the progress bar after we're done with it, avoiding things like: ------------------------------------------------------------------- ┌─Error:──────────────────────────────────────────────────────┐ │Can't annotate unmapped_area_topdown: │ │ │ │No vmlinux file with build id a826726b5ddacfab1f0bade868f1a79│ │was found in the path. │ │ │ │Note that annotation using /proc/kcore requires CAP_SYS_RAWIO│ ┌Processin│ │──┐ │ │Please use: │ │ └─────────│ │──┘ │ perf buildid-cache -vu vmlinux │ │ │ │or: │ │ │ │ --vmlinux vmlinux │ │ │ │ │ │Press any key... │ └─────────────────────────────────────────────────────────────┘ Can't annotate unmapped_area_topdown: ------------------------------------------------------------------- I.e. that finished progress bar behind the error window. It is not a problem when we end up redrawing the whole screen, but it's ugly when we present such error windows, so provide a TUI method so that code like the above may avoid this situation, as will be done with the annotation code in the next cset.
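The shape of the change is the familiar optional-callback pattern: the ops table grows a method that only some front ends implement, and callers go through a null-checked wrapper; the real slang-based implementation follows in the diff below. A minimal sketch of the pattern (modeled on, not identical to, the ui_progress code):

    #include <stdio.h>

    struct ui_progress { const char *title; };

    struct ui_progress_ops {
            void (*update)(struct ui_progress *p);
            void (*finish)(void);   /* optional: erase the bar when done */
    };

    static void stdio_update(struct ui_progress *p)
    {
            fprintf(stderr, "%s...\r", p->title);
    }

    /* The stdio backend provides no finish method; a TUI backend would
     * install one that blanks the progress bar's screen region. */
    static const struct ui_progress_ops stdio_ops = { .update = stdio_update };
    static const struct ui_progress_ops *ops = &stdio_ops;

    static void progress_finish(void)
    {
            if (ops->finish)        /* safe for backends without one */
                    ops->finish();
    }

    int main(void)
    {
            struct ui_progress p = { "Processing events" };

            ops->update(&p);
            progress_finish();
            return 0;
    }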
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-qvktnojzwwe37pweging058t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/tui/progress.c b/tools/perf/ui/tui/progress.c index c61d14b..c4b9900 100644 --- a/tools/perf/ui/tui/progress.c +++ b/tools/perf/ui/tui/progress.c @@ -33,9 +33,26 @@ static void tui_progress__update(struct ui_progress *p) pthread_mutex_unlock(&ui__lock); } +static void tui_progress__finish(void) +{ + int y; + + if (use_browser <= 0) + return; + + ui__refresh_dimensions(false); + pthread_mutex_lock(&ui__lock); + y = SLtt_Screen_Rows / 2 - 2; + SLsmg_set_color(0); + SLsmg_fill_region(y, 0, 3, SLtt_Screen_Cols, ' '); + SLsmg_refresh(); + pthread_mutex_unlock(&ui__lock); +} + static struct ui_progress_ops tui_progress__ops = { - .update = tui_progress__update, + .update = tui_progress__update, + .finish = tui_progress__finish, }; void tui_progress__init(void) -- cgit v0.10.2 From 5c9ce1e644c1919ac4bff4394a4c372f47a89b4e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 24 Aug 2015 17:16:22 -0300 Subject: perf ordered_events: Clear the progress bar at the end of a flush We were depending on the next screen operation after a flush() being one that would redraw the whole screen, so that the progress bar would be overwritten; when that didn't happen, a screen artifact of, say, an error dialog window would be overlaid on top of the progress bar. Fix it by calling ui_progress__finish(), which now has a TUI implementation. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-el0fyw6duemnx62lydjzhs8c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c index 52be201..b1b9e23 100644 --- a/tools/perf/util/ordered-events.c +++ b/tools/perf/util/ordered-events.c @@ -220,6 +220,9 @@ static int __ordered_events__flush(struct ordered_events *oe) else if (last_ts <= limit) oe->last = list_entry(head->prev, struct ordered_event, list); + if (show_progress) + ui_progress__finish(); + return 0; } -- cgit v0.10.2 From 2a21d03686881331b0af0471588674e7e896eeb2 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:48 +0300 Subject: perf tools: Fix Intel PT 'instructions' sample period The period on synthesized 'instructions' samples was being set to a fixed value, whereas the correct value is the number of instructions since the last sample, which is a value that the decoder can provide. So do it that way.
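The fix reduces to a running delta: each synthesized sample's period is the decoder's cumulative instruction count minus the count recorded when this queue last emitted a sample. A standalone sketch of that bookkeeping (field names abbreviated from the patch):

    #include <stdio.h>
    #include <stdint.h>

    struct queue { uint64_t last_insn_cnt; };

    /* Period = instructions retired since this queue's previous sample. */
    static uint64_t insn_sample_period(struct queue *q, uint64_t tot_insn_cnt)
    {
            uint64_t period = tot_insn_cnt - q->last_insn_cnt;

            q->last_insn_cnt = tot_insn_cnt;
            return period;
    }

    int main(void)
    {
            struct queue q = { 0 };

            printf("%llu\n", (unsigned long long)insn_sample_period(&q, 1000)); /* 1000 */
            printf("%llu\n", (unsigned long long)insn_sample_period(&q, 1800)); /* 800 */
            return 0;
    }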
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-14-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index f8ac462..56790ea 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -108,6 +108,7 @@ struct intel_pt_decoder { uint64_t sign_bits; uint64_t period; enum intel_pt_period_type period_type; + uint64_t tot_insn_cnt; uint64_t period_insn_cnt; uint64_t period_mask; uint64_t period_ticks; @@ -559,6 +560,7 @@ static int intel_pt_walk_insn(struct intel_pt_decoder *decoder, err = decoder->walk_insn(intel_pt_insn, &insn_cnt, &decoder->ip, ip, max_insn_cnt, decoder->data); + decoder->tot_insn_cnt += insn_cnt; decoder->timestamp_insn_cnt += insn_cnt; decoder->period_insn_cnt += insn_cnt; @@ -1529,6 +1531,7 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder) decoder->state.timestamp = decoder->timestamp; decoder->state.est_timestamp = intel_pt_est_timestamp(decoder); decoder->state.cr3 = decoder->cr3; + decoder->state.tot_insn_cnt = decoder->tot_insn_cnt; if (err) decoder->state.from_ip = decoder->ip; diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h index 4c488023..cbf5704 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h @@ -58,6 +58,7 @@ struct intel_pt_state { uint64_t from_ip; uint64_t to_ip; uint64_t cr3; + uint64_t tot_insn_cnt; uint64_t timestamp; uint64_t est_timestamp; uint64_t trace_nr; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index a5acd2f..3b34a64 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -126,6 +126,7 @@ struct intel_pt_queue { u64 timestamp; u32 flags; u16 insn_len; + u64 last_insn_cnt; }; static void intel_pt_dump(struct intel_pt *pt __maybe_unused, @@ -920,11 +921,13 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq) sample.addr = ptq->state->to_ip; sample.id = ptq->pt->instructions_id; sample.stream_id = ptq->pt->instructions_id; - sample.period = ptq->pt->instructions_sample_period; + sample.period = ptq->state->tot_insn_cnt - ptq->last_insn_cnt; sample.cpu = ptq->cpu; sample.flags = ptq->flags; sample.insn_len = ptq->insn_len; + ptq->last_insn_cnt = ptq->state->tot_insn_cnt; + if (pt->synth_opts.callchain) { thread_stack__sample(ptq->thread, ptq->chain, pt->synth_opts.callchain_sz, sample.ip); -- cgit v0.10.2 From bc9b6bf07c8b3f4e85509f9b3a552c86e567b4ae Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:52 +0300 Subject: perf tools: Add Intel PT support for PSB periods The PSB packet is a synchronization packet that provides a starting point for decoding or recovery from errors. This patch adds support for a new Intel PT feature that allows the frequency of PSB packets to be specified. Support for this feature is indicated by /sys/bus/event_source/devices/intel_pt/caps/psb_cyc which contains "1" if the feature is supported and "0" otherwise. The PSB period can be specified as a PMU config term e.g. perf record -e intel_pt/psb_period=2/u sleep 1 The default value is 3 or the nearest lower value that is supported. 0 is always supported. 
Valid values are given by: /sys/bus/event_source/devices/intel_pt/caps/psb_periods which contains a hexadecimal value, the bits of which represent valid values e.g. bit 2 set means value 2 is valid. The value is converted to the approximate number of trace bytes between PSB packets as: 2 ^ (value + 11) e.g. value 3 means 16KiB bytes between PSBs If an invalid value is entered, the error message will give a list of valid values e.g. $ perf record -e intel_pt/psb_period=15/u uname Invalid psb_period for intel_pt. Valid values are: 0-5 tools/perf/Documentation/intel-pt.txt is updated in a later patch as there are a number of new features being added. For more information about PSB periods refer to the Intel 64 and IA-32 Architectures SDM Chapter 36 Intel Processor Trace from June 2015 or later. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-18-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index da7d2c1..145975b 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -99,17 +99,121 @@ static int intel_pt_parse_terms(struct list_head *formats, const char *str, return intel_pt_parse_terms_with_default(formats, str, config); } -static size_t intel_pt_psb_period(struct perf_pmu *intel_pt_pmu __maybe_unused, - struct perf_evlist *evlist __maybe_unused) +static u64 intel_pt_masked_bits(u64 mask, u64 bits) { - return 256; + const u64 top_bit = 1ULL << 63; + u64 res = 0; + int i; + + for (i = 0; i < 64; i++) { + if (mask & top_bit) { + res <<= 1; + if (bits & top_bit) + res |= 1; + } + mask <<= 1; + bits <<= 1; + } + + return res; +} + +static int intel_pt_read_config(struct perf_pmu *intel_pt_pmu, const char *str, + struct perf_evlist *evlist, u64 *res) +{ + struct perf_evsel *evsel; + u64 mask; + + *res = 0; + + mask = perf_pmu__format_bits(&intel_pt_pmu->format, str); + if (!mask) + return -EINVAL; + + evlist__for_each(evlist, evsel) { + if (evsel->attr.type == intel_pt_pmu->type) { + *res = intel_pt_masked_bits(mask, evsel->attr.config); + return 0; + } + } + + return -EINVAL; +} + +static size_t intel_pt_psb_period(struct perf_pmu *intel_pt_pmu, + struct perf_evlist *evlist) +{ + u64 val; + int err, topa_multiple_entries; + size_t psb_period; + + if (perf_pmu__scan_file(intel_pt_pmu, "caps/topa_multiple_entries", + "%d", &topa_multiple_entries) != 1) + topa_multiple_entries = 0; + + /* + * Use caps/topa_multiple_entries to indicate early hardware that had + * extra frequent PSBs. 
+ */ + if (!topa_multiple_entries) { + psb_period = 256; + goto out; + } + + err = intel_pt_read_config(intel_pt_pmu, "psb_period", evlist, &val); + if (err) + val = 0; + + psb_period = 1 << (val + 11); +out: + pr_debug2("%s psb_period %zu\n", intel_pt_pmu->name, psb_period); + return psb_period; +} + +static int intel_pt_pick_bit(int bits, int target) +{ + int pos, pick = -1; + + for (pos = 0; bits; bits >>= 1, pos++) { + if (bits & 1) { + if (pos <= target || pick < 0) + pick = pos; + if (pos >= target) + break; + } + } + + return pick; } static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu) { + char buf[256]; + int psb_cyc, psb_periods, psb_period; + int pos = 0; u64 config; - intel_pt_parse_terms(&intel_pt_pmu->format, "tsc", &config); + pos += scnprintf(buf + pos, sizeof(buf) - pos, "tsc"); + + if (perf_pmu__scan_file(intel_pt_pmu, "caps/psb_cyc", "%d", + &psb_cyc) != 1) + psb_cyc = 1; + + if (psb_cyc) { + if (perf_pmu__scan_file(intel_pt_pmu, "caps/psb_periods", "%x", + &psb_periods) != 1) + psb_periods = 0; + if (psb_periods) { + psb_period = intel_pt_pick_bit(psb_periods, 3); + pos += scnprintf(buf + pos, sizeof(buf) - pos, + ",psb_period=%d", psb_period); + } + } + + pr_debug2("%s default config: %s\n", intel_pt_pmu->name, buf); + + intel_pt_parse_terms(&intel_pt_pmu->format, buf, &config); + return config; } @@ -239,6 +343,103 @@ static int intel_pt_track_switches(struct perf_evlist *evlist) return 0; } +static void intel_pt_valid_str(char *str, size_t len, u64 valid) +{ + unsigned int val, last = 0, state = 1; + int p = 0; + + str[0] = '\0'; + + for (val = 0; val <= 64; val++, valid >>= 1) { + if (valid & 1) { + last = val; + switch (state) { + case 0: + p += scnprintf(str + p, len - p, ","); + /* Fall through */ + case 1: + p += scnprintf(str + p, len - p, "%u", val); + state = 2; + break; + case 2: + state = 3; + break; + case 3: + state = 4; + break; + default: + break; + } + } else { + switch (state) { + case 3: + p += scnprintf(str + p, len - p, ",%u", last); + state = 0; + break; + case 4: + p += scnprintf(str + p, len - p, "-%u", last); + state = 0; + break; + default: + break; + } + if (state != 1) + state = 0; + } + } +} + +static int intel_pt_val_config_term(struct perf_pmu *intel_pt_pmu, + const char *caps, const char *name, + const char *supported, u64 config) +{ + char valid_str[256]; + unsigned int shift; + unsigned long long valid; + u64 bits; + int ok; + + if (perf_pmu__scan_file(intel_pt_pmu, caps, "%llx", &valid) != 1) + valid = 0; + + if (supported && + perf_pmu__scan_file(intel_pt_pmu, supported, "%d", &ok) == 1 && !ok) + valid = 0; + + valid |= 1; + + bits = perf_pmu__format_bits(&intel_pt_pmu->format, name); + + config &= bits; + + for (shift = 0; bits && !(bits & 1); shift++) + bits >>= 1; + + config >>= shift; + + if (config > 63) + goto out_err; + + if (valid & (1 << config)) + return 0; +out_err: + intel_pt_valid_str(valid_str, sizeof(valid_str), valid); + pr_err("Invalid %s for %s. 
Valid values are: %s\n", + name, INTEL_PT_PMU_NAME, valid_str); + return -EINVAL; +} + +static int intel_pt_validate_config(struct perf_pmu *intel_pt_pmu, + struct perf_evsel *evsel) +{ + if (!evsel) + return 0; + + return intel_pt_val_config_term(intel_pt_pmu, "caps/psb_periods", + "psb_period", "caps/psb_cyc", + evsel->attr.config); +} + static int intel_pt_recording_options(struct auxtrace_record *itr, struct perf_evlist *evlist, struct record_opts *opts) @@ -251,6 +452,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, const struct cpu_map *cpus = evlist->cpus; bool privileged = geteuid() == 0 || perf_event_paranoid() < 0; u64 tsc_bit; + int err; ptr->evlist = evlist; ptr->snapshot_mode = opts->auxtrace_snapshot_mode; @@ -281,6 +483,10 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, if (!opts->full_auxtrace) return 0; + err = intel_pt_validate_config(intel_pt_pmu, intel_pt_evsel); + if (err) + return err; + /* Set default sizes for snapshot mode */ if (opts->auxtrace_snapshot_mode) { size_t psb_period = intel_pt_psb_period(intel_pt_pmu, evlist); @@ -366,8 +572,6 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * threads. */ if (have_timing_info && !cpu_map__empty(cpus)) { - int err; - err = intel_pt_track_switches(evlist); if (err == -EPERM) pr_debug2("Unable to select sched:sched_switch\n"); @@ -394,7 +598,6 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, /* Add dummy event to keep tracking */ if (opts->full_auxtrace) { struct perf_evsel *tracking_evsel; - int err; err = parse_events(evlist, "dummy:u", NULL); if (err) -- cgit v0.10.2 From 3d49807870f08d6f3406b77efd94bb3788372162 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:53 +0300 Subject: perf tools: Add new Intel PT packet definitions New features have been added to Intel PT which include a number of new packet definitions. This patch adds packet definitions for new packets: TMA, MTC, CYC, VMCS, TRACESTOP and MNT. Also another bit in PIP is defined. This patch only adds support for the definitions. Later patches add support for decoding TMA, MTC, CYC and TRACESTOP which is where those packets are explained. VMCS and the newly defined bit in PIP are used with virtualization which is not supported yet. MNT is a maintenance packet which the decoder should ignore. For details, refer to the June 2015 or later Intel 64 and IA-32 Architectures SDM Chapter 36 Intel Processor Trace. 
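All of the intel_pt_get_*() helpers added here share one calling convention: look at the bytes at the front of the buffer, fill in a packet descriptor, and return the number of bytes consumed, or a negative "need more bytes" / "bad packet" code. A toy decoder in that style; the PAD (0x00) and OVF (0x02 0xf3) encodings match the series, everything else is collapsed for brevity:

    #include <stddef.h>
    #include <stdio.h>

    enum pkt_type { PKT_PAD, PKT_OVF, PKT_UNKNOWN };

    #define NEED_MORE_BYTES (-1)
    #define BAD_PACKET      (-2)

    struct pkt { enum pkt_type type; };

    /* Return bytes consumed on success, a negative code otherwise. */
    static int get_packet(const unsigned char *buf, size_t len, struct pkt *p)
    {
            if (len < 1)
                    return NEED_MORE_BYTES;
            if (buf[0] == 0x00) {           /* PAD: 1 byte, no payload */
                    p->type = PKT_PAD;
                    return 1;
            }
            if (buf[0] == 0x02) {           /* extended opcode space */
                    if (len < 2)
                            return NEED_MORE_BYTES;
                    if (buf[1] == 0xf3) {   /* OVF */
                            p->type = PKT_OVF;
                            return 2;
                    }
            }
            p->type = PKT_UNKNOWN;
            return BAD_PACKET;
    }

    int main(void)
    {
            const unsigned char buf[] = { 0x02, 0xf3, 0x00 };
            struct pkt p;
            int n = get_packet(buf, sizeof(buf), &p);

            printf("consumed %d, type %d\n", n, p.type);
            return 0;
    }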
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-19-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 56790ea..4a0e9fb 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -923,6 +923,7 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) case INTEL_PT_TIP_PGE: case INTEL_PT_TIP: case INTEL_PT_TNT: + case INTEL_PT_TRACESTOP: case INTEL_PT_BAD: case INTEL_PT_PSB: intel_pt_log("ERROR: Unexpected packet\n"); @@ -935,6 +936,9 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) intel_pt_calc_tsc_timestamp(decoder); break; + case INTEL_PT_TMA: + break; + case INTEL_PT_CBR: decoder->cbr = decoder->packet.payload; break; @@ -944,7 +948,7 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) break; case INTEL_PT_PIP: - decoder->cr3 = decoder->packet.payload; + decoder->cr3 = decoder->packet.payload & (BIT63 - 1); break; case INTEL_PT_FUP: @@ -956,6 +960,12 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) intel_pt_update_in_tx(decoder); break; + case INTEL_PT_MTC: + break; + + case INTEL_PT_CYC: + case INTEL_PT_VMCS: + case INTEL_PT_MNT: case INTEL_PT_PAD: default: break; @@ -983,8 +993,10 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) switch (decoder->packet.type) { case INTEL_PT_TNT: case INTEL_PT_FUP: + case INTEL_PT_TRACESTOP: case INTEL_PT_PSB: case INTEL_PT_TSC: + case INTEL_PT_TMA: case INTEL_PT_CBR: case INTEL_PT_MODE_TSX: case INTEL_PT_BAD: @@ -1032,13 +1044,21 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) return 0; case INTEL_PT_PIP: - decoder->cr3 = decoder->packet.payload; + decoder->cr3 = decoder->packet.payload & (BIT63 - 1); + break; + + case INTEL_PT_MTC: + break; + + case INTEL_PT_CYC: break; case INTEL_PT_MODE_EXEC: decoder->exec_mode = decoder->packet.payload; break; + case INTEL_PT_VMCS: + case INTEL_PT_MNT: case INTEL_PT_PAD: break; @@ -1122,6 +1142,9 @@ next: } return intel_pt_walk_fup_tip(decoder); + case INTEL_PT_TRACESTOP: + break; + case INTEL_PT_PSB: intel_pt_clear_stack(&decoder->stack); err = intel_pt_walk_psbend(decoder); @@ -1132,13 +1155,22 @@ next: break; case INTEL_PT_PIP: - decoder->cr3 = decoder->packet.payload; + decoder->cr3 = decoder->packet.payload & (BIT63 - 1); + break; + + case INTEL_PT_MTC: break; case INTEL_PT_TSC: intel_pt_calc_tsc_timestamp(decoder); break; + case INTEL_PT_TMA: + break; + + case INTEL_PT_CYC: + break; + case INTEL_PT_CBR: decoder->cbr = decoder->packet.payload; break; @@ -1162,6 +1194,8 @@ next: return intel_pt_bug(decoder); case INTEL_PT_PSBEND: + case INTEL_PT_VMCS: + case INTEL_PT_MNT: case INTEL_PT_PAD: break; @@ -1202,16 +1236,25 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) } break; + case INTEL_PT_MTC: + break; + case INTEL_PT_TSC: intel_pt_calc_tsc_timestamp(decoder); break; + case INTEL_PT_TMA: + break; + + case INTEL_PT_CYC: + break; + case INTEL_PT_CBR: decoder->cbr = decoder->packet.payload; break; case INTEL_PT_PIP: - decoder->cr3 = decoder->packet.payload; + decoder->cr3 = decoder->packet.payload & (BIT63 - 1); break; case INTEL_PT_MODE_EXEC: @@ -1222,6 +1265,7 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) intel_pt_update_in_tx(decoder); break; + case INTEL_PT_TRACESTOP: case INTEL_PT_TNT: 
intel_pt_log("ERROR: Unexpected packet\n"); if (decoder->ip) @@ -1240,6 +1284,8 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) return 0; case INTEL_PT_PSB: + case INTEL_PT_VMCS: + case INTEL_PT_MNT: case INTEL_PT_PAD: default: break; @@ -1282,16 +1328,25 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) intel_pt_set_last_ip(decoder); break; + case INTEL_PT_MTC: + break; + case INTEL_PT_TSC: intel_pt_calc_tsc_timestamp(decoder); break; + case INTEL_PT_TMA: + break; + + case INTEL_PT_CYC: + break; + case INTEL_PT_CBR: decoder->cbr = decoder->packet.payload; break; case INTEL_PT_PIP: - decoder->cr3 = decoder->packet.payload; + decoder->cr3 = decoder->packet.payload & (BIT63 - 1); break; case INTEL_PT_MODE_EXEC: @@ -1308,6 +1363,9 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) case INTEL_PT_BAD: /* Does not happen */ return intel_pt_bug(decoder); + case INTEL_PT_TRACESTOP: + break; + case INTEL_PT_PSB: err = intel_pt_walk_psb(decoder); if (err) @@ -1321,6 +1379,8 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) case INTEL_PT_TNT: case INTEL_PT_PSBEND: + case INTEL_PT_VMCS: + case INTEL_PT_MNT: case INTEL_PT_PAD: default: break; diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c index 988c82c..b1257c8 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c @@ -24,6 +24,8 @@ #define BIT63 ((uint64_t)1 << 63) +#define NR_FLAG BIT63 + #if __BYTE_ORDER == __BIG_ENDIAN #define le16_to_cpu bswap_16 #define le32_to_cpu bswap_32 @@ -46,15 +48,21 @@ static const char * const packet_name[] = { [INTEL_PT_TIP_PGD] = "TIP.PGD", [INTEL_PT_TIP_PGE] = "TIP.PGE", [INTEL_PT_TSC] = "TSC", + [INTEL_PT_TMA] = "TMA", [INTEL_PT_MODE_EXEC] = "MODE.Exec", [INTEL_PT_MODE_TSX] = "MODE.TSX", + [INTEL_PT_MTC] = "MTC", [INTEL_PT_TIP] = "TIP", [INTEL_PT_FUP] = "FUP", + [INTEL_PT_CYC] = "CYC", + [INTEL_PT_VMCS] = "VMCS", [INTEL_PT_PSB] = "PSB", [INTEL_PT_PSBEND] = "PSBEND", [INTEL_PT_CBR] = "CBR", + [INTEL_PT_TRACESTOP] = "TraceSTOP", [INTEL_PT_PIP] = "PIP", [INTEL_PT_OVF] = "OVF", + [INTEL_PT_MNT] = "MNT", }; const char *intel_pt_pkt_name(enum intel_pt_pkt_type type) @@ -96,10 +104,18 @@ static int intel_pt_get_pip(const unsigned char *buf, size_t len, packet->type = INTEL_PT_PIP; memcpy_le64(&payload, buf + 2, 6); packet->payload = payload >> 1; + if (payload & 1) + packet->payload |= NR_FLAG; return 8; } +static int intel_pt_get_tracestop(struct intel_pt_pkt *packet) +{ + packet->type = INTEL_PT_TRACESTOP; + return 2; +} + static int intel_pt_get_cbr(const unsigned char *buf, size_t len, struct intel_pt_pkt *packet) { @@ -110,6 +126,24 @@ static int intel_pt_get_cbr(const unsigned char *buf, size_t len, return 4; } +static int intel_pt_get_vmcs(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + unsigned int count = (52 - 5) >> 3; + + if (count < 1 || count > 7) + return INTEL_PT_BAD_PACKET; + + if (len < count + 2) + return INTEL_PT_NEED_MORE_BYTES; + + packet->type = INTEL_PT_VMCS; + packet->count = count; + memcpy_le64(&packet->payload, buf + 2, count); + + return count + 2; +} + static int intel_pt_get_ovf(struct intel_pt_pkt *packet) { packet->type = INTEL_PT_OVF; @@ -139,12 +173,49 @@ static int intel_pt_get_psbend(struct intel_pt_pkt *packet) return 2; } +static int intel_pt_get_tma(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 7) + 
return INTEL_PT_NEED_MORE_BYTES; + + packet->type = INTEL_PT_TMA; + packet->payload = buf[2] | (buf[3] << 8); + packet->count = buf[5] | ((buf[6] & BIT(0)) << 8); + return 7; +} + static int intel_pt_get_pad(struct intel_pt_pkt *packet) { packet->type = INTEL_PT_PAD; return 1; } +static int intel_pt_get_mnt(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 11) + return INTEL_PT_NEED_MORE_BYTES; + packet->type = INTEL_PT_MNT; + memcpy_le64(&packet->payload, buf + 3, 8); + return 11 +; +} + +static int intel_pt_get_3byte(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 3) + return INTEL_PT_NEED_MORE_BYTES; + + switch (buf[2]) { + case 0x88: /* MNT */ + return intel_pt_get_mnt(buf, len, packet); + default: + return INTEL_PT_BAD_PACKET; + } +} + static int intel_pt_get_ext(const unsigned char *buf, size_t len, struct intel_pt_pkt *packet) { @@ -156,14 +227,22 @@ static int intel_pt_get_ext(const unsigned char *buf, size_t len, return intel_pt_get_long_tnt(buf, len, packet); case 0x43: /* PIP */ return intel_pt_get_pip(buf, len, packet); + case 0x83: /* TraceStop */ + return intel_pt_get_tracestop(packet); case 0x03: /* CBR */ return intel_pt_get_cbr(buf, len, packet); + case 0xc8: /* VMCS */ + return intel_pt_get_vmcs(buf, len, packet); case 0xf3: /* OVF */ return intel_pt_get_ovf(packet); case 0x82: /* PSB */ return intel_pt_get_psb(buf, len, packet); case 0x23: /* PSBEND */ return intel_pt_get_psbend(packet); + case 0x73: /* TMA */ + return intel_pt_get_tma(buf, len, packet); + case 0xC3: /* 3-byte header */ + return intel_pt_get_3byte(buf, len, packet); default: return INTEL_PT_BAD_PACKET; } @@ -187,6 +266,28 @@ static int intel_pt_get_short_tnt(unsigned int byte, return 1; } +static int intel_pt_get_cyc(unsigned int byte, const unsigned char *buf, + size_t len, struct intel_pt_pkt *packet) +{ + unsigned int offs = 1, shift; + uint64_t payload = byte >> 3; + + byte >>= 2; + len -= 1; + for (shift = 5; byte & 1; shift += 7) { + if (offs > 9) + return INTEL_PT_BAD_PACKET; + if (len < offs) + return INTEL_PT_NEED_MORE_BYTES; + byte = buf[offs++]; + payload |= (byte >> 1) << shift; + } + + packet->type = INTEL_PT_CYC; + packet->payload = payload; + return offs; +} + static int intel_pt_get_ip(enum intel_pt_pkt_type type, unsigned int byte, const unsigned char *buf, size_t len, struct intel_pt_pkt *packet) @@ -269,6 +370,16 @@ static int intel_pt_get_tsc(const unsigned char *buf, size_t len, return 8; } +static int intel_pt_get_mtc(const unsigned char *buf, size_t len, + struct intel_pt_pkt *packet) +{ + if (len < 2) + return INTEL_PT_NEED_MORE_BYTES; + packet->type = INTEL_PT_MTC; + packet->payload = buf[1]; + return 2; +} + static int intel_pt_do_get_packet(const unsigned char *buf, size_t len, struct intel_pt_pkt *packet) { @@ -288,6 +399,9 @@ static int intel_pt_do_get_packet(const unsigned char *buf, size_t len, return intel_pt_get_short_tnt(byte, packet); } + if ((byte & 2)) + return intel_pt_get_cyc(byte, buf, len, packet); + switch (byte & 0x1f) { case 0x0D: return intel_pt_get_ip(INTEL_PT_TIP, byte, buf, len, packet); @@ -305,6 +419,8 @@ static int intel_pt_do_get_packet(const unsigned char *buf, size_t len, return intel_pt_get_mode(buf, len, packet); case 0x19: return intel_pt_get_tsc(buf, len, packet); + case 0x59: + return intel_pt_get_mtc(buf, len, packet); default: return INTEL_PT_BAD_PACKET; } @@ -329,7 +445,7 @@ int intel_pt_get_packet(const unsigned char *buf, size_t len, int intel_pt_pkt_desc(const struct 
intel_pt_pkt *packet, char *buf, size_t buf_len) { - int ret, i; + int ret, i, nr; unsigned long long payload = packet->payload; const char *name = intel_pt_pkt_name(packet->type); @@ -338,6 +454,7 @@ int intel_pt_pkt_desc(const struct intel_pt_pkt *packet, char *buf, case INTEL_PT_PAD: case INTEL_PT_PSB: case INTEL_PT_PSBEND: + case INTEL_PT_TRACESTOP: case INTEL_PT_OVF: return snprintf(buf, buf_len, "%s", name); case INTEL_PT_TNT: { @@ -371,17 +488,16 @@ int intel_pt_pkt_desc(const struct intel_pt_pkt *packet, char *buf, case INTEL_PT_FUP: if (!(packet->count)) return snprintf(buf, buf_len, "%s no ip", name); + case INTEL_PT_CYC: + case INTEL_PT_VMCS: + case INTEL_PT_MTC: + case INTEL_PT_MNT: case INTEL_PT_CBR: - return snprintf(buf, buf_len, "%s 0x%llx", name, payload); case INTEL_PT_TSC: - if (packet->count) - return snprintf(buf, buf_len, - "%s 0x%llx CTC 0x%x FC 0x%x", - name, payload, packet->count & 0xffff, - (packet->count >> 16) & 0x1ff); - else - return snprintf(buf, buf_len, "%s 0x%llx", - name, payload); + return snprintf(buf, buf_len, "%s 0x%llx", name, payload); + case INTEL_PT_TMA: + return snprintf(buf, buf_len, "%s CTC 0x%x FC 0x%x", name, + (unsigned)payload, packet->count); case INTEL_PT_MODE_EXEC: return snprintf(buf, buf_len, "%s %lld", name, payload); case INTEL_PT_MODE_TSX: @@ -389,8 +505,10 @@ int intel_pt_pkt_desc(const struct intel_pt_pkt *packet, char *buf, name, (unsigned)(payload >> 1) & 1, (unsigned)payload & 1); case INTEL_PT_PIP: - ret = snprintf(buf, buf_len, "%s 0x%llx", - name, payload); + nr = packet->payload & NR_FLAG ? 1 : 0; + payload &= ~NR_FLAG; + ret = snprintf(buf, buf_len, "%s 0x%llx (NR=%d)", + name, payload, nr); return ret; default: break; diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h index 53404fa..781bb79 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h @@ -37,15 +37,21 @@ enum intel_pt_pkt_type { INTEL_PT_TIP_PGD, INTEL_PT_TIP_PGE, INTEL_PT_TSC, + INTEL_PT_TMA, INTEL_PT_MODE_EXEC, INTEL_PT_MODE_TSX, + INTEL_PT_MTC, INTEL_PT_TIP, INTEL_PT_FUP, + INTEL_PT_CYC, + INTEL_PT_VMCS, INTEL_PT_PSB, INTEL_PT_PSBEND, INTEL_PT_CBR, + INTEL_PT_TRACESTOP, INTEL_PT_PIP, INTEL_PT_OVF, + INTEL_PT_MNT, }; struct intel_pt_pkt { -- cgit v0.10.2 From 11fa7cb86b56d3610043ba2ac6cbd81feab4b7c4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:54 +0300 Subject: perf tools: Pass Intel PT information for decoding MTC and CYC Record additional information in the AUXTRACE_INFO event in preparation for decoding MTC and CYC packets. Pass the information to the decoder. The AUXTRACE_INFO record can be extended by using the size to indicate the presence of new members. The additional information includes PMU config bit positions and the TSC to CTC (hardware crystal clock) ratio needed to decode MTC packets. 
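Growing a record without a version field relies on its size: a reader only trusts members that the producer's recorded size actually covers, which is what the intel_pt_process_auxtrace_info() hunk below checks before reading the MTC/CYC entries. A simplified standalone rendering of that idea (the indices 10 and 14 follow the enum order in this series; the exact size arithmetic of the real check differs slightly):

    #include <stdint.h>
    #include <stdio.h>

    struct info_event {
            uint64_t size;  /* total bytes, including the priv[] array */
            uint64_t priv[];
    };

    enum { PRIV_MTC_BIT = 10, PRIV_CYC_BIT = 14 };

    /* A field is present only if the producer's size covers it. */
    static int has_priv_field(const struct info_event *ev, int idx)
    {
            return ev->size >= sizeof(*ev) + sizeof(uint64_t) * (idx + 1);
    }

    int main(void)
    {
            /* An "old" record carrying only the original 10 priv entries. */
            struct info_event old = {
                    .size = sizeof(struct info_event) + sizeof(uint64_t) * 10,
            };

            printf("old file has the CYC bit field: %d\n",
                   has_priv_field(&old, PRIV_CYC_BIT)); /* prints 0 */
            return 0;
    }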
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-20-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 145975b..faae928 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "../../perf.h" #include "../../util/session.h" @@ -261,6 +262,15 @@ static size_t intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused return INTEL_PT_AUXTRACE_PRIV_SIZE; } +static void intel_pt_tsc_ctc_ratio(u32 *n, u32 *d) +{ + unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0; + + __get_cpuid(0x15, &eax, &ebx, &ecx, &edx); + *n = ebx; + *d = eax; +} + static int intel_pt_info_fill(struct auxtrace_record *itr, struct perf_session *session, struct auxtrace_info_event *auxtrace_info, @@ -272,7 +282,8 @@ static int intel_pt_info_fill(struct auxtrace_record *itr, struct perf_event_mmap_page *pc; struct perf_tsc_conversion tc = { .time_mult = 0, }; bool cap_user_time_zero = false, per_cpu_mmaps; - u64 tsc_bit, noretcomp_bit; + u64 tsc_bit, mtc_bit, mtc_freq_bits, cyc_bit, noretcomp_bit; + u32 tsc_ctc_ratio_n, tsc_ctc_ratio_d; int err; if (priv_size != INTEL_PT_AUXTRACE_PRIV_SIZE) @@ -281,6 +292,12 @@ static int intel_pt_info_fill(struct auxtrace_record *itr, intel_pt_parse_terms(&intel_pt_pmu->format, "tsc", &tsc_bit); intel_pt_parse_terms(&intel_pt_pmu->format, "noretcomp", &noretcomp_bit); + intel_pt_parse_terms(&intel_pt_pmu->format, "mtc", &mtc_bit); + mtc_freq_bits = perf_pmu__format_bits(&intel_pt_pmu->format, + "mtc_period"); + intel_pt_parse_terms(&intel_pt_pmu->format, "cyc", &cyc_bit); + + intel_pt_tsc_ctc_ratio(&tsc_ctc_ratio_n, &tsc_ctc_ratio_d); if (!session->evlist->nr_mmaps) return -EINVAL; @@ -311,6 +328,11 @@ static int intel_pt_info_fill(struct auxtrace_record *itr, auxtrace_info->priv[INTEL_PT_HAVE_SCHED_SWITCH] = ptr->have_sched_switch; auxtrace_info->priv[INTEL_PT_SNAPSHOT_MODE] = ptr->snapshot_mode; auxtrace_info->priv[INTEL_PT_PER_CPU_MMAPS] = per_cpu_mmaps; + auxtrace_info->priv[INTEL_PT_MTC_BIT] = mtc_bit; + auxtrace_info->priv[INTEL_PT_MTC_FREQ_BITS] = mtc_freq_bits; + auxtrace_info->priv[INTEL_PT_TSC_CTC_N] = tsc_ctc_ratio_n; + auxtrace_info->priv[INTEL_PT_TSC_CTC_D] = tsc_ctc_ratio_d; + auxtrace_info->priv[INTEL_PT_CYC_BIT] = cyc_bit; return 0; } diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h index cbf5704..56cc47b 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h @@ -87,6 +87,9 @@ struct intel_pt_params { uint64_t period; enum intel_pt_period_type period_type; unsigned max_non_turbo_ratio; + unsigned int mtc_period; + uint32_t tsc_ctc_ratio_n; + uint32_t tsc_ctc_ratio_d; }; struct intel_pt_decoder; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 3b34a64..bb41c20 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -91,6 +91,11 @@ struct intel_pt { bool synth_needs_swap; u64 tsc_bit; + u64 mtc_bit; + u64 mtc_freq_bits; + u32 tsc_ctc_ratio_n; + u32 tsc_ctc_ratio_d; + u64 cyc_bit; u64 noretcomp_bit; unsigned max_non_turbo_ratio; }; @@ -568,6 +573,25 @@ static bool intel_pt_return_compression(struct intel_pt *pt) return true; } +static unsigned int intel_pt_mtc_period(struct intel_pt *pt) +{ + struct perf_evsel *evsel; + unsigned int 
shift; + u64 config; + + if (!pt->mtc_freq_bits) + return 0; + + for (shift = 0, config = pt->mtc_freq_bits; !(config & 1); shift++) + config >>= 1; + + evlist__for_each(pt->session->evlist, evsel) { + if (intel_pt_get_config(pt, &evsel->attr, &config)) + return (config & pt->mtc_freq_bits) >> shift; + } + return 0; +} + static bool intel_pt_timeless_decoding(struct intel_pt *pt) { struct perf_evsel *evsel; @@ -668,6 +692,9 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt, params.data = ptq; params.return_compression = intel_pt_return_compression(pt); params.max_non_turbo_ratio = pt->max_non_turbo_ratio; + params.mtc_period = intel_pt_mtc_period(pt); + params.tsc_ctc_ratio_n = pt->tsc_ctc_ratio_n; + params.tsc_ctc_ratio_d = pt->tsc_ctc_ratio_d; if (pt->synth_opts.instructions) { if (pt->synth_opts.period) { @@ -1751,16 +1778,20 @@ static struct perf_evsel *intel_pt_find_sched_switch(struct perf_evlist *evlist) } static const char * const intel_pt_info_fmts[] = { - [INTEL_PT_PMU_TYPE] = " PMU Type %"PRId64"\n", - [INTEL_PT_TIME_SHIFT] = " Time Shift %"PRIu64"\n", - [INTEL_PT_TIME_MULT] = " Time Muliplier %"PRIu64"\n", - [INTEL_PT_TIME_ZERO] = " Time Zero %"PRIu64"\n", - [INTEL_PT_CAP_USER_TIME_ZERO] = " Cap Time Zero %"PRId64"\n", - [INTEL_PT_TSC_BIT] = " TSC bit %#"PRIx64"\n", - [INTEL_PT_NORETCOMP_BIT] = " NoRETComp bit %#"PRIx64"\n", - [INTEL_PT_HAVE_SCHED_SWITCH] = " Have sched_switch %"PRId64"\n", - [INTEL_PT_SNAPSHOT_MODE] = " Snapshot mode %"PRId64"\n", - [INTEL_PT_PER_CPU_MMAPS] = " Per-cpu maps %"PRId64"\n", + [INTEL_PT_PMU_TYPE] = " PMU Type %"PRId64"\n", + [INTEL_PT_TIME_SHIFT] = " Time Shift %"PRIu64"\n", + [INTEL_PT_TIME_MULT] = " Time Muliplier %"PRIu64"\n", + [INTEL_PT_TIME_ZERO] = " Time Zero %"PRIu64"\n", + [INTEL_PT_CAP_USER_TIME_ZERO] = " Cap Time Zero %"PRId64"\n", + [INTEL_PT_TSC_BIT] = " TSC bit %#"PRIx64"\n", + [INTEL_PT_NORETCOMP_BIT] = " NoRETComp bit %#"PRIx64"\n", + [INTEL_PT_HAVE_SCHED_SWITCH] = " Have sched_switch %"PRId64"\n", + [INTEL_PT_SNAPSHOT_MODE] = " Snapshot mode %"PRId64"\n", + [INTEL_PT_PER_CPU_MMAPS] = " Per-cpu maps %"PRId64"\n", + [INTEL_PT_MTC_BIT] = " MTC bit %#"PRIx64"\n", + [INTEL_PT_TSC_CTC_N] = " TSC:CTC numerator %"PRIu64"\n", + [INTEL_PT_TSC_CTC_D] = " TSC:CTC denominator %"PRIu64"\n", + [INTEL_PT_CYC_BIT] = " CYC bit %#"PRIx64"\n", }; static void intel_pt_print_info(u64 *arr, int start, int finish) @@ -1812,6 +1843,17 @@ int intel_pt_process_auxtrace_info(union perf_event *event, intel_pt_print_info(&auxtrace_info->priv[0], INTEL_PT_PMU_TYPE, INTEL_PT_PER_CPU_MMAPS); + if (auxtrace_info->header.size >= sizeof(struct auxtrace_info_event) + + (sizeof(u64) * INTEL_PT_CYC_BIT)) { + pt->mtc_bit = auxtrace_info->priv[INTEL_PT_MTC_BIT]; + pt->mtc_freq_bits = auxtrace_info->priv[INTEL_PT_MTC_FREQ_BITS]; + pt->tsc_ctc_ratio_n = auxtrace_info->priv[INTEL_PT_TSC_CTC_N]; + pt->tsc_ctc_ratio_d = auxtrace_info->priv[INTEL_PT_TSC_CTC_D]; + pt->cyc_bit = auxtrace_info->priv[INTEL_PT_CYC_BIT]; + intel_pt_print_info(&auxtrace_info->priv[0], INTEL_PT_MTC_BIT, + INTEL_PT_CYC_BIT); + } + pt->timeless_decoding = intel_pt_timeless_decoding(pt); pt->have_tsc = intel_pt_have_tsc(pt); pt->sampling_mode = false; diff --git a/tools/perf/util/intel-pt.h b/tools/perf/util/intel-pt.h index a1bfe93..0065949 100644 --- a/tools/perf/util/intel-pt.h +++ b/tools/perf/util/intel-pt.h @@ -29,6 +29,11 @@ enum { INTEL_PT_HAVE_SCHED_SWITCH, INTEL_PT_SNAPSHOT_MODE, INTEL_PT_PER_CPU_MMAPS, + INTEL_PT_MTC_BIT, + INTEL_PT_MTC_FREQ_BITS, + INTEL_PT_TSC_CTC_N, 
+ INTEL_PT_TSC_CTC_D, + INTEL_PT_CYC_BIT, INTEL_PT_AUXTRACE_PRIV_MAX, }; -- cgit v0.10.2 From 79b58424b821c651a4b4df9018a14684e3670f42 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:55 +0300 Subject: perf tools: Add Intel PT support for decoding MTC packets MTC packets provide finer grain timestamp information than TSC packets. MTC packets record time using the hardware crystal clock (CTC) which is related to TSC packets using a TMA packet. This patch just adds decoder support. Support for a default value and validation of values is provided by a later patch. Also documentation is updated in a separate patch. For details refer to the June 2015 or later Intel 64 and IA-32 Architectures SDM Chapter 36 Intel Processor Trace. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-21-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 4a0e9fb..f7119a1 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -85,7 +85,9 @@ struct intel_pt_decoder { const unsigned char *buf; size_t len; bool return_compression; + bool mtc_insn; bool pge; + bool have_tma; uint64_t pos; uint64_t last_ip; uint64_t ip; @@ -94,6 +96,15 @@ struct intel_pt_decoder { uint64_t tsc_timestamp; uint64_t ref_timestamp; uint64_t ret_addr; + uint64_t ctc_timestamp; + uint64_t ctc_delta; + uint32_t last_mtc; + uint32_t tsc_ctc_ratio_n; + uint32_t tsc_ctc_ratio_d; + uint32_t tsc_ctc_mult; + uint32_t tsc_slip; + uint32_t ctc_rem_mask; + int mtc_shift; struct intel_pt_stack stack; enum intel_pt_pkt_state pkt_state; struct intel_pt_pkt packet; @@ -149,6 +160,13 @@ static void intel_pt_setup_period(struct intel_pt_decoder *decoder) } } +static uint64_t multdiv(uint64_t t, uint32_t n, uint32_t d) +{ + if (!d) + return 0; + return (t / d) * n + ((t % d) * n) / d; +} + struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params) { struct intel_pt_decoder *decoder; @@ -175,6 +193,39 @@ struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params) intel_pt_setup_period(decoder); + decoder->mtc_shift = params->mtc_period; + decoder->ctc_rem_mask = (1 << decoder->mtc_shift) - 1; + + decoder->tsc_ctc_ratio_n = params->tsc_ctc_ratio_n; + decoder->tsc_ctc_ratio_d = params->tsc_ctc_ratio_d; + + if (!decoder->tsc_ctc_ratio_n) + decoder->tsc_ctc_ratio_d = 0; + + if (decoder->tsc_ctc_ratio_d) { + if (!(decoder->tsc_ctc_ratio_n % decoder->tsc_ctc_ratio_d)) + decoder->tsc_ctc_mult = decoder->tsc_ctc_ratio_n / + decoder->tsc_ctc_ratio_d; + + /* + * Allow for timestamps appearing to backwards because a TSC + * packet has slipped past a MTC packet, so allow 2 MTC ticks + * or ... + */ + decoder->tsc_slip = multdiv(2 << decoder->mtc_shift, + decoder->tsc_ctc_ratio_n, + decoder->tsc_ctc_ratio_d); + } + /* ... 
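Two small pieces carry most of the arithmetic in the hunks below: an overflow-safe (t * n) / d helper for applying the TSC:CTC ratio, and modulo-256 deltas because the MTC payload is only 8 bits of the CTC. Standalone versions of both (multdiv() is taken directly from the patch; the rest is condensed):

    #include <stdint.h>
    #include <stdio.h>

    /* (t * n) / d without overflowing 64 bits for large t. */
    static uint64_t multdiv(uint64_t t, uint32_t n, uint32_t d)
    {
            if (!d)
                    return 0;
            return (t / d) * n + ((t % d) * n) / d;
    }

    /* Consecutive 8-bit MTC values compared with wraparound. */
    static uint32_t mtc_delta(uint32_t last_mtc, uint32_t mtc)
    {
            return mtc > last_mtc ? mtc - last_mtc : mtc + 256 - last_mtc;
    }

    int main(void)
    {
            /* Scale 1000 CTC ticks by an illustrative 10:4 TSC:CTC ratio. */
            printf("%llu\n", (unsigned long long)multdiv(1000, 10, 4)); /* 2500 */
            printf("%u\n", mtc_delta(250, 3)); /* wrapped: 9 */
            return 0;
    }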
or 0x100 paranoia */ + if (decoder->tsc_slip < 0x100) + decoder->tsc_slip = 0x100; + + intel_pt_log("timestamp: mtc_shift %u\n", decoder->mtc_shift); + intel_pt_log("timestamp: tsc_ctc_ratio_n %u\n", decoder->tsc_ctc_ratio_n); + intel_pt_log("timestamp: tsc_ctc_ratio_d %u\n", decoder->tsc_ctc_ratio_d); + intel_pt_log("timestamp: tsc_ctc_mult %u\n", decoder->tsc_ctc_mult); + intel_pt_log("timestamp: tsc_slip %#x\n", decoder->tsc_slip); + return decoder; } @@ -368,6 +419,7 @@ static inline void intel_pt_update_in_tx(struct intel_pt_decoder *decoder) static int intel_pt_bad_packet(struct intel_pt_decoder *decoder) { intel_pt_clear_tx_flags(decoder); + decoder->have_tma = false; decoder->pkt_len = 1; decoder->pkt_step = 1; intel_pt_decoder_log_packet(decoder); @@ -400,6 +452,7 @@ static int intel_pt_get_data(struct intel_pt_decoder *decoder) decoder->pkt_state = INTEL_PT_STATE_NO_PSB; decoder->ref_timestamp = buffer.ref_timestamp; decoder->timestamp = 0; + decoder->have_tma = false; decoder->state.trace_nr = buffer.trace_nr; intel_pt_log("Reference timestamp 0x%" PRIx64 "\n", decoder->ref_timestamp); @@ -523,6 +576,7 @@ static uint64_t intel_pt_next_sample(struct intel_pt_decoder *decoder) case INTEL_PT_PERIOD_TICKS: return intel_pt_next_period(decoder); case INTEL_PT_PERIOD_NONE: + case INTEL_PT_PERIOD_MTC: default: return 0; } @@ -542,6 +596,7 @@ static void intel_pt_sample_insn(struct intel_pt_decoder *decoder) decoder->last_masked_timestamp = masked_timestamp; break; case INTEL_PT_PERIOD_NONE: + case INTEL_PT_PERIOD_MTC: default: break; } @@ -555,6 +610,9 @@ static int intel_pt_walk_insn(struct intel_pt_decoder *decoder, uint64_t max_insn_cnt, insn_cnt = 0; int err; + if (!decoder->mtc_insn) + decoder->mtc_insn = true; + max_insn_cnt = intel_pt_next_sample(decoder); err = decoder->walk_insn(intel_pt_insn, &insn_cnt, &decoder->ip, ip, @@ -861,6 +919,8 @@ static void intel_pt_calc_tsc_timestamp(struct intel_pt_decoder *decoder) { uint64_t timestamp; + decoder->have_tma = false; + if (decoder->ref_timestamp) { timestamp = decoder->packet.payload | (decoder->ref_timestamp & (0xffULL << 56)); @@ -878,17 +938,18 @@ static void intel_pt_calc_tsc_timestamp(struct intel_pt_decoder *decoder) } else if (decoder->timestamp) { timestamp = decoder->packet.payload | (decoder->timestamp & (0xffULL << 56)); + decoder->tsc_timestamp = timestamp; if (timestamp < decoder->timestamp && - decoder->timestamp - timestamp < 0x100) { - intel_pt_log_to("ERROR: Suppressing backwards timestamp", + decoder->timestamp - timestamp < decoder->tsc_slip) { + intel_pt_log_to("Suppressing backwards timestamp", timestamp); timestamp = decoder->timestamp; } while (timestamp < decoder->timestamp) { intel_pt_log_to("Wraparound timestamp", timestamp); timestamp += (1ULL << 56); + decoder->tsc_timestamp = timestamp; } - decoder->tsc_timestamp = timestamp; decoder->timestamp = timestamp; decoder->timestamp_insn_cnt = 0; } @@ -900,11 +961,73 @@ static int intel_pt_overflow(struct intel_pt_decoder *decoder) { intel_pt_log("ERROR: Buffer overflow\n"); intel_pt_clear_tx_flags(decoder); + decoder->have_tma = false; decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC; decoder->overflow = true; return -EOVERFLOW; } +static void intel_pt_calc_tma(struct intel_pt_decoder *decoder) +{ + uint32_t ctc = decoder->packet.payload; + uint32_t fc = decoder->packet.count; + uint32_t ctc_rem = ctc & decoder->ctc_rem_mask; + + if (!decoder->tsc_ctc_ratio_d) + return; + + decoder->last_mtc = (ctc >> decoder->mtc_shift) & 0xff; + decoder->ctc_timestamp = 
decoder->tsc_timestamp - fc; + if (decoder->tsc_ctc_mult) { + decoder->ctc_timestamp -= ctc_rem * decoder->tsc_ctc_mult; + } else { + decoder->ctc_timestamp -= multdiv(ctc_rem, + decoder->tsc_ctc_ratio_n, + decoder->tsc_ctc_ratio_d); + } + decoder->ctc_delta = 0; + decoder->have_tma = true; + intel_pt_log("CTC timestamp " x64_fmt " last MTC %#x CTC rem %#x\n", + decoder->ctc_timestamp, decoder->last_mtc, ctc_rem); +} + +static void intel_pt_calc_mtc_timestamp(struct intel_pt_decoder *decoder) +{ + uint64_t timestamp; + uint32_t mtc, mtc_delta; + + if (!decoder->have_tma) + return; + + mtc = decoder->packet.payload; + + if (mtc > decoder->last_mtc) + mtc_delta = mtc - decoder->last_mtc; + else + mtc_delta = mtc + 256 - decoder->last_mtc; + + decoder->ctc_delta += mtc_delta << decoder->mtc_shift; + + if (decoder->tsc_ctc_mult) { + timestamp = decoder->ctc_timestamp + + decoder->ctc_delta * decoder->tsc_ctc_mult; + } else { + timestamp = decoder->ctc_timestamp + + multdiv(decoder->ctc_delta, + decoder->tsc_ctc_ratio_n, + decoder->tsc_ctc_ratio_d); + } + + if (timestamp < decoder->timestamp) + intel_pt_log("Suppressing MTC timestamp " x64_fmt " less than current timestamp " x64_fmt "\n", + timestamp, decoder->timestamp); + else + decoder->timestamp = timestamp; + + decoder->timestamp_insn_cnt = 0; + decoder->last_mtc = mtc; +} + /* Walk PSB+ packets when already in sync. */ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) { @@ -926,6 +1049,7 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) case INTEL_PT_TRACESTOP: case INTEL_PT_BAD: case INTEL_PT_PSB: + decoder->have_tma = false; intel_pt_log("ERROR: Unexpected packet\n"); return -EAGAIN; @@ -937,6 +1061,7 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) break; case INTEL_PT_TMA: + intel_pt_calc_tma(decoder); break; case INTEL_PT_CBR: @@ -961,6 +1086,9 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) break; case INTEL_PT_MTC: + intel_pt_calc_mtc_timestamp(decoder); + if (decoder->period_type == INTEL_PT_PERIOD_MTC) + decoder->state.type |= INTEL_PT_INSTRUCTION; break; case INTEL_PT_CYC: @@ -1048,6 +1176,9 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) break; case INTEL_PT_MTC: + intel_pt_calc_mtc_timestamp(decoder); + if (decoder->period_type == INTEL_PT_PERIOD_MTC) + decoder->state.type |= INTEL_PT_INSTRUCTION; break; case INTEL_PT_CYC: @@ -1159,13 +1290,31 @@ next: break; case INTEL_PT_MTC: - break; + intel_pt_calc_mtc_timestamp(decoder); + if (decoder->period_type != INTEL_PT_PERIOD_MTC) + break; + /* + * Ensure that there has been an instruction since the + * last MTC. 
+ */ + if (!decoder->mtc_insn) + break; + decoder->mtc_insn = false; + /* Ensure that there is a timestamp */ + if (!decoder->timestamp) + break; + decoder->state.type = INTEL_PT_INSTRUCTION; + decoder->state.from_ip = decoder->ip; + decoder->state.to_ip = 0; + decoder->mtc_insn = false; + return 0; case INTEL_PT_TSC: intel_pt_calc_tsc_timestamp(decoder); break; case INTEL_PT_TMA: + intel_pt_calc_tma(decoder); break; case INTEL_PT_CYC: @@ -1237,6 +1386,7 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) break; case INTEL_PT_MTC: + intel_pt_calc_mtc_timestamp(decoder); break; case INTEL_PT_TSC: @@ -1244,6 +1394,7 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) break; case INTEL_PT_TMA: + intel_pt_calc_tma(decoder); break; case INTEL_PT_CYC: @@ -1267,6 +1418,7 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) case INTEL_PT_TRACESTOP: case INTEL_PT_TNT: + decoder->have_tma = false; intel_pt_log("ERROR: Unexpected packet\n"); if (decoder->ip) decoder->pkt_state = INTEL_PT_STATE_ERR4; @@ -1329,6 +1481,7 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) break; case INTEL_PT_MTC: + intel_pt_calc_mtc_timestamp(decoder); break; case INTEL_PT_TSC: @@ -1336,6 +1489,7 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) break; case INTEL_PT_TMA: + intel_pt_calc_tma(decoder); break; case INTEL_PT_CYC: diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h index 56cc47b..02c38fe 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h @@ -36,6 +36,7 @@ enum intel_pt_period_type { INTEL_PT_PERIOD_NONE, INTEL_PT_PERIOD_INSTRUCTIONS, INTEL_PT_PERIOD_TICKS, + INTEL_PT_PERIOD_MTC, }; enum { -- cgit v0.10.2 From b45fc0bfaf4a0b60ce2deda222f8ef2a23b89a5f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:56 +0300 Subject: perf tools: Add Intel PT support for using MTC packets MTC packets are a new Intel PT feature. MTC packets provide finer grain timestamp information than TSC packets. Support for this feature is indicated by: /sys/bus/event_source/devices/intel_pt/caps/mtc which contains "1" if the feature is supported and "0" otherwise. MTC packets can be requested using a PMU config term e.g. perf record -e intel_pt/mtc/u sleep 1 The frequency of MTC packets can also be specified. e.g. perf record -e intel_pt/mtc,mtc_period=2/u sleep 1 The default value is 3 or the nearest lower value that is supported. 0 is always supported. Valid values are given by: /sys/bus/event_source/devices/intel_pt/caps/mtc_periods which contains a hexadecimal value, the bits of which represent valid values e.g. bit 2 set means value 2 is valid. The value is converted to the MTC frequency as: CTC-frequency / (2 ^ value) e.g. value 3 means one eighth of CTC-frequency Where CTC is the hardware crystal clock, the frequency of which can be related to TSC via values provided in cpuid leaf 0x15. If an invalid value is entered, the error message will give a list of valid values e.g. $ perf record -e intel_pt/mtc_period=15/u uname Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 tools/perf/Documentation/intel-pt.txt is updated in a later patch as there are a number of new features being added. For more information refer to the June 2015 or later Intel 64 and IA-32 Architectures SDM Chapter 36 Intel Processor Trace. 
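As a quick illustration of the conversion above, a minimal sketch in C (not part of the patch; "ctc_freq_hz" is an assumed input, e.g. derived from the values in cpuid leaf 0x15):

  /* Hypothetical helper: MTC packet frequency for a given mtc_period. */
  static double mtc_freq_hz(double ctc_freq_hz, unsigned int mtc_period)
  {
          /* CTC-frequency / (2 ^ value): mtc_period=3 gives one eighth */
          return ctc_freq_hz / (double)(1u << mtc_period);
  }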
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-22-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index faae928..a5de01d 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -190,17 +190,33 @@ static int intel_pt_pick_bit(int bits, int target) static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu) { char buf[256]; + int mtc, mtc_periods = 0, mtc_period; int psb_cyc, psb_periods, psb_period; int pos = 0; u64 config; pos += scnprintf(buf + pos, sizeof(buf) - pos, "tsc"); + if (perf_pmu__scan_file(intel_pt_pmu, "caps/mtc", "%d", + &mtc) != 1) + mtc = 1; + + if (mtc) { + if (perf_pmu__scan_file(intel_pt_pmu, "caps/mtc_periods", "%x", + &mtc_periods) != 1) + mtc_periods = 0; + if (mtc_periods) { + mtc_period = intel_pt_pick_bit(mtc_periods, 3); + pos += scnprintf(buf + pos, sizeof(buf) - pos, + ",mtc,mtc_period=%d", mtc_period); + } + } + if (perf_pmu__scan_file(intel_pt_pmu, "caps/psb_cyc", "%d", &psb_cyc) != 1) psb_cyc = 1; - if (psb_cyc) { + if (psb_cyc && mtc_periods) { if (perf_pmu__scan_file(intel_pt_pmu, "caps/psb_periods", "%x", &psb_periods) != 1) psb_periods = 0; @@ -454,9 +470,17 @@ out_err: static int intel_pt_validate_config(struct perf_pmu *intel_pt_pmu, struct perf_evsel *evsel) { + int err; + if (!evsel) return 0; + err = intel_pt_val_config_term(intel_pt_pmu, "caps/mtc_periods", + "mtc_period", "caps/mtc", + evsel->attr.config); + if (err) + return err; + return intel_pt_val_config_term(intel_pt_pmu, "caps/psb_periods", "psb_period", "caps/psb_cyc", evsel->attr.config); -- cgit v0.10.2 From cc33618619cefc6d730cca3bb8e15311016a4da7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:57 +0300 Subject: perf tools: Add Intel PT support for decoding CYC packets CYC packets provide even finer grain timestamp information than MTC and TSC packets. A CYC packet contains the number of CPU cycles since the last CYC packet. This patch just adds decoder support. The CPU frequency can be related to TSC using the Maximum Non-Turbo Ratio in combination with the CBR (core-to-bus ratio) packet. However more accuracy is achieved by simply interpolating the number of cycles between other timing packets like MTC or TSC. This patch takes the latter approach. Support for a default value and validation of values is provided by a later patch. Also documentation is updated in a separate patch. For details refer to the June 2015 or later Intel 64 and IA-32 Architectures SDM Chapter 36 Intel Processor Trace. 
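To make the interpolation concrete, here is a minimal sketch (illustrative only; it mirrors, rather than reproduces, the decoder changes below, and "tsc_delta"/"cycle_cnt" are assumed to be the TSC difference and the CYC cycle count accumulated between two timing packets):

  #include <stdint.h>

  /* Sketch: TSC ticks per CPU cycle. Prefer interpolating between
   * timing packets; fall back to the CBR-based estimate otherwise. */
  static double cyc_to_tsc_ratio(uint64_t tsc_delta, uint64_t cycle_cnt,
                                 double max_non_turbo_ratio, unsigned int cbr)
  {
          if (cycle_cnt)
                  return (double)tsc_delta / (double)cycle_cnt;
          return max_non_turbo_ratio / (double)cbr; /* cbr assumed non-zero */
  }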
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-23-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index f7119a1..0845c5e 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -88,6 +88,7 @@ struct intel_pt_decoder { bool mtc_insn; bool pge; bool have_tma; + bool have_cyc; uint64_t pos; uint64_t last_ip; uint64_t ip; @@ -98,6 +99,8 @@ struct intel_pt_decoder { uint64_t ret_addr; uint64_t ctc_timestamp; uint64_t ctc_delta; + uint64_t cycle_cnt; + uint64_t cyc_ref_timestamp; uint32_t last_mtc; uint32_t tsc_ctc_ratio_n; uint32_t tsc_ctc_ratio_d; @@ -111,8 +114,13 @@ struct intel_pt_decoder { struct intel_pt_pkt tnt; int pkt_step; int pkt_len; + int last_packet_type; unsigned int cbr; unsigned int max_non_turbo_ratio; + double max_non_turbo_ratio_fp; + double cbr_cyc_to_tsc; + double calc_cyc_to_tsc; + bool have_calc_cyc_to_tsc; int exec_mode; unsigned int insn_bytes; uint64_t sign_bit; @@ -189,7 +197,8 @@ struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params) decoder->period = params->period; decoder->period_type = params->period_type; - decoder->max_non_turbo_ratio = params->max_non_turbo_ratio; + decoder->max_non_turbo_ratio = params->max_non_turbo_ratio; + decoder->max_non_turbo_ratio_fp = params->max_non_turbo_ratio; intel_pt_setup_period(decoder); @@ -514,10 +523,247 @@ static int intel_pt_get_split_packet(struct intel_pt_decoder *decoder) return ret; } +struct intel_pt_pkt_info { + struct intel_pt_decoder *decoder; + struct intel_pt_pkt packet; + uint64_t pos; + int pkt_len; + int last_packet_type; + void *data; +}; + +typedef int (*intel_pt_pkt_cb_t)(struct intel_pt_pkt_info *pkt_info); + +/* Lookahead packets in current buffer */ +static int intel_pt_pkt_lookahead(struct intel_pt_decoder *decoder, + intel_pt_pkt_cb_t cb, void *data) +{ + struct intel_pt_pkt_info pkt_info; + const unsigned char *buf = decoder->buf; + size_t len = decoder->len; + int ret; + + pkt_info.decoder = decoder; + pkt_info.pos = decoder->pos; + pkt_info.pkt_len = decoder->pkt_step; + pkt_info.last_packet_type = decoder->last_packet_type; + pkt_info.data = data; + + while (1) { + do { + pkt_info.pos += pkt_info.pkt_len; + buf += pkt_info.pkt_len; + len -= pkt_info.pkt_len; + + if (!len) + return INTEL_PT_NEED_MORE_BYTES; + + ret = intel_pt_get_packet(buf, len, &pkt_info.packet); + if (!ret) + return INTEL_PT_NEED_MORE_BYTES; + if (ret < 0) + return ret; + + pkt_info.pkt_len = ret; + } while (pkt_info.packet.type == INTEL_PT_PAD); + + ret = cb(&pkt_info); + if (ret) + return 0; + + pkt_info.last_packet_type = pkt_info.packet.type; + } +} + +struct intel_pt_calc_cyc_to_tsc_info { + uint64_t cycle_cnt; + unsigned int cbr; + uint32_t last_mtc; + uint64_t ctc_timestamp; + uint64_t ctc_delta; + uint64_t tsc_timestamp; + uint64_t timestamp; + bool have_tma; + bool from_mtc; + double cbr_cyc_to_tsc; +}; + +static int intel_pt_calc_cyc_cb(struct intel_pt_pkt_info *pkt_info) +{ + struct intel_pt_decoder *decoder = pkt_info->decoder; + struct intel_pt_calc_cyc_to_tsc_info *data = pkt_info->data; + uint64_t timestamp; + double cyc_to_tsc; + unsigned int cbr; + uint32_t mtc, mtc_delta, ctc, fc, ctc_rem; + + switch (pkt_info->packet.type) { + case INTEL_PT_TNT: + case INTEL_PT_TIP_PGE: + case INTEL_PT_TIP: + case INTEL_PT_FUP: + case 
INTEL_PT_PSB: + case INTEL_PT_PIP: + case INTEL_PT_MODE_EXEC: + case INTEL_PT_MODE_TSX: + case INTEL_PT_PSBEND: + case INTEL_PT_PAD: + case INTEL_PT_VMCS: + case INTEL_PT_MNT: + return 0; + + case INTEL_PT_MTC: + if (!data->have_tma) + return 0; + + mtc = pkt_info->packet.payload; + if (mtc > data->last_mtc) + mtc_delta = mtc - data->last_mtc; + else + mtc_delta = mtc + 256 - data->last_mtc; + data->ctc_delta += mtc_delta << decoder->mtc_shift; + data->last_mtc = mtc; + + if (decoder->tsc_ctc_mult) { + timestamp = data->ctc_timestamp + + data->ctc_delta * decoder->tsc_ctc_mult; + } else { + timestamp = data->ctc_timestamp + + multdiv(data->ctc_delta, + decoder->tsc_ctc_ratio_n, + decoder->tsc_ctc_ratio_d); + } + + if (timestamp < data->timestamp) + return 1; + + if (pkt_info->last_packet_type != INTEL_PT_CYC) { + data->timestamp = timestamp; + return 0; + } + + break; + + case INTEL_PT_TSC: + timestamp = pkt_info->packet.payload | + (data->timestamp & (0xffULL << 56)); + if (data->from_mtc && timestamp < data->timestamp && + data->timestamp - timestamp < decoder->tsc_slip) + return 1; + while (timestamp < data->timestamp) + timestamp += (1ULL << 56); + if (pkt_info->last_packet_type != INTEL_PT_CYC) { + if (data->from_mtc) + return 1; + data->tsc_timestamp = timestamp; + data->timestamp = timestamp; + return 0; + } + break; + + case INTEL_PT_TMA: + if (data->from_mtc) + return 1; + + if (!decoder->tsc_ctc_ratio_d) + return 0; + + ctc = pkt_info->packet.payload; + fc = pkt_info->packet.count; + ctc_rem = ctc & decoder->ctc_rem_mask; + + data->last_mtc = (ctc >> decoder->mtc_shift) & 0xff; + + data->ctc_timestamp = data->tsc_timestamp - fc; + if (decoder->tsc_ctc_mult) { + data->ctc_timestamp -= ctc_rem * decoder->tsc_ctc_mult; + } else { + data->ctc_timestamp -= + multdiv(ctc_rem, decoder->tsc_ctc_ratio_n, + decoder->tsc_ctc_ratio_d); + } + + data->ctc_delta = 0; + data->have_tma = true; + + return 0; + + case INTEL_PT_CYC: + data->cycle_cnt += pkt_info->packet.payload; + return 0; + + case INTEL_PT_CBR: + cbr = pkt_info->packet.payload; + if (data->cbr && data->cbr != cbr) + return 1; + data->cbr = cbr; + data->cbr_cyc_to_tsc = decoder->max_non_turbo_ratio_fp / cbr; + return 0; + + case INTEL_PT_TIP_PGD: + case INTEL_PT_TRACESTOP: + case INTEL_PT_OVF: + case INTEL_PT_BAD: /* Does not happen */ + default: + return 1; + } + + if (!data->cbr && decoder->cbr) { + data->cbr = decoder->cbr; + data->cbr_cyc_to_tsc = decoder->cbr_cyc_to_tsc; + } + + if (!data->cycle_cnt) + return 1; + + cyc_to_tsc = (double)(timestamp - decoder->timestamp) / data->cycle_cnt; + + if (data->cbr && cyc_to_tsc > data->cbr_cyc_to_tsc && + cyc_to_tsc / data->cbr_cyc_to_tsc > 1.25) { + intel_pt_log("Timestamp: calculated %g TSC ticks per cycle too big (c.f. CBR-based value %g), pos " x64_fmt "\n", + cyc_to_tsc, data->cbr_cyc_to_tsc, pkt_info->pos); + return 1; + } + + decoder->calc_cyc_to_tsc = cyc_to_tsc; + decoder->have_calc_cyc_to_tsc = true; + + if (data->cbr) { + intel_pt_log("Timestamp: calculated %g TSC ticks per cycle c.f. CBR-based value %g, pos " x64_fmt "\n", + cyc_to_tsc, data->cbr_cyc_to_tsc, pkt_info->pos); + } else { + intel_pt_log("Timestamp: calculated %g TSC ticks per cycle c.f. 
unknown CBR-based value, pos " x64_fmt "\n", + cyc_to_tsc, pkt_info->pos); + } + + return 1; +} + +static void intel_pt_calc_cyc_to_tsc(struct intel_pt_decoder *decoder, + bool from_mtc) +{ + struct intel_pt_calc_cyc_to_tsc_info data = { + .cycle_cnt = 0, + .cbr = 0, + .last_mtc = decoder->last_mtc, + .ctc_timestamp = decoder->ctc_timestamp, + .ctc_delta = decoder->ctc_delta, + .tsc_timestamp = decoder->tsc_timestamp, + .timestamp = decoder->timestamp, + .have_tma = decoder->have_tma, + .from_mtc = from_mtc, + .cbr_cyc_to_tsc = 0, + }; + + intel_pt_pkt_lookahead(decoder, intel_pt_calc_cyc_cb, &data); +} + static int intel_pt_get_next_packet(struct intel_pt_decoder *decoder) { int ret; + decoder->last_packet_type = decoder->packet.type; + do { decoder->pos += decoder->pkt_step; decoder->buf += decoder->pkt_step; @@ -954,6 +1200,13 @@ static void intel_pt_calc_tsc_timestamp(struct intel_pt_decoder *decoder) decoder->timestamp_insn_cnt = 0; } + if (decoder->last_packet_type == INTEL_PT_CYC) { + decoder->cyc_ref_timestamp = decoder->timestamp; + decoder->cycle_cnt = 0; + decoder->have_calc_cyc_to_tsc = false; + intel_pt_calc_cyc_to_tsc(decoder, false); + } + intel_pt_log_to("Setting timestamp", decoder->timestamp); } @@ -962,6 +1215,7 @@ static int intel_pt_overflow(struct intel_pt_decoder *decoder) intel_pt_log("ERROR: Buffer overflow\n"); intel_pt_clear_tx_flags(decoder); decoder->have_tma = false; + decoder->cbr = 0; decoder->pkt_state = INTEL_PT_STATE_ERR_RESYNC; decoder->overflow = true; return -EOVERFLOW; @@ -1026,6 +1280,49 @@ static void intel_pt_calc_mtc_timestamp(struct intel_pt_decoder *decoder) decoder->timestamp_insn_cnt = 0; decoder->last_mtc = mtc; + + if (decoder->last_packet_type == INTEL_PT_CYC) { + decoder->cyc_ref_timestamp = decoder->timestamp; + decoder->cycle_cnt = 0; + decoder->have_calc_cyc_to_tsc = false; + intel_pt_calc_cyc_to_tsc(decoder, true); + } +} + +static void intel_pt_calc_cbr(struct intel_pt_decoder *decoder) +{ + unsigned int cbr = decoder->packet.payload; + + if (decoder->cbr == cbr) + return; + + decoder->cbr = cbr; + decoder->cbr_cyc_to_tsc = decoder->max_non_turbo_ratio_fp / cbr; +} + +static void intel_pt_calc_cyc_timestamp(struct intel_pt_decoder *decoder) +{ + uint64_t timestamp = decoder->cyc_ref_timestamp; + + decoder->have_cyc = true; + + decoder->cycle_cnt += decoder->packet.payload; + + if (!decoder->cyc_ref_timestamp) + return; + + if (decoder->have_calc_cyc_to_tsc) + timestamp += decoder->cycle_cnt * decoder->calc_cyc_to_tsc; + else if (decoder->cbr) + timestamp += decoder->cycle_cnt * decoder->cbr_cyc_to_tsc; + else + return; + + if (timestamp < decoder->timestamp) + intel_pt_log("Suppressing CYC timestamp " x64_fmt " less than current timestamp " x64_fmt "\n", + timestamp, decoder->timestamp); + else + decoder->timestamp = timestamp; } /* Walk PSB+ packets when already in sync. 
*/ @@ -1065,7 +1362,7 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder) break; case INTEL_PT_CBR: - decoder->cbr = decoder->packet.payload; + intel_pt_calc_cbr(decoder); break; case INTEL_PT_MODE_EXEC: @@ -1182,6 +1479,7 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder) break; case INTEL_PT_CYC: + intel_pt_calc_cyc_timestamp(decoder); break; case INTEL_PT_MODE_EXEC: @@ -1318,10 +1616,11 @@ next: break; case INTEL_PT_CYC: + intel_pt_calc_cyc_timestamp(decoder); break; case INTEL_PT_CBR: - decoder->cbr = decoder->packet.payload; + intel_pt_calc_cbr(decoder); break; case INTEL_PT_MODE_EXEC: @@ -1398,10 +1697,11 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) break; case INTEL_PT_CYC: + intel_pt_calc_cyc_timestamp(decoder); break; case INTEL_PT_CBR: - decoder->cbr = decoder->packet.payload; + intel_pt_calc_cbr(decoder); break; case INTEL_PT_PIP: @@ -1493,10 +1793,11 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) break; case INTEL_PT_CYC: + intel_pt_calc_cyc_timestamp(decoder); break; case INTEL_PT_CBR: - decoder->cbr = decoder->packet.payload; + intel_pt_calc_cbr(decoder); break; case INTEL_PT_PIP: -- cgit v0.10.2 From 0de802abd14abdf8cbbba28b421a1a00fa0939d5 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:58 +0300 Subject: perf tools: Add Intel PT support for using CYC packets CYC packets are a new Intel PT feature. CYC packets provide even finer grain timestamp information than MTC and TSC packets. A CYC packet contains the number of CPU cycles since the last CYC packet. Unlike MTC and TSC packets, CYC packets are only sent when another packet is also sent. Support for this feature is indicated by: /sys/bus/event_source/devices/intel_pt/caps/psb_cyc which contains "1" if the feature is supported and "0" otherwise. CYC packets can be requested using a PMU config term e.g. perf record -e intel_pt/cyc/u sleep 1 The frequency of CYC packets can also be specified. e.g. perf record -e intel_pt/cyc,cyc_thresh=2/u sleep 1 CYC packets are not requested by default. Valid cyc_thresh values are given by: /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds which contains a hexadecimal value, the bits of which represent valid values e.g. bit 2 set means value 2 is valid. The value represents the minimum number of CPU cycles that must have passed before a CYC packet can be sent. The number of CPU cycles is: 2 ^ (value - 1) e.g. value 4 means 8 CPU cycles must pass before a CYC packet can be sent. Note a CYC packet is still only sent when another packet is sent, not at, e.g. every 8 CPU cycles. If an invalid value is entered, the error message will give a list of valid values e.g. $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname Invalid cyc_thresh for intel_pt. Valid values are: 0-12 tools/perf/Documentation/intel-pt.txt is updated in a later patch as there are a number of new features being added. For more information refer to the June 2015 or later Intel 64 and IA-32 Architectures SDM Chapter 36 Intel Processor Trace. 
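A worked example of the cyc_thresh conversion above, as a hedged C sketch (not from the patch): cyc_thresh_min_cycles(4) returns 2 ^ (4 - 1) = 8, matching the example in the text.

  #include <stdint.h>

  /* Hypothetical helper: minimum CPU cycles implied by cyc_thresh;
   * value 0 is taken to mean "no threshold". */
  static uint64_t cyc_thresh_min_cycles(unsigned int value)
  {
          return value ? (uint64_t)1 << (value - 1) : 0;
  }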
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-24-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index a5de01d..2ca10d7 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -475,6 +475,12 @@ static int intel_pt_validate_config(struct perf_pmu *intel_pt_pmu, if (!evsel) return 0; + err = intel_pt_val_config_term(intel_pt_pmu, "caps/cycle_thresholds", + "cyc_thresh", "caps/psb_cyc", + evsel->attr.config); + if (err) + return err; + err = intel_pt_val_config_term(intel_pt_pmu, "caps/mtc_periods", "mtc_period", "caps/mtc", evsel->attr.config); -- cgit v0.10.2 From 7eacca3ebb03a4ee7bb41284aafeb19a54242621 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:33:59 +0300 Subject: perf tools: Add Intel PT support for decoding TRACESTOP packets A TRACESTOP packet is produced when an Intel PT trace enters a defined region of the address space at which point the tracing stops. This patch just adds decoder support. Support for specifying TRACESTOP regions is left until later. For details refer to the June 2015 or later Intel 64 and IA-32 Architectures SDM Chapter 36 Intel Processor Trace. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-25-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 0845c5e..22ba502 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -1572,6 +1572,10 @@ next: return intel_pt_walk_fup_tip(decoder); case INTEL_PT_TRACESTOP: + decoder->pge = false; + decoder->continuous_period = false; + intel_pt_clear_tx_flags(decoder); + decoder->have_tma = false; break; case INTEL_PT_PSB: @@ -1717,6 +1721,9 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder) break; case INTEL_PT_TRACESTOP: + decoder->pge = false; + decoder->continuous_period = false; + intel_pt_clear_tx_flags(decoder); case INTEL_PT_TNT: decoder->have_tma = false; intel_pt_log("ERROR: Unexpected packet\n"); @@ -1819,6 +1826,10 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder) return intel_pt_bug(decoder); case INTEL_PT_TRACESTOP: + decoder->pge = false; + decoder->continuous_period = false; + intel_pt_clear_tx_flags(decoder); + decoder->have_tma = false; break; case INTEL_PT_PSB: -- cgit v0.10.2 From 9d1bf02ac3d41367896b38793db6f8f30bb9a295 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 17 Jul 2015 19:34:00 +0300 Subject: perf tools: Update Intel PT documentation Update Intel PT documentation to describe new features. Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1437150840-31811-26-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index 2866b62..4a0501d 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -142,19 +142,21 @@ which is the same as -e intel_pt/tsc=1,noretcomp=0/ +Note there are now new config terms - see section 'config terms' further below. + The config terms are listed in /sys/devices/intel_pt/format. 
They are bit fields within the config member of the struct perf_event_attr which is passed to the kernel by the perf_event_open system call. They correspond to bit fields in the IA32_RTIT_CTL MSR. Here is a list of them and their definitions: - $ for f in `ls /sys/devices/intel_pt/format`;do - > echo $f - > cat /sys/devices/intel_pt/format/$f - > done - noretcomp - config:11 - tsc - config:10 + $ grep -H . /sys/bus/event_source/devices/intel_pt/format/* + /sys/bus/event_source/devices/intel_pt/format/cyc:config:1 + /sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22 + /sys/bus/event_source/devices/intel_pt/format/mtc:config:9 + /sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17 + /sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11 + /sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27 + /sys/bus/event_source/devices/intel_pt/format/tsc:config:10 Note that the default config must be overridden for each term i.e. @@ -209,9 +211,185 @@ perf_event_attr is displayed if the -vv option is used e.g. ------------------------------------------------------------ +config terms +------------ + +The June 2015 version of Intel 64 and IA-32 Architectures Software Developer +Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features. +Some of the features are reflect in new config terms. All the config terms are +described below. + +tsc Always supported. Produces TSC timestamp packets to provide + timing information. In some cases it is possible to decode + without timing information, for example a per-thread context + that does not overlap executable memory maps. + + The default config selects tsc (i.e. tsc=1). + +noretcomp Always supported. Disables "return compression" so a TIP packet + is produced when a function returns. Causes more packets to be + produced but might make decoding more reliable. + + The default config does not select noretcomp (i.e. noretcomp=0). + +psb_period Allows the frequency of PSB packets to be specified. + + The PSB packet is a synchronization packet that provides a + starting point for decoding or recovery from errors. + + Support for psb_period is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_cyc + + which contains "1" if the feature is supported and "0" + otherwise. + + Valid values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_periods + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The psb_period value is converted to the approximate number of + trace bytes between PSB packets as: + + 2 ^ (value + 11) + + e.g. value 3 means 16KiB bytes between PSBs + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/psb_period=15/u uname + Invalid psb_period for intel_pt. Valid values are: 0-5 + + If MTC packets are selected, the default config selects a value + of 3 (i.e. psb_period=3) or the nearest lower value that is + supported (0 is always supported). Otherwise the default is 0. + + If decoding is expected to be reliable and the buffer is large + then a large PSB period can be used. + + Because a TSC packet is produced with PSB, the PSB period can + also affect the granularity to timing information in the absence + of MTC or CYC. + +mtc Produces MTC timing packets. + + MTC packets provide finer grain timestamp information than TSC + packets. 
MTC packets record time using the hardware crystal + clock (CTC) which is related to TSC packets using a TMA packet. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/mtc + + which contains "1" if the feature is supported and + "0" otherwise. + + The frequency of MTC packets can also be specified - see + mtc_period below. + +mtc_period Specifies how frequently MTC packets are produced - see mtc + above for how to determine if MTC packets are supported. + + Valid values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/mtc_periods + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The mtc_period value is converted to the MTC frequency as: + + CTC-frequency / (2 ^ value) + + e.g. value 3 means one eighth of CTC-frequency + + Where CTC is the hardware crystal clock, the frequency of which + can be related to TSC via values provided in cpuid leaf 0x15. + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/mtc_period=15/u uname + Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 + + The default value is 3 or the nearest lower value + that is supported (0 is always supported). + +cyc Produces CYC timing packets. + + CYC packets provide even finer grain timestamp information than + MTC and TSC packets. A CYC packet contains the number of CPU + cycles since the last CYC packet. Unlike MTC and TSC packets, + CYC packets are only sent when another packet is also sent. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_cyc + + which contains "1" if the feature is supported and + "0" otherwise. + + The number of CYC packets produced can be reduced by specifying + a threshold - see cyc_thresh below. + +cyc_thresh Specifies how frequently CYC packets are produced - see cyc + above for how to determine if CYC packets are supported. + + Valid cyc_thresh values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The cyc_thresh value represents the minimum number of CPU cycles + that must have passed before a CYC packet can be sent. The + number of CPU cycles is: + + 2 ^ (value - 1) + + e.g. value 4 means 8 CPU cycles must pass before a CYC packet + can be sent. Note a CYC packet is still only sent when another + packet is sent, not at, e.g. every 8 CPU cycles. + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname + Invalid cyc_thresh for intel_pt. Valid values are: 0-12 + + CYC packets are not requested by default. + +no_force_psb This is a driver option and is not in the IA32_RTIT_CTL MSR. + + It stops the driver resetting the byte count to zero whenever + enabling the trace (for example on context switches) which in + turn results in no PSB being forced. However some processors + will produce a PSB anyway. + + In any case, there is still a PSB when the trace is enabled for + the first time. + + no_force_psb can be used to slightly decrease the trace size but + may make it harder for the decoder to recover from errors. + + no_force_psb is not selected by default. 
+ + new snapshot option ------------------- +The difference between full trace and snapshot from the kernel's perspective is +that in full trace we don't overwrite trace data that the user hasn't collected +yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let +the trace run and overwrite older data in the buffer so that whenever something +interesting happens, we can stop it and grab a snapshot of what was going on +around that interesting moment. + To select snapshot mode a new option has been added: -S -- cgit v0.10.2 From e486367f0101d43a044404ed2fa35aa043b19667 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 25 Aug 2015 13:27:35 +0000 Subject: perf probe: Prevent segfault when reading probe point with absolute address 'perf probe -l' panics if there is a manually inserted probing point with an absolute address. For example: # echo 'p:probe/abs_ffffffff811e6615 0xffffffff811e6615' > /sys/kernel/debug/tracing/kprobe_events # perf probe -l Segmentation fault (core dumped) This patch fixes the problem by considering the situation that "tp->symbol == NULL" in find_perf_probe_point_from_dwarf() and find_perf_probe_point_from_map(). After this patch: # perf probe -l probe:abs_ffffffff811e6615 (on SyS_write+5@fs/read_write.c) And when debug info is missing: # rm -rf ~/.debug # mv /lib/modules/4.2.0-rc1+/build/vmlinux /lib/modules/4.2.0-rc1+/build/vmlinux.bak # perf probe -l probe:abs_ffffffff811e6615 (on sys_write+5) Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440509256-193590-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index f07374b..6c7e538 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -515,7 +515,7 @@ static int find_perf_probe_point_from_dwarf(struct probe_trace_point *tp, if (ret < 0) goto error; addr += stext; - } else { + } else if (tp->symbol) { addr = kernel_get_symbol_address_by_name(tp->symbol, false); if (addr == 0) goto error; @@ -1815,17 +1815,17 @@ static int find_perf_probe_point_from_map(struct probe_trace_point *tp, { struct symbol *sym = NULL; struct map *map; - u64 addr; + u64 addr = tp->address; int ret = -ENOENT; if (!is_kprobe) { map = dso__new_map(tp->module); if (!map) goto out; - addr = tp->address; sym = map__find_symbol(map, addr, NULL); } else { - addr = kernel_get_symbol_address_by_name(tp->symbol, true); + if (tp->symbol) + addr = kernel_get_symbol_address_by_name(tp->symbol, true); if (addr) { addr += tp->offset; sym = __find_kernel_function(addr, &map); -- cgit v0.10.2 From 18b9a05868391e8f617febb4528bc1765dc921cf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Aug 2015 10:07:50 +0200 Subject: perf tools: Remove export.h from MANIFEST We don't carry an export.h wrapper anymore, so remove it from the MANIFEST file to avoid breaking the make perf-tar targets. 
Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/20150826080750.GD22670@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index f31f15a..af009bd 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -41,7 +41,6 @@ tools/include/asm-generic/bitops.h tools/include/linux/atomic.h tools/include/linux/bitops.h tools/include/linux/compiler.h -tools/include/linux/export.h tools/include/linux/hash.h tools/include/linux/kernel.h tools/include/linux/list.h -- cgit v0.10.2 From 0bdede8a3e4ff7710622a2e6713d04b3243c24a8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Aug 2015 15:01:03 +0200 Subject: tools build: Allow duplicate objects in the object list It's sometimes useful to specify the object affiliation to multiple config options like: libperf-$(CONFIG_X86) += tsc.o libperf-$(CONFIG_AUXTRACE) += tsc.o while the object itself is linked only once. Add support for this by ignoring duplicate objects in the object list. Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/20150826130103.GF22670@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/Documentation/Build.txt b/tools/build/Documentation/Build.txt index 00ad2d6..aa5e092 100644 --- a/tools/build/Documentation/Build.txt +++ b/tools/build/Documentation/Build.txt @@ -66,6 +66,7 @@ To follow the above example, the user provides following 'Build' files: ex/Build: ex-y += a.o ex-y += b.o + ex-y += b.o # duplicates in the lists are allowed libex-y += c.o libex-y += d.o diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build index 8120af9..0c5f485 100644 --- a/tools/build/Makefile.build +++ b/tools/build/Makefile.build @@ -63,7 +63,7 @@ quiet_cmd_gen = GEN $@ # If there's nothing to link, create empty $@ object. quiet_cmd_ld_multi = LD $@ cmd_ld_multi = $(if $(strip $(obj-y)),\ - $(LD) -r -o $@ $(obj-y),rm -f $@; $(AR) rcs $@) + $(LD) -r -o $@ $(filter $(obj-y),$^),rm -f $@; $(AR) rcs $@) # Build rules $(OUTPUT)%.o: %.c FORCE diff --git a/tools/build/tests/ex/Build b/tools/build/tests/ex/Build index 70d8762..429c7d4 100644 --- a/tools/build/tests/ex/Build +++ b/tools/build/tests/ex/Build @@ -1,6 +1,7 @@ ex-y += ex.o ex-y += a.o ex-y += b.o +ex-y += b.o ex-y += empty/ ex-y += empty2/ -- cgit v0.10.2 From 614e2fdbd79b47ddf63a1ccd3f2343e98c3077fb Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:42 +0000 Subject: perf probe: Fix list result when symbol can't be found 'perf probe -l' reports an error if it is unable to find a symbol by its address. Here is an example. # echo 'p:probe_libc/abs_5 /lib64/libc.so.6:0x5' > /sys/kernel/debug/tracing/uprobe_events # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_5 /lib64/libc.so.6:0x0000000000000005 # perf probe -l Error: Failed to show event list Also, this situation exposes a logical inconsistency in convert_to_perf_probe_point(): it returns ENOMEM but actually never tries strdup(). This patch removes the !tp->module && !is_kprobe condition, so it always uses the address to build the function name if the symbol is not found. 
Test result: # perf probe -l probe_libc:abs_5 (on 0x5 in /lib64/libc.so.6) Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-4-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 6c7e538..f7bacbb 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1866,7 +1866,7 @@ static int convert_to_perf_probe_point(struct probe_trace_point *tp, if (tp->symbol) { pp->function = strdup(tp->symbol); pp->offset = tp->offset; - } else if (!tp->module && !is_kprobe) { + } else { ret = e_snprintf(buf, 128, "0x%" PRIx64, (u64)tp->address); if (ret < 0) return ret; -- cgit v0.10.2 From be07afe92a09638db9159d2c0794487d66a437a2 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:43 +0000 Subject: perf probe: Fix list result when address is zero When a uprobe point is manually added with a zero address, 'perf probe -l' reports an error. For example: # echo p:probe_libc/abs_0 /path/to/lib.bin:0x0 arg1=%ax > \ /sys/kernel/debug/tracing/uprobe_events # perf probe -l Error: Failed to show event list. Probing at 0x0 is possible and useful when lib.bin is not a normal shared object but is manually mapped. However, in this case the kernel reports: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x (null) arg1=%ax This patch supports the above kernel output. Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-5-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index f7bacbb..926bcec 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1519,9 +1519,31 @@ int parse_probe_trace_command(const char *cmd, struct probe_trace_event *tev) } else p = argv[1]; fmt1_str = strtok_r(p, "+", &fmt); - if (fmt1_str[0] == '0') /* only the address started with 0x */ - tp->address = strtoul(fmt1_str, NULL, 0); - else { + /* only the address started with 0x */ + if (fmt1_str[0] == '0') { + /* + * Fix a special case: + * if address == 0, kernel reports something like: + * p:probe_libc/abs_0 /lib/libc-2.18.so:0x (null) arg1=%ax + * Newer kernel may fix that, but we want to + * support old kernel also. + */ + if (strcmp(fmt1_str, "0x") == 0) { + if (!argv[2] || strcmp(argv[2], "(null)")) { + ret = -EINVAL; + goto out; + } + tp->address = 0; + + free(argv[2]); + for (i = 2; argv[i + 1] != NULL; i++) + argv[i] = argv[i + 1]; + + argv[i] = NULL; + argc -= 1; + } else + tp->address = strtoul(fmt1_str, NULL, 0); + } else { /* Only the symbol-based probe has offset */ tp->symbol = strdup(fmt1_str); if (tp->symbol == NULL) { -- cgit v0.10.2 From 6c6e024f0a62a6a08c06002fd3caa2307cc54fd0 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:44 +0000 Subject: perf probe: Fix error reported when offset without function This patch fixes a bug where, when an offset is provided but the function is missing, parse_perf_probe_point() gives a "" string as the function name, so the checking code at the end of parse_perf_probe_point() becomes useless. For example: # perf probe +0x1234 Failed to find symbol in kernel Error: Failed to add events. After this patch: # perf probe +0x1234 Semantic error :Offset requires an entry function. 
Error: Command Parse Error. Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-6-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 926bcec..eaacb58 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1194,9 +1194,13 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev) *ptr++ = '\0'; } - tmp = strdup(arg); - if (tmp == NULL) - return -ENOMEM; + if (arg[0] == '\0') + tmp = NULL; + else { + tmp = strdup(arg); + if (tmp == NULL) + return -ENOMEM; + } if (file_spec) pp->file = tmp; -- cgit v0.10.2 From da15bd9df4afd2f9f78cf29f85f013e3a38402b5 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:45 +0000 Subject: perf probe: Support probing at absolute address It should be useful to allow 'perf probe' to probe at an absolute offset of a target, for example, when (u)probing at an instruction of a shared object in an embedded system where debuginfo is not available but we know the offset of that instruction from manual digging. This patch enables the following perf probe command syntax: # perf probe 0xffffffff811e6615 And # perf probe /lib/x86_64-linux-gnu/libc-2.19.so 0xeb860 In the above example, we don't need an anchor symbol, so it is possible to compute absolute addresses using other methods and then use 'perf probe' to create the probing points. v1 -> v2: Drop the leading '+' in cmdline; Allow uprobing at offset 0x0; Improve 'perf probe -l' result when uprobing at an area without debuginfo. v2 -> v3: Split the bugfix into a separate patch. Test result: # perf probe 0xffffffff8119d175 %ax # perf probe sys_write %ax # perf probe /lib64/libc-2.18.so 0x0 %ax # perf probe /lib64/libc-2.18.so 0x5 %ax # perf probe /lib64/libc-2.18.so 0xd8e40 %ax # perf probe /lib64/libc-2.18.so __write %ax # perf probe /lib64/libc-2.18.so 0xd8e49 %ax # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /lib64/libc-2.18.so:0x (null) arg1=%ax p:probe_libc/abs_5 /lib64/libc-2.18.so:0x0000000000000005 arg1=%ax p:probe_libc/abs_d8e40 /lib64/libc-2.18.so:0x00000000000d8e40 arg1=%ax p:probe_libc/__write /lib64/libc-2.18.so:0x00000000000d8e40 arg1=%ax p:probe_libc/abs_d8e49 /lib64/libc-2.18.so:0x00000000000d8e49 arg1=%ax # cat /sys/kernel/debug/tracing/kprobe_events p:probe/abs_ffffffff8119d175 0xffffffff8119d175 arg1=%ax p:probe/sys_write _text+1692016 arg1=%ax # perf probe -l Failed to find debug information for address 5 probe:abs_ffffffff8119d175 (on sys_write+5 with arg1) probe:sys_write (on sys_write with arg1) probe_libc:__write (on @unix/syscall-template.S:81 in /lib64/libc-2.18.so with arg1) probe_libc:abs_0 (on 0x0 in /lib64/libc-2.18.so with arg1) probe_libc:abs_5 (on 0x5 in /lib64/libc-2.18.so with arg1) probe_libc:abs_d8e40 (on @unix/syscall-template.S:81 in /lib64/libc-2.18.so with arg1) probe_libc:abs_d8e49 (on __GI___libc_write+9 in /lib64/libc-2.18.so with arg1) Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-7-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index eaacb58..eb5f18b 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -1204,9 +1204,27 @@ static int 
parse_perf_probe_point(char *arg, struct perf_probe_event *pev) if (file_spec) pp->file = tmp; - else + else { pp->function = tmp; + /* + * Keep pp->function even if this is absolute address, + * so it can mark whether abs_address is valid. + * Which make 'perf probe lib.bin 0x0' possible. + * + * Note that checking length of tmp is not needed + * because when we access tmp[1] we know tmp[0] is '0', + * so tmp[1] should always valid (but could be '\0'). + */ + if (tmp && !strncmp(tmp, "0x", 2)) { + pp->abs_address = strtoul(pp->function, &tmp, 0); + if (*tmp != '\0') { + semantic_error("Invalid absolute address.\n"); + return -EINVAL; + } + } + } + /* Parse other options */ while (ptr) { arg = ptr; @@ -1804,14 +1822,29 @@ char *synthesize_probe_trace_command(struct probe_trace_event *tev) if (len <= 0) goto error; - /* Uprobes must have tp->address and tp->module */ - if (tev->uprobes && (!tp->address || !tp->module)) + /* Uprobes must have tp->module */ + if (tev->uprobes && !tp->module) goto error; + /* + * If tp->address == 0, then this point must be a + * absolute address uprobe. + * try_to_find_absolute_address() should have made + * tp->symbol to "0x0". + */ + if (tev->uprobes && !tp->address) { + if (!tp->symbol || strcmp(tp->symbol, "0x0")) + goto error; + } /* Use the tp->address for uprobes */ if (tev->uprobes) ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s:0x%lx", tp->module, tp->address); + else if (!strncmp(tp->symbol, "0x", 2)) + /* Absolute address. See try_to_find_absolute_address() */ + ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s%s0x%lx", + tp->module ?: "", tp->module ? ":" : "", + tp->address); else ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s%s%s+%lu", tp->module ?: "", tp->module ? ":" : "", @@ -1874,8 +1907,8 @@ out: } static int convert_to_perf_probe_point(struct probe_trace_point *tp, - struct perf_probe_point *pp, - bool is_kprobe) + struct perf_probe_point *pp, + bool is_kprobe) { char buf[128]; int ret; @@ -2331,7 +2364,9 @@ static int probe_trace_event__set_name(struct probe_trace_event *tev, if (pev->event) event = pev->event; else - if (pev->point.function && !strisglob(pev->point.function)) + if (pev->point.function && + (strncmp(pev->point.function, "0x", 2) != 0) && + !strisglob(pev->point.function)) event = pev->point.function; else event = tev->point.realname; @@ -2598,6 +2633,98 @@ err_out: goto out; } +static int try_to_find_absolute_address(struct perf_probe_event *pev, + struct probe_trace_event **tevs) +{ + struct perf_probe_point *pp = &pev->point; + struct probe_trace_event *tev; + struct probe_trace_point *tp; + int i, err; + + if (!(pev->point.function && !strncmp(pev->point.function, "0x", 2))) + return -EINVAL; + if (perf_probe_event_need_dwarf(pev)) + return -EINVAL; + + /* + * This is 'perf probe /lib/libc.so 0xabcd'. Try to probe at + * absolute address. + * + * Only one tev can be generated by this. + */ + *tevs = zalloc(sizeof(*tev)); + if (!*tevs) + return -ENOMEM; + + tev = *tevs; + tp = &tev->point; + + /* + * Don't use tp->offset, use address directly, because + * in synthesize_probe_trace_command() address cannot be + * zero. + */ + tp->address = pev->point.abs_address; + tp->retprobe = pp->retprobe; + tev->uprobes = pev->uprobes; + + err = -ENOMEM; + /* + * Give it a '0x' leading symbol name. + * In __add_probe_trace_events, a NULL symbol is interpreted as + * invalud. 
+ */ + if (asprintf(&tp->symbol, "0x%lx", tp->address) < 0) + goto errout; + + /* For kprobe, check range */ + if ((!tev->uprobes) && + (kprobe_warn_out_range(tev->point.symbol, + tev->point.address))) { + err = -EACCES; + goto errout; + } + + if (asprintf(&tp->realname, "abs_%lx", tp->address) < 0) + goto errout; + + if (pev->target) { + tp->module = strdup(pev->target); + if (!tp->module) + goto errout; + } + + if (tev->group) { + tev->group = strdup(pev->group); + if (!tev->group) + goto errout; + } + + if (pev->event) { + tev->event = strdup(pev->event); + if (!tev->event) + goto errout; + } + + tev->nargs = pev->nargs; + tev->args = zalloc(sizeof(struct probe_trace_arg) * tev->nargs); + if (!tev->args) { + err = -ENOMEM; + goto errout; + } + for (i = 0; i < tev->nargs; i++) + copy_to_probe_trace_arg(&tev->args[i], &pev->args[i]); + + return 1; + +errout: + if (*tevs) { + clear_probe_trace_events(*tevs, 1); + *tevs = NULL; + } + return err; +} + bool __weak arch__prefers_symtab(void) { return false; } static int convert_to_probe_trace_events(struct perf_probe_event *pev, @@ -2614,6 +2741,10 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev, } } + ret = try_to_find_absolute_address(pev, tevs); + if (ret > 0) + return ret; + if (arch__prefers_symtab() && !perf_probe_event_need_dwarf(pev)) { ret = find_probe_trace_events_from_map(pev, tevs); if (ret > 0) @@ -2784,3 +2915,22 @@ end: return ret; } +int copy_to_probe_trace_arg(struct probe_trace_arg *tvar, + struct perf_probe_arg *pvar) +{ + tvar->value = strdup(pvar->var); + if (tvar->value == NULL) + return -ENOMEM; + if (pvar->type) { + tvar->type = strdup(pvar->type); + if (tvar->type == NULL) + return -ENOMEM; + } + if (pvar->name) { + tvar->name = strdup(pvar->name); + if (tvar->name == NULL) + return -ENOMEM; + } else + tvar->name = NULL; + return 0; +} diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 83ee95e..6e7ec68 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -59,6 +59,7 @@ struct perf_probe_point { bool retprobe; /* Return probe flag */ char *lazy_line; /* Lazy matching pattern */ unsigned long offset; /* Offset from function entry */ + unsigned long abs_address; /* Absolute address of the point */ }; /* Perf probe probing argument field chain */ @@ -156,4 +157,7 @@ int e_snprintf(char *str, size_t size, const char *format, ...) 
/* Maximum index number of event-name postfix */ #define MAX_EVENT_INDEX 1024 +int copy_to_probe_trace_arg(struct probe_trace_arg *tvar, + struct perf_probe_arg *pvar); + #endif /*_PROBE_EVENT_H */ diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 7b80f8c..29c43c068 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -553,24 +553,9 @@ static int find_variable(Dwarf_Die *sc_die, struct probe_finder *pf) char buf[32], *ptr; int ret = 0; - if (!is_c_varname(pf->pvar->var)) { - /* Copy raw parameters */ - pf->tvar->value = strdup(pf->pvar->var); - if (pf->tvar->value == NULL) - return -ENOMEM; - if (pf->pvar->type) { - pf->tvar->type = strdup(pf->pvar->type); - if (pf->tvar->type == NULL) - return -ENOMEM; - } - if (pf->pvar->name) { - pf->tvar->name = strdup(pf->pvar->name); - if (pf->tvar->name == NULL) - return -ENOMEM; - } else - pf->tvar->name = NULL; - return 0; - } + /* Copy raw parameters */ + if (!is_c_varname(pf->pvar->var)) + return copy_to_probe_trace_arg(pf->tvar, pf->pvar); if (pf->pvar->name) pf->tvar->name = strdup(pf->pvar->name); -- cgit v0.10.2 From a2fb3382edbea83c6f2bf6ac15e3673b2e254aad Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 26 Aug 2015 10:57:46 +0000 Subject: tracing/uprobes: Do not print '0x (null)' when offset is 0 When manually added uprobe point with zero address, 'uprobe_events' output '(null)' instead of 0x00000000: # echo p:probe_libc/abs_0 /path/to/lib.bin:0x0 arg1=%ax > \ /sys/kernel/debug/tracing/uprobe_events # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x (null) arg1=%ax This patch fixes this behavior: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_libc/abs_0 /path/to/lib.bin:0x0000000000000000 Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Steven Rostedt Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1440586666-235233-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f97479f..d2f6d0b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + seq_printf(m, " %s:", tu->filename); + + /* Don't print "0x (null)" when offset is 0 */ + if (tu->offset) { + seq_printf(m, "0x%p", (void *)tu->offset); + } else { + switch (sizeof(void *)) { + case 4: + seq_printf(m, "0x00000000"); + break; + case 8: + default: + seq_printf(m, "0x0000000000000000"); + break; + } + } for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); -- cgit v0.10.2 From 77e0070da41f76e1ebd15291fb0254b0c174adfa Mon Sep 17 00:00:00 2001 From: Mark Drayton Date: Wed, 26 Aug 2015 12:18:15 -0700 Subject: perf script: Add --[no-]-demangle/--[no-]-demangle-kernel Sometimes when post-processing output from `perf script` one does not want to demangle C++ symbol names. Add an option to allow this. Also add --[no-]demangle-kernel to be consistent with top/report/probe. 
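For example, after this change symbol names can be kept mangled for post-processing, or kernel symbols demangled for C++ kernels:

  $ perf script --no-demangle
  $ perf script --demangle-kernel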
Signed-off-by: Mark Drayton Cc: Andi Kleen Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1440616695-32340-1-git-send-email-scientist@fb.com Signed-off-by: Yannick Brosseau Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index c0d2479..614b2c7 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -226,6 +226,13 @@ OPTIONS Display context switch events i.e. events of type PERF_RECORD_SWITCH or PERF_RECORD_SWITCH_CPU_WIDE. +--demangle:: + Demangle symbol names to human readable form. It's enabled by default, + disable with --no-demangle. + +--demangle-kernel:: + Demangle kernel symbol names to human readable form (for C++ kernels). + --header Show perf.data header. diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 105332e..4430340 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1671,6 +1671,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) itrace_parse_synth_opts), OPT_BOOLEAN(0, "full-source-path", &srcline_full_filename, "Show full source file name path for source lines"), + OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle, + "Enable symbol demangling"), + OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel, + "Enable kernel symbol demangling"), + OPT_END() }; const char * const script_subcommands[] = { "record", "report", NULL }; -- cgit v0.10.2 From 1d945012d1381f0232ea2d66e32b06182eedb476 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Aug 2015 09:46:01 -0400 Subject: tools lib traceevent: Allow for negative numbers in print format It was reported that "%-8s" does not parse well when used in the printk format. The '-' is what is throwing it off. Allow that to be included. Reporter note: Example before: transhuge-stres-10730 [004] 5897.713989: mm_compaction_finished: node=0 zone=>-<8s order=-2119871790 ret= Example after: transhuge-stres-4235 [000] 453.149280: mm_compaction_finished: node=0 zone=ffffffff81815d7a order=9 ret= (I will send patches to fix the string handling in the tracepoints so it's on par with in-kernel printing via trace_pipe:) transhuge-stres-10921 [007] ...1 6307.140205: mm_compaction_finished: node=0 zone=Normal order=9 ret=partial Reported-by: Vlastimil Babka Signed-off-by: Steven Rostedt Tested-by: Vlastimil Babka Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20150827094601.46518bcc@gandalf.local.home Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 5c1867a..4d88593 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -4828,6 +4828,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event case 'z': case 'Z': case '0' ... '9': + case '-': goto cont_process; case 'p': if (pevent->long_size == 4) -- cgit v0.10.2 From 601083cffb7cabdcc55b8195d732f0f7028570fa Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 2 Jul 2015 03:08:43 -0400 Subject: perf stat: Get correct cpu id for print_aggr print_aggr() fails to print per-core/per-socket statistics after commit 582ec0829b3d ("perf stat: Fix per-socket output bug for uncore events") if events have different cpus. This is because in print_aggr(), aggr_get_id needs the index (not the cpu id) to find the core/pkg id. Also, evsel cpu maps should be used to get the aggregated id. Here is an example: Counting events cycles,uncore_imc_0/cas_count_read/. 
(Uncore event has cpumask 0,18) $ perf stat -e cycles,uncore_imc_0/cas_count_read/ -C0,18 --per-core sleep 2 Without this patch, it fails to get the CPU 18 result. Performance counter stats for 'CPU(s) 0,18': S0-C0 1 7526851 cycles S0-C0 1 1.05 MiB uncore_imc_0/cas_count_read/ S1-C0 0 cycles S1-C0 0 MiB uncore_imc_0/cas_count_read/ With this patch, it can get both the CPU0 and CPU18 results. Performance counter stats for 'CPU(s) 0,18': S0-C0 1 6327768 cycles S0-C0 1 0.47 MiB uncore_imc_0/cas_count_read/ S1-C0 1 330228 cycles S1-C0 1 0.29 MiB uncore_imc_0/cas_count_read/ Signed-off-by: Kan Liang Acked-by: Jiri Olsa Acked-by: Stephane Eranian Cc: Adrian Hunter Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Fixes: 582ec0829b3d ("perf stat: Fix per-socket output bug for uncore events") Link: http://lkml.kernel.org/r/1435820925-51091-1-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d99d850..ef355fc 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -694,7 +694,7 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) static void print_aggr(char *prefix) { struct perf_evsel *counter; - int cpu, cpu2, s, s2, id, nr; + int cpu, s, s2, id, nr; double uval; u64 ena, run, val; @@ -707,8 +707,7 @@ static void print_aggr(char *prefix) val = ena = run = 0; nr = 0; for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { - cpu2 = perf_evsel__cpus(counter)->map[cpu]; - s2 = aggr_get_id(evsel_list->cpus, cpu2); + s2 = aggr_get_id(perf_evsel__cpus(counter), cpu); if (s2 != id) continue; val += perf_counts(counter->counts, cpu, 0)->val; -- cgit v0.10.2 From 84e5d89a777cca26cc513264752b439874702dd6 Mon Sep 17 00:00:00 2001 From: Tony Jones Date: Mon, 17 Aug 2015 12:48:52 -0700 Subject: perf scripts python: Add new compaction-times script This patch creates a new script (compaction-times) to report the time spent in mm compaction. It is possible to report times in nanoseconds (default) or microseconds (-u). The option -p will break down results by process id, while -pv will further decompose by each compaction entry/exit. For each compaction entry/exit, what is reported is controlled by the options: -t report only timing -m report migration stats -ms report migration scanner stats -fs report free scanner stats The default is to report all. Entries may be further filtered by pid, pid-range or comm (regex). The script is useful when analysing workloads that compact memory. The most common example will be THP allocations on systems with a lot of uptime that have fragmented memory. 
This is an example of using the script to analyse the thpscale workload from mmtests, which deliberately fragments memory and allocates THP in 4 separate threads: # Recording step, one of the following: $ perf record -e 'compaction:mm_compaction_*' ./workload # or: $ perf script record compaction-times # Reporting: basic total: 2444505743ns migration: moved=357738 failed=39275 free_scanner: scanned=2705578 isolated=387875 migration_scanner: scanned=414426 isolated=397013 # Reporting: Per task stall times $ perf script report compaction-times -- -t -p total: 2444505743ns 6384[thpscale]: 740800017ns 6385[thpscale]: 274119512ns 6386[thpscale]: 832961337ns 6383[thpscale]: 596624877ns # Reporting: Per-compaction attempts for task 6385 $ perf script report compaction-times -- -m -pv 6385 total: 274119512ns migration: moved=14893 failed=24285 6385[thpscale]: 274119512ns migration: moved=14893 failed=24285 6385[thpscale].1: 3033277ns migration: moved=511 failed=1 6385[thpscale].2: 9592094ns migration: moved=1524 failed=12 6385[thpscale].3: 2495587ns migration: moved=512 failed=0 6385[thpscale].4: 2561766ns migration: moved=512 failed=0 6385[thpscale].5: 2523521ns migration: moved=512 failed=0 ..... output continues ... Changes since v1: - report stats for isolate_migratepages and isolate_freepages (Vlastimil Babka) - refactor code to achieve above - add help text - output to stdout/stderr explicitly Signed-off-by: Tony Jones Cc: Mel Gorman Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/1439840932-8933-1-git-send-email-tonyj@suse.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/scripts/python/bin/compaction-times-record b/tools/perf/scripts/python/bin/compaction-times-record new file mode 100644 index 0000000..6edcd40 --- /dev/null +++ b/tools/perf/scripts/python/bin/compaction-times-record @@ -0,0 +1,2 @@ +#!/bin/bash +perf record -e compaction:mm_compaction_begin -e compaction:mm_compaction_end -e compaction:mm_compaction_migratepages -e compaction:mm_compaction_isolate_migratepages -e compaction:mm_compaction_isolate_freepages $@ diff --git a/tools/perf/scripts/python/bin/compaction-times-report b/tools/perf/scripts/python/bin/compaction-times-report new file mode 100644 index 0000000..3dc1389 --- /dev/null +++ b/tools/perf/scripts/python/bin/compaction-times-report @@ -0,0 +1,4 @@ +#!/bin/bash +#description: display time taken by mm compaction +#args: [-h] [-u] [-p|-pv] [-t | [-m] [-fs] [-ms]] [pid|pid-range|comm-regex] +perf script -s "$PERF_EXEC_PATH"/scripts/python/compaction-times.py $@ diff --git a/tools/perf/scripts/python/compaction-times.py b/tools/perf/scripts/python/compaction-times.py new file mode 100644 index 0000000..239cb05 --- /dev/null +++ b/tools/perf/scripts/python/compaction-times.py @@ -0,0 +1,311 @@ +# report time spent in compaction +# Licensed under the terms of the GNU GPL License version 2 + +# testing: +# 'echo 1 > /proc/sys/vm/compact_memory' to force compaction of all zones + +import os +import sys +import re + +import signal +signal.signal(signal.SIGPIPE, signal.SIG_DFL) + +usage = "usage: perf script report compaction-times.py -- [-h] [-u] [-p|-pv] [-t | [-m] [-fs] [-ms]] [pid|pid-range|comm-regex]\n" + +class popt: + DISP_DFL = 0 + DISP_PROC = 1 + DISP_PROC_VERBOSE=2 + +class topt: + DISP_TIME = 0 + DISP_MIG = 1 + DISP_ISOLFREE = 2 + DISP_ISOLMIG = 4 + DISP_ALL = 7 + +class comm_filter: + def __init__(self, re): + self.re = re + + def filter(self, pid, comm): + m = self.re.search(comm) + return m == None or m.group() == "" + +class pid_filter: + def 
__init__(self, low, high): + self.low = (0 if low == "" else int(low)) + self.high = (0 if high == "" else int(high)) + + def filter(self, pid, comm): + return not (pid >= self.low and (self.high == 0 or pid <= self.high)) + +def set_type(t): + global opt_disp + opt_disp = (t if opt_disp == topt.DISP_ALL else opt_disp|t) + +def ns(sec, nsec): + return (sec * 1000000000) + nsec + +def time(ns): + return "%dns" % ns if opt_ns else "%dus" % (round(ns, -3) / 1000) + +class pair: + def __init__(self, aval, bval, alabel = None, blabel = None): + self.alabel = alabel + self.blabel = blabel + self.aval = aval + self.bval = bval + + def __add__(self, rhs): + self.aval += rhs.aval + self.bval += rhs.bval + return self + + def __str__(self): + return "%s=%d %s=%d" % (self.alabel, self.aval, self.blabel, self.bval) + +class cnode: + def __init__(self, ns): + self.ns = ns + self.migrated = pair(0, 0, "moved", "failed") + self.fscan = pair(0,0, "scanned", "isolated") + self.mscan = pair(0,0, "scanned", "isolated") + + def __add__(self, rhs): + self.ns += rhs.ns + self.migrated += rhs.migrated + self.fscan += rhs.fscan + self.mscan += rhs.mscan + return self + + def __str__(self): + prev = 0 + s = "%s " % time(self.ns) + if (opt_disp & topt.DISP_MIG): + s += "migration: %s" % self.migrated + prev = 1 + if (opt_disp & topt.DISP_ISOLFREE): + s += "%sfree_scanner: %s" % (" " if prev else "", self.fscan) + prev = 1 + if (opt_disp & topt.DISP_ISOLMIG): + s += "%smigration_scanner: %s" % (" " if prev else "", self.mscan) + return s + + def complete(self, secs, nsecs): + self.ns = ns(secs, nsecs) - self.ns + + def increment(self, migrated, fscan, mscan): + if (migrated != None): + self.migrated += migrated + if (fscan != None): + self.fscan += fscan + if (mscan != None): + self.mscan += mscan + + +class chead: + heads = {} + val = cnode(0); + fobj = None + + @classmethod + def add_filter(cls, filter): + cls.fobj = filter + + @classmethod + def create_pending(cls, pid, comm, start_secs, start_nsecs): + filtered = 0 + try: + head = cls.heads[pid] + filtered = head.is_filtered() + except KeyError: + if cls.fobj != None: + filtered = cls.fobj.filter(pid, comm) + head = cls.heads[pid] = chead(comm, pid, filtered) + + if not filtered: + head.mark_pending(start_secs, start_nsecs) + + @classmethod + def increment_pending(cls, pid, migrated, fscan, mscan): + head = cls.heads[pid] + if not head.is_filtered(): + if head.is_pending(): + head.do_increment(migrated, fscan, mscan) + else: + sys.stderr.write("missing start compaction event for pid %d\n" % pid) + + @classmethod + def complete_pending(cls, pid, secs, nsecs): + head = cls.heads[pid] + if not head.is_filtered(): + if head.is_pending(): + head.make_complete(secs, nsecs) + else: + sys.stderr.write("missing start compaction event for pid %d\n" % pid) + + @classmethod + def gen(cls): + if opt_proc != popt.DISP_DFL: + for i in cls.heads: + yield cls.heads[i] + + @classmethod + def str(cls): + return cls.val + + def __init__(self, comm, pid, filtered): + self.comm = comm + self.pid = pid + self.val = cnode(0) + self.pending = None + self.filtered = filtered + self.list = [] + + def __add__(self, rhs): + self.ns += rhs.ns + self.val += rhs.val + return self + + def mark_pending(self, secs, nsecs): + self.pending = cnode(ns(secs, nsecs)) + + def do_increment(self, migrated, fscan, mscan): + self.pending.increment(migrated, fscan, mscan) + + def make_complete(self, secs, nsecs): + self.pending.complete(secs, nsecs) + chead.val += self.pending + + if opt_proc != 
popt.DISP_DFL: + self.val += self.pending + + if opt_proc == popt.DISP_PROC_VERBOSE: + self.list.append(self.pending) + self.pending = None + + def enumerate(self): + if opt_proc == popt.DISP_PROC_VERBOSE and not self.is_filtered(): + for i, pelem in enumerate(self.list): + sys.stdout.write("%d[%s].%d: %s\n" % (self.pid, self.comm, i+1, pelem)) + + def is_pending(self): + return self.pending != None + + def is_filtered(self): + return self.filtered + + def display(self): + if not self.is_filtered(): + sys.stdout.write("%d[%s]: %s\n" % (self.pid, self.comm, self.val)) + + +def trace_end(): + sys.stdout.write("total: %s\n" % chead.str()) + for i in chead.gen(): + i.display(), + i.enumerate() + +def compaction__mm_compaction_migratepages(event_name, context, common_cpu, + common_secs, common_nsecs, common_pid, common_comm, + common_callchain, nr_migrated, nr_failed): + + chead.increment_pending(common_pid, + pair(nr_migrated, nr_failed), None, None) + +def compaction__mm_compaction_isolate_freepages(event_name, context, common_cpu, + common_secs, common_nsecs, common_pid, common_comm, + common_callchain, start_pfn, end_pfn, nr_scanned, nr_taken): + + chead.increment_pending(common_pid, + None, pair(nr_scanned, nr_taken), None) + +def compaction__mm_compaction_isolate_migratepages(event_name, context, common_cpu, + common_secs, common_nsecs, common_pid, common_comm, + common_callchain, start_pfn, end_pfn, nr_scanned, nr_taken): + + chead.increment_pending(common_pid, + None, None, pair(nr_scanned, nr_taken)) + +def compaction__mm_compaction_end(event_name, context, common_cpu, + common_secs, common_nsecs, common_pid, common_comm, + common_callchain, zone_start, migrate_start, free_start, zone_end, + sync, status): + + chead.complete_pending(common_pid, common_secs, common_nsecs) + +def compaction__mm_compaction_begin(event_name, context, common_cpu, + common_secs, common_nsecs, common_pid, common_comm, + common_callchain, zone_start, migrate_start, free_start, zone_end, + sync): + + chead.create_pending(common_pid, common_comm, common_secs, common_nsecs) + +def pr_help(): + global usage + + sys.stdout.write(usage) + sys.stdout.write("\n") + sys.stdout.write("-h display this help\n") + sys.stdout.write("-p display by process\n") + sys.stdout.write("-pv display by process (verbose)\n") + sys.stdout.write("-t display stall times only\n") + sys.stdout.write("-m display stats for migration\n") + sys.stdout.write("-fs display stats for free scanner\n") + sys.stdout.write("-ms display stats for migration scanner\n") + sys.stdout.write("-u display results in microseconds (default nanoseconds)\n") + + +comm_re = None +pid_re = None +pid_regex = "^(\d*)-(\d*)$|^(\d*)$" + +opt_proc = popt.DISP_DFL +opt_disp = topt.DISP_ALL + +opt_ns = True + +argc = len(sys.argv) - 1 +if argc >= 1: + pid_re = re.compile(pid_regex) + + for i, opt in enumerate(sys.argv[1:]): + if opt[0] == "-": + if opt == "-h": + pr_help() + exit(0); + elif opt == "-p": + opt_proc = popt.DISP_PROC + elif opt == "-pv": + opt_proc = popt.DISP_PROC_VERBOSE + elif opt == '-u': + opt_ns = False + elif opt == "-t": + set_type(topt.DISP_TIME) + elif opt == "-m": + set_type(topt.DISP_MIG) + elif opt == "-fs": + set_type(topt.DISP_ISOLFREE) + elif opt == "-ms": + set_type(topt.DISP_ISOLMIG) + else: + sys.exit(usage) + + elif i == argc - 1: + m = pid_re.search(opt) + if m != None and m.group() != "": + if m.group(3) != None: + f = pid_filter(m.group(3), m.group(3)) + else: + f = pid_filter(m.group(1), m.group(2)) + else: + try: + 
comm_re=re.compile(opt) + except: + sys.stderr.write("invalid regex '%s'" % opt) + sys.exit(usage) + f = comm_filter(comm_re) + + chead.add_filter(f) -- cgit v0.10.2 From a598bb5e3558509cbb8aa1d422071ef61fda9d77 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 28 Aug 2015 12:02:37 -0300 Subject: perf trace: Add header with copyright and background info Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-5yqtfs728r1j1u8zmg8ufxwm@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 2f1162d..4e3abba 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1,3 +1,21 @@ +/* + * builtin-trace.c + * + * Builtin 'trace' command: + * + * Display a continuously updated trace of any workload, CPU, specific PID, + * system wide, etc. Default format is loosely strace like, but any other + * event may be specified using --event. + * + * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo + * + * Initially based on the 'trace' prototype by Thomas Gleixner: + * + * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'") + * + * Released under the GPL v2. (and only v2, not any later version) + */ + #include #include "builtin.h" #include "util/color.h" @@ -27,6 +45,7 @@ #ifndef MADV_HWPOISON # define MADV_HWPOISON 100 + #endif #ifndef MADV_MERGEABLE -- cgit v0.10.2 From d49e4695077278ee3016cd242967de23072ec331 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 27 Aug 2015 08:07:40 -0400 Subject: perf evsel: Add a backpointer to the evlist an evsel is in So that functions that deal primarily with an evsel can access information that concerns the whole evlist it is in.
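As an illustration (a hypothetical helper, not part of this patch, relying only on the evsel->evlist field added in the hunks below), code that is handed just an evsel can now reach per-evlist state without taking an extra evlist parameter:

static int perf_evsel__evlist_nr_entries(struct perf_evsel *evsel)
{
	/* evsel->evlist stays NULL until the evsel is added to an evlist */
	return evsel->evlist ? evsel->evlist->nr_entries : 0;
}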
Cc: Adrian Hunter Cc: Andi Kleen Cc: Andy Lutomirski Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1440677263-21954-5-git-send-email-kan.liang@intel.com Signed-off-by: Kan Liang Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index e9a5d43..8d00039 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -98,6 +98,7 @@ static void perf_evlist__purge(struct perf_evlist *evlist) evlist__for_each_safe(evlist, n, pos) { list_del_init(&pos->node); + pos->evlist = NULL; perf_evsel__delete(pos); } @@ -125,6 +126,7 @@ void perf_evlist__delete(struct perf_evlist *evlist) void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry) { + entry->evlist = evlist; list_add_tail(&entry->node, &evlist->entries); entry->idx = evlist->nr_entries; entry->tracking = !entry->idx; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index b096ef7..bac25f4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -206,6 +206,7 @@ void perf_evsel__init(struct perf_evsel *evsel, evsel->leader = evsel; evsel->unit = ""; evsel->scale = 1.0; + evsel->evlist = NULL; INIT_LIST_HEAD(&evsel->node); INIT_LIST_HEAD(&evsel->config_terms); perf_evsel__object.init(evsel); @@ -1026,6 +1027,7 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads) void perf_evsel__exit(struct perf_evsel *evsel) { assert(list_empty(&evsel->node)); + assert(evsel->evlist == NULL); perf_evsel__free_fd(evsel); perf_evsel__free_id(evsel); perf_evsel__free_config_terms(evsel); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 93ac6b1..298e6bb 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -60,6 +60,9 @@ struct perf_evsel_config_term { /** struct perf_evsel - event selector * + * @evlist - evlist this evsel is in, if it is in one. + * @node - To insert it into evlist->entries or in other list_heads, say in + * the event parsing routines. * @name - Can be set to retain the original event name passed by the user, * so that when showing results in tools such as 'perf stat', we * show the name used, not some alias. @@ -73,6 +76,7 @@ struct perf_evsel_config_term { */ struct perf_evsel { struct list_head node; + struct perf_evlist *evlist; struct perf_event_attr attr; char *filter; struct xyarray *fd; -- cgit v0.10.2 From 0b5a7935f3b5b7d40293b02c3e552f3d67af208b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 15 Aug 2015 20:42:59 +0900 Subject: perf buildid: Introduce sysfs/filename__sprintf_build_id Introduce sysfs/filename__sprintf_build_id for consolidating similar code. 
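The intended usage then looks like this (a minimal sketch based on the signatures added below; "/bin/ls" is an arbitrary example path, and error handling is reduced to checking for a positive return, since the new functions return a negative value on failure):

	char sbuild_id[SBUILD_ID_SIZE];

	/* Format the build id of an ELF file into sbuild_id... */
	if (filename__sprintf_build_id("/bin/ls", sbuild_id) > 0)
		printf("%s\n", sbuild_id);

	/* ...or that of the running kernel, read from /sys/kernel/notes. */
	if (sysfs__sprintf_build_id(NULL, sbuild_id) > 0)
		printf("%s\n", sbuild_id);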
Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Borislav Petkov Cc: Hemant Kumar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150815114259.13642.34685.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index 65b4835..7b8450c 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -25,8 +25,6 @@ static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid) { char root_dir[PATH_MAX]; - char notes[PATH_MAX]; - u8 build_id[BUILD_ID_SIZE]; char *p; strlcpy(root_dir, proc_dir, sizeof(root_dir)); @@ -35,15 +33,7 @@ static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid) if (!p) return -1; *p = '\0'; - - scnprintf(notes, sizeof(notes), "%s/sys/kernel/notes", root_dir); - - if (sysfs__read_build_id(notes, build_id, sizeof(build_id))) - return -1; - - build_id__sprintf(build_id, sizeof(build_id), sbuildid); - - return 0; + return sysfs__sprintf_build_id(root_dir, sbuildid); } static int build_id_cache__kcore_dir(char *dir, size_t sz) @@ -138,7 +128,7 @@ static int build_id_cache__add_kcore(const char *filename, bool force) return -1; *p = '\0'; - if (build_id_cache__kcore_buildid(from_dir, sbuildid)) + if (build_id_cache__kcore_buildid(from_dir, sbuildid) < 0) return -1; scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s", diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index b5ca988..918b4de 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -19,29 +19,25 @@ static int sysfs__fprintf_build_id(FILE *fp) { - u8 kallsyms_build_id[BUILD_ID_SIZE]; char sbuild_id[SBUILD_ID_SIZE]; + int ret; - if (sysfs__read_build_id("/sys/kernel/notes", kallsyms_build_id, - sizeof(kallsyms_build_id)) != 0) - return -1; + ret = sysfs__sprintf_build_id("/", sbuild_id); + if (ret != sizeof(sbuild_id)) + return ret < 0 ? ret : -EINVAL; - build_id__sprintf(kallsyms_build_id, sizeof(kallsyms_build_id), - sbuild_id); - fprintf(fp, "%s\n", sbuild_id); - return 0; + return fprintf(fp, "%s\n", sbuild_id); } static int filename__fprintf_build_id(const char *name, FILE *fp) { - u8 build_id[BUILD_ID_SIZE]; char sbuild_id[SBUILD_ID_SIZE]; + int ret; - if (filename__read_build_id(name, build_id, - sizeof(build_id)) != sizeof(build_id)) - return 0; + ret = filename__sprintf_build_id(name, sbuild_id); + if (ret != sizeof(sbuild_id)) + return ret < 0 ? 
ret : -EINVAL; - build_id__sprintf(build_id, sizeof(build_id), sbuild_id); return fprintf(fp, "%s\n", sbuild_id); } @@ -63,7 +59,7 @@ static int perf_session__list_build_ids(bool force, bool with_hits) /* * See if this is an ELF file first: */ - if (filename__fprintf_build_id(input_name, stdout)) + if (filename__fprintf_build_id(input_name, stdout) > 0) goto out; session = perf_session__new(&file, false, &build_id__mark_dso_hit_ops); diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 4a2c2f0..d909459 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -93,6 +93,38 @@ int build_id__sprintf(const u8 *build_id, int len, char *bf) return raw - build_id; } +int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id) +{ + char notes[PATH_MAX]; + u8 build_id[BUILD_ID_SIZE]; + int ret; + + if (!root_dir) + root_dir = ""; + + scnprintf(notes, sizeof(notes), "%s/sys/kernel/notes", root_dir); + + ret = sysfs__read_build_id(notes, build_id, sizeof(build_id)); + if (ret < 0) + return ret; + + return build_id__sprintf(build_id, sizeof(build_id), sbuild_id); +} + +int filename__sprintf_build_id(const char *pathname, char *sbuild_id) +{ + u8 build_id[BUILD_ID_SIZE]; + int ret; + + ret = filename__read_build_id(pathname, build_id, sizeof(build_id)); + if (ret < 0) + return ret; + else if (ret != sizeof(build_id)) + return -EINVAL; + + return build_id__sprintf(build_id, sizeof(build_id), sbuild_id); +} + /* asnprintf consolidates asprintf and snprintf */ static int asnprintf(char **strp, size_t size, const char *fmt, ...) { diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index ce2f493..27a14a8 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -12,6 +12,9 @@ extern struct perf_tool build_id__mark_dso_hit_ops; struct dso; int build_id__sprintf(const u8 *build_id, int len, char *bf); +int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id); +int filename__sprintf_build_id(const char *pathname, char *sbuild_id); + char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size); int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event, -- cgit v0.10.2 From 9f44f0cc1c32f1542071447a9493652bbc03facb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Aug 2015 15:46:44 +0200 Subject: perf tools: Add tracing_path and remove unneeded functions There's no need for find_tracing_dir, because perf already searches for the debugfs/tracefs mount on startup and populates tracing_events_path. Add tracing_path to carry the tracing dir string, to be used in get_tracing_file instead of calling find_tracing_dir.
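Callers keep using the existing get_tracing_file()/put_tracing_file() pair, which now resolves names against tracing_path, e.g. (a minimal sketch; "available_events" is just one example of a file that lives under the tracing directory):

	char *file = get_tracing_file("available_events");

	if (file != NULL) {
		/* use the path, e.g. fopen(file, "r"), then release it */
		put_tracing_file(file);	/* frees the string allocated by asprintf() */
	}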
Signed-off-by: Jiri Olsa Cc: Raphael Beamonte Cc: David Ahern Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1440596813-12844-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index f7adf12..d33c341 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -34,6 +34,7 @@ bool test_attr__enabled; bool perf_host = true; bool perf_guest = false; +char tracing_path[PATH_MAX + 1] = "/sys/kernel/debug/tracing"; char tracing_events_path[PATH_MAX + 1] = "/sys/kernel/debug/tracing/events"; void event_attr_init(struct perf_event_attr *attr) @@ -391,6 +392,8 @@ void set_term_quiet_input(struct termios *old) static void set_tracing_events_path(const char *tracing, const char *mountpoint) { + snprintf(tracing_path, sizeof(tracing_path), "%s/%s", + mountpoint, tracing); snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s%s", mountpoint, tracing, "events"); } @@ -440,62 +443,11 @@ void perf_debugfs_set_path(const char *mntpt) set_tracing_events_path("tracing/", mntpt); } -static const char *find_tracefs(void) -{ - const char *path = __perf_tracefs_mount(NULL); - - return path; -} - -static const char *find_debugfs(void) -{ - const char *path = __perf_debugfs_mount(NULL); - - if (!path) - fprintf(stderr, "Your kernel does not support the debugfs filesystem"); - - return path; -} - -/* - * Finds the path to the debugfs/tracing - * Allocates the string and stores it. - */ -const char *find_tracing_dir(void) -{ - const char *tracing_dir = ""; - static char *tracing; - static int tracing_found; - const char *debugfs; - - if (tracing_found) - return tracing; - - debugfs = find_tracefs(); - if (!debugfs) { - tracing_dir = "/tracing"; - debugfs = find_debugfs(); - if (!debugfs) - return NULL; - } - - if (asprintf(&tracing, "%s%s", debugfs, tracing_dir) < 0) - return NULL; - - tracing_found = 1; - return tracing; -} - char *get_tracing_file(const char *name) { - const char *tracing; char *file; - tracing = find_tracing_dir(); - if (!tracing) - return NULL; - - if (asprintf(&file, "%s/%s", tracing, name) < 0) + if (asprintf(&file, "%s/%s", tracing_path, name) < 0) return NULL; return file; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 88a8915..291be1d 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -83,10 +83,10 @@ extern const char *graph_line; extern const char *graph_dotted_line; extern char buildid_dir[]; +extern char tracing_path[]; extern char tracing_events_path[]; extern void perf_debugfs_set_path(const char *mountpoint); const char *perf_debugfs_mount(const char *mountpoint); -const char *find_tracing_dir(void); char *get_tracing_file(const char *name); void put_tracing_file(char *file); -- cgit v0.10.2 From 9f30fffc78ca35c862f74f34cc597c7fdddc8793 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Aug 2015 15:46:45 +0200 Subject: perf tools: Do not change lib/api/fs/debugfs directly The tracing_events_path is the variable we want to change via --debugfs-dir option, not the debugfs_mountpoint. 
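For example, assuming a hypothetical debugfs mount point /mnt/debugfs (the 'dir:' line is what the handle_options() change in the perf.c hunk below would print, now reporting tracing_path rather than debugfs_mountpoint):

	$ perf --debugfs-dir=/mnt/debugfs trace ls
	dir: /mnt/debugfs/tracing/

debugfs_mountpoint itself, owned by lib/api/fs/debugfs, is no longer touched.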
Signed-off-by: Jiri Olsa Reviewed-by: Matt Fleming Cc: Raphael Beamonte Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1440596813-12844-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/perf.c b/tools/perf/perf.c index b857fcb..07dbff5 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -231,7 +231,7 @@ static int handle_options(const char ***argv, int *argc, int *envchanged) (*argc)--; } else if (!prefixcmp(cmd, CMD_DEBUGFS_DIR)) { perf_debugfs_set_path(cmd + strlen(CMD_DEBUGFS_DIR)); - fprintf(stderr, "dir: %s\n", debugfs_mountpoint); + fprintf(stderr, "dir: %s\n", tracing_path); if (envchanged) *envchanged = 1; } else if (!strcmp(cmd, "--list-cmds")) { diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index d33c341..7acafb3 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -439,7 +439,6 @@ const char *perf_debugfs_mount(const char *mountpoint) void perf_debugfs_set_path(const char *mntpt) { - snprintf(debugfs_mountpoint, strlen(debugfs_mountpoint), "%s", mntpt); set_tracing_events_path("tracing/", mntpt); } -- cgit v0.10.2 From ce80d3bef9ff97638ca57a5659ef6ad356f35047 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 28 Aug 2015 05:48:04 -0400 Subject: perf tools: Rename perf_session_env to perf_env As it is not necessarily tied to a perf.data file and needs using in places where a perf_session is not required. Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Andi Kleen Cc: Andy Lutomirski Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1440755289-30939-4-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c index b7bb42c..b00dfd92 100644 --- a/tools/perf/arch/common.c +++ b/tools/perf/arch/common.c @@ -128,7 +128,7 @@ static const char *normalize_arch(char *arch) return arch; } -static int perf_session_env__lookup_binutils_path(struct perf_session_env *env, +static int perf_session_env__lookup_binutils_path(struct perf_env *env, const char *name, const char **path) { @@ -206,7 +206,7 @@ out_error: return -1; } -int perf_session_env__lookup_objdump(struct perf_session_env *env) +int perf_session_env__lookup_objdump(struct perf_env *env) { /* * For live mode, env->arch will be NULL and we can use diff --git a/tools/perf/arch/common.h b/tools/perf/arch/common.h index ede246e..20176df 100644 --- a/tools/perf/arch/common.h +++ b/tools/perf/arch/common.h @@ -5,6 +5,6 @@ extern const char *objdump_path; -int perf_session_env__lookup_objdump(struct perf_session_env *env); +int perf_session_env__lookup_objdump(struct perf_env *env); #endif /* ARCH_PERF_COMMON_H */ diff --git a/tools/perf/ui/browser.h b/tools/perf/ui/browser.h index 52be871..f3cef56 100644 --- a/tools/perf/ui/browser.h +++ b/tools/perf/ui/browser.h @@ -61,8 +61,8 @@ int ui_browser__help_window(struct ui_browser *browser, const char *text); bool ui_browser__dialog_yesno(struct ui_browser *browser, const char *text); int ui_browser__input_window(const char *title, const char *text, char *input, const char *exit_msg, int delay_sec); -struct perf_session_env; -int tui__header_window(struct perf_session_env *env); +struct perf_env; +int tui__header_window(struct perf_env *env); void ui_browser__argv_seek(struct ui_browser *browser, off_t offset, int whence); unsigned int ui_browser__argv_refresh(struct ui_browser *browser); 
diff --git a/tools/perf/ui/browsers/header.c b/tools/perf/ui/browsers/header.c index f106817..edbeaaf 100644 --- a/tools/perf/ui/browsers/header.c +++ b/tools/perf/ui/browsers/header.c @@ -91,7 +91,7 @@ static int ui__list_menu(int argc, char * const argv[]) return list_menu__run(&menu); } -int tui__header_window(struct perf_session_env *env) +int tui__header_window(struct perf_env *env) { int i, argc = 0; char **argv; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 10c7ec0..cf86f2d 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -26,7 +26,7 @@ struct hist_browser { struct map_symbol *selection; struct hist_browser_timer *hbt; struct pstack *pstack; - struct perf_session_env *env; + struct perf_env *env; int print_seq; bool show_dso; bool show_headers; @@ -1214,7 +1214,7 @@ static int hist_browser__dump(struct hist_browser *browser) static struct hist_browser *hist_browser__new(struct hists *hists, struct hist_browser_timer *hbt, - struct perf_session_env *env) + struct perf_env *env) { struct hist_browser *browser = zalloc(sizeof(*browser)); @@ -1695,7 +1695,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, bool left_exits, struct hist_browser_timer *hbt, float min_pcnt, - struct perf_session_env *env) + struct perf_env *env) { struct hists *hists = evsel__hists(evsel); struct hist_browser *browser = hist_browser__new(hists, hbt, env); @@ -2016,7 +2016,7 @@ struct perf_evsel_menu { struct perf_evsel *selection; bool lost_events, lost_events_warned; float min_pcnt; - struct perf_session_env *env; + struct perf_env *env; }; static void perf_evsel_menu__write(struct ui_browser *browser, @@ -2169,7 +2169,7 @@ static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist, int nr_entries, const char *help, struct hist_browser_timer *hbt, float min_pcnt, - struct perf_session_env *env) + struct perf_env *env) { struct perf_evsel *pos; struct perf_evsel_menu menu = { @@ -2202,7 +2202,7 @@ static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist, int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help, struct hist_browser_timer *hbt, float min_pcnt, - struct perf_session_env *env) + struct perf_env *env) { int nr_entries = evlist->nr_entries; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 9b53b65..396e496 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -66,7 +66,7 @@ struct perf_header; int perf_file_header__read(struct perf_file_header *header, struct perf_header *ph, int fd); -struct perf_session_env { +struct perf_env { char *hostname; char *os_release; char *version; @@ -98,7 +98,7 @@ struct perf_header { u64 data_size; u64 feat_offset; DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS); - struct perf_session_env env; + struct perf_env env; }; struct perf_evlist; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index bc528d5..de6d58e 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -313,7 +313,7 @@ int hist_entry__tui_annotate(struct hist_entry *he, struct perf_evsel *evsel, int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help, struct hist_browser_timer *hbt, float min_pcnt, - struct perf_session_env *env); + struct perf_env *env); int script_browse(const char *script_opt); #else static inline @@ -321,7 +321,7 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __maybe_unused, const char *help __maybe_unused, struct hist_browser_timer 
*hbt __maybe_unused, float min_pcnt __maybe_unused, - struct perf_session_env *env __maybe_unused) + struct perf_env *env __maybe_unused) { return 0; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 18722e7..8a4537e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -170,7 +170,7 @@ static void perf_session__delete_threads(struct perf_session *session) machine__delete_threads(&session->machines.host); } -static void perf_session_env__exit(struct perf_session_env *env) +static void perf_session_env__exit(struct perf_env *env) { zfree(&env->hostname); zfree(&env->os_release); diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 46ae053..1f97ffb 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1853,7 +1853,7 @@ static void vmlinux_path__exit(void) zfree(&vmlinux_path); } -static int vmlinux_path__init(struct perf_session_env *env) +static int vmlinux_path__init(struct perf_env *env) { struct utsname uts; char bf[PATH_MAX]; @@ -1964,7 +1964,7 @@ static bool symbol__read_kptr_restrict(void) return value; } -int symbol__init(struct perf_session_env *env) +int symbol__init(struct perf_env *env) { const char *symfs; diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index a4cde92..440ba8a 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -252,8 +252,8 @@ int modules__parse(const char *filename, void *arg, int filename__read_debuglink(const char *filename, char *debuglink, size_t size); -struct perf_session_env; -int symbol__init(struct perf_session_env *env); +struct perf_env; +int symbol__init(struct perf_env *env); void symbol__exit(void); void symbol__elf_init(void); struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name); -- cgit v0.10.2 From 2c07144dfce366e21465cc7b0ada9f0b6dc7b7ed Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 28 Aug 2015 05:48:05 -0400 Subject: perf evlist: Add backpointer for perf_env to evlist Add a backpointer to perf_env in the evlist, so we can easily access the env when processing something where we have an evsel or evlist. Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Andi Kleen Cc: Andy Lutomirski Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1440755289-30939-5-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 436e358..b39a619 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -56,6 +56,7 @@ struct perf_evlist { struct cpu_map *cpus; struct perf_evsel *selected; struct events_stats stats; + struct perf_env *env; }; struct perf_evsel_str_handler { diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 179b2bd..4181454 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2514,6 +2514,7 @@ int perf_session__read_header(struct perf_session *session) if (session->evlist == NULL) return -ENOMEM; + session->evlist->env = &header->env; if (perf_data_file__is_pipe(file)) return perf_header__read_pipe(session); -- cgit v0.10.2
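Combined with the evsel->evlist backpointer added earlier in this series, code that holds only an evsel can now, in principle, walk all the way to the perf_env. A sketch (hypothetical helper, assuming the evsel sits in an evlist whose env was set up as in the header.c hunk above):

static struct perf_env *perf_evsel__env(struct perf_evsel *evsel)
{
	/* Both links stay NULL until the respective setup code runs. */
	if (evsel && evsel->evlist)
		return evsel->evlist->env;
	return NULL;
}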