From f4a2a0d9a4226846693b5b4462d4350c1bfd58ea Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 27 Oct 2008 02:05:25 +0100 Subject: ftrace: add a script to produce a hierarchical view of a function trace This script parses a function trace and then produces a hierarchical view of the function call stack after processing it into a tree. Changes on V2 thanks to the trace sent by Steven: - Support both the files "trace" and "trace_pipe" (comments and space differences) - Correct the mini HOW-TO at the beginning. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/scripts/tracing/draw_functrace.py b/scripts/tracing/draw_functrace.py new file mode 100644 index 0000000..902f9a9 --- /dev/null +++ b/scripts/tracing/draw_functrace.py @@ -0,0 +1,130 @@ +#!/usr/bin/python + +""" +Copyright 2008 (c) Frederic Weisbecker +Licensed under the terms of the GNU GPL License version 2 + +This script parses a trace provided by the function tracer in +kernel/trace/trace_functions.c +The resulted trace is processed into a tree to produce a more human +view of the call stack by drawing textual but hierarchical tree of +calls. Only the functions's names and the the call time are provided. + +Usage: + Be sure that you have CONFIG_FUNCTION_TRACER + # mkdir /debugfs + # mount -t debug debug /debug + # echo function > /debug/tracing/current_tracer + $ cat /debug/tracing/trace_pipe > ~/raw_trace_func + Wait some times but not too much, the script is a bit slow. + Break the pipe (Ctrl + Z) + $ scripts/draw_functrace.py < raw_trace_func > draw_functrace + Then you have your drawn trace in draw_functrace +""" + + +import sys, re + +class CallTree: + """ This class provides a tree representation of the functions + call stack. If a function has no parent in the kernel (interrupt, + syscall, kernel thread...) then it is attached to a virtual parent + called ROOT. + """ + ROOT = None + + def __init__(self, func, time = None, parent = None): + self._func = func + self._time = time + if parent is None: + self._parent = CallTree.ROOT + else: + self._parent = parent + self._children = [] + + def calls(self, func, calltime): + """ If a function calls another one, call this method to insert it + into the tree at the appropriate place. + @return: A reference to the newly created child node. + """ + child = CallTree(func, calltime, self) + self._children.append(child) + return child + + def getParent(self, func): + """ Retrieve the last parent of the current node that + has the name given by func. If this function is not + on a parent, then create it as new child of root + @return: A reference to the parent. + """ + tree = self + while tree != CallTree.ROOT and tree._func != func: + tree = tree._parent + if tree == CallTree.ROOT: + child = CallTree.ROOT.calls(func, None) + return child + return tree + + def __repr__(self): + return self.__toString("", True) + + def __toString(self, branch, lastChild): + if self._time is not None: + s = "%s----%s (%s)\n" % (branch, self._func, self._time) + else: + s = "%s----%s\n" % (branch, self._func) + + i = 0 + if lastChild: + branch = branch[:-1] + " " + while i < len(self._children): + if i != len(self._children) - 1: + s += "%s" % self._children[i].__toString(branch +\ + " |", False) + else: + s += "%s" % self._children[i].__toString(branch +\ + " |", True) + i += 1 + return s + +class BrokenLineException(Exception): + """If the last line is not complete because of the pipe breakage, + we want to stop the processing and ignore this line. + """ + pass + +class CommentLineException(Exception): + """ If the line is a comment (as in the beginning of the trace file), + just ignore it. + """ + pass + + +def parseLine(line): + line = line.strip() + if line.startswith("#"): + raise CommentLineException + m = re.match("[^]]+?\\] +([0-9.]+): (\\w+) <-(\\w+)", line) + if m is None: + raise BrokenLineException + return (m.group(1), m.group(2), m.group(3)) + + +def main(): + CallTree.ROOT = CallTree("Root (Nowhere)", None, None) + tree = CallTree.ROOT + + for line in sys.stdin: + try: + calltime, callee, caller = parseLine(line) + except BrokenLineException: + break + except CommentLineException: + continue + tree = tree.getParent(caller) + tree = tree.calls(callee, calltime) + + print CallTree.ROOT + +if __name__ == "__main__": + main() -- cgit v0.10.2 From 0eec481e8fb000a209fda9bf8f466aca87dc1150 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Oct 2008 14:56:37 +0800 Subject: markers: simplify marker_set_format() current marker_set_format() is complex this patch simplify it, and decrease the overhead of marker_update_probes(). Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/kernel/marker.c b/kernel/marker.c index e9c6b2b..75a1a17 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -64,6 +64,7 @@ struct marker_entry { void *oldptr; int rcu_pending; unsigned char ptype:1; + unsigned char format_allocated:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format) e->single.probe_private = NULL; e->multi = NULL; e->ptype = 0; + e->format_allocated = 0; e->refcount = 0; e->rcu_pending = 0; hlist_add_head(&e->hlist, head); @@ -447,6 +449,8 @@ static int remove_marker(const char *name) if (e->single.func != __mark_empty_function) return -EBUSY; hlist_del(&e->hlist); + if (e->format_allocated) + kfree(e->format); /* Make sure the call_rcu has been executed */ if (e->rcu_pending) rcu_barrier_sched(); @@ -457,57 +461,34 @@ static int remove_marker(const char *name) /* * Set the mark_entry format to the format found in the element. */ -static int marker_set_format(struct marker_entry **entry, const char *format) +static int marker_set_format(struct marker_entry *entry, const char *format) { - struct marker_entry *e; - size_t name_len = strlen((*entry)->name) + 1; - size_t format_len = strlen(format) + 1; - - - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) + entry->format = kstrdup(format, GFP_KERNEL); + if (!entry->format) return -ENOMEM; - memcpy(&e->name[0], (*entry)->name, name_len); - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - e->single = (*entry)->single; - e->multi = (*entry)->multi; - e->ptype = (*entry)->ptype; - e->refcount = (*entry)->refcount; - e->rcu_pending = 0; - hlist_add_before(&e->hlist, &(*entry)->hlist); - hlist_del(&(*entry)->hlist); - /* Make sure the call_rcu has been executed */ - if ((*entry)->rcu_pending) - rcu_barrier_sched(); - kfree(*entry); - *entry = e; + entry->format_allocated = 1; + trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); + entry->name, entry->format); return 0; } /* * Sets the probe callback corresponding to one marker. */ -static int set_marker(struct marker_entry **entry, struct marker *elem, +static int set_marker(struct marker_entry *entry, struct marker *elem, int active) { int ret; - WARN_ON(strcmp((*entry)->name, elem->name) != 0); + WARN_ON(strcmp(entry->name, elem->name) != 0); - if ((*entry)->format) { - if (strcmp((*entry)->format, elem->format) != 0) { + if (entry->format) { + if (strcmp(entry->format, elem->format) != 0) { printk(KERN_NOTICE "Format mismatch for probe %s " "(%s), marker (%s)\n", - (*entry)->name, - (*entry)->format, + entry->name, + entry->format, elem->format); return -EPERM; } @@ -523,34 +504,33 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, * pass from a "safe" callback (with argument) to an "unsafe" * callback (does not set arguments). */ - elem->call = (*entry)->call; + elem->call = entry->call; /* * Sanity check : * We only update the single probe private data when the ptr is * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) */ WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private - != (*entry)->single.probe_private && - !elem->ptype); - elem->single.probe_private = (*entry)->single.probe_private; + && elem->single.probe_private != entry->single.probe_private + && !elem->ptype); + elem->single.probe_private = entry->single.probe_private; /* * Make sure the private data is valid when we update the * single probe ptr. */ smp_wmb(); - elem->single.func = (*entry)->single.func; + elem->single.func = entry->single.func; /* * We also make sure that the new probe callbacks array is consistent * before setting a pointer to it. */ - rcu_assign_pointer(elem->multi, (*entry)->multi); + rcu_assign_pointer(elem->multi, entry->multi); /* * Update the function or multi probe array pointer before setting the * ptype. */ smp_wmb(); - elem->ptype = (*entry)->ptype; + elem->ptype = entry->ptype; elem->state = active; return 0; @@ -594,8 +574,7 @@ void marker_update_probe_range(struct marker *begin, for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); if (mark_entry) { - set_marker(&mark_entry, iter, - !!mark_entry->refcount); + set_marker(mark_entry, iter, !!mark_entry->refcount); /* * ignore error, continue */ @@ -657,7 +636,7 @@ int marker_probe_register(const char *name, const char *format, ret = PTR_ERR(entry); } else if (format) { if (!entry->format) - ret = marker_set_format(&entry, format); + ret = marker_set_format(entry, format); else if (strcmp(entry->format, format)) ret = -EPERM; } -- cgit v0.10.2 From 505e371da195fad20cb8aaf45407a2849774d6d0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Oct 2008 14:56:42 +0800 Subject: markers: remove exported symbol marker_probe_cb_noarg() marker_probe_cb_noarg() should not be seen by outer code. this patch remove it. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/include/linux/marker.h b/include/linux/marker.h index 889196c..4cf4547 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -136,8 +136,6 @@ extern marker_probe_func __mark_empty_function; extern void marker_probe_cb(const struct marker *mdata, void *call_private, ...); -extern void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, ...); /* * Connect a probe to a marker. diff --git a/kernel/marker.c b/kernel/marker.c index 75a1a17..2319264 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) +static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) { va_list args; /* not initialized */ char ptype; @@ -198,7 +198,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) } rcu_read_unlock_sched(); } -EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); static void free_old_closure(struct rcu_head *head) { -- cgit v0.10.2 From 4de62748e69c31fc4fd5bc43b73e9cf60a17ec53 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Oct 2008 14:56:47 +0800 Subject: markers: let marker_table be close to its comments marker_table is defined far from its comments, this fix make cleanup for it. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/kernel/marker.c b/kernel/marker.c index 2319264..0f2a944 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex); */ #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +static struct hlist_head marker_table[MARKER_TABLE_SIZE]; /* * Note about RCU : @@ -68,8 +69,6 @@ struct marker_entry { char name[0]; /* Contains name'\0'format'\0' */ }; -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - /** * __mark_empty_function - Empty probe callback * @probe_private: probe private data -- cgit v0.10.2 From 5d9881ea1440f046ee851bbaa2a2962543336a11 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 22 Oct 2008 11:38:01 +0800 Subject: markers: break the redundant loop in kernel/marker.c Impact: cleanup, no functionality changed Because e->name is unique in list, we don't need to continue the iteration after matched. Signed-off-by: Zhao Lei Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/kernel/marker.c b/kernel/marker.c index 0f2a944..2898b64 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -825,8 +825,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, if (!e->ptype) { if (num == 0 && e->single.func == probe) return e->single.probe_private; - else - break; } else { struct marker_probe_closure *closure; int match = 0; @@ -838,6 +836,7 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, return closure[i].probe_private; } } + break; } } return ERR_PTR(-ENOENT); -- cgit v0.10.2 From fd3fdf11d3c649769e02459c5f1b8081a15e9007 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Fri, 24 Oct 2008 20:08:11 +0300 Subject: trace: add the MMIO-tracer to the tracer menu, cleanup Impact: cleanup We can remove MMIOTRACE_HOOKS and replace it with just MMIOTRACE. MMIOTRACE_HOOKS is a remnant from the time when I thought that something else could also use the kmmio facilities. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd..fa013f5 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -186,14 +186,10 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE_HOOKS - bool - config MMIOTRACE bool "Memory mapped IO tracing" depends on DEBUG_KERNEL && PCI select TRACING - select MMIOTRACE_HOOKS help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 59f89b4..0a21b7a 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-y := pf_in.o mmio-mod.o +mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa_$(BITS).o diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 31e8730..4152d3c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -53,7 +53,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE_HOOKS +#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; -- cgit v0.10.2 From 944ac4259e39801c843a915c3da8194ac9af0440 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 19:26:08 -0400 Subject: ftrace: ftrace dump on oops control Impact: add (default-off) dump-trace-on-oops flag Currently, ftrace is set up to dump its contents to the console if the kernel panics or oops. This can be annoying if you have trace data in the buffers and you experience an oops, but the trace data is old or static. Usually when you want ftrace to dump its contents is when you are debugging your system and you have set up ftrace to trace the events leading to an oops. This patch adds a control variable called "ftrace_dump_on_oops" that will enable the ftrace dump to console on oops. This variable is default off but a developer can enable it either through the kernel command line by adding "ftrace_dump_on_oops" or at run time by setting (or disabling) /proc/sys/kernel/ftrace_dump_on_oops. v2: Replaced /** with /* as Randy explained that kernel-doc does not yet handle variables. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a3d4615..9623b7b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -165,6 +165,8 @@ static inline void __ftrace_enabled_restore(int enabled) #endif #ifdef CONFIG_TRACING +extern int ftrace_dump_on_oops; + extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a13bd4d..84754f5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -484,6 +484,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif +#ifdef CONFIG_TRACING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_dump_on_opps", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_MODULES { .ctl_name = KERN_MODPROBE, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d345d64..47f46cb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -63,6 +63,28 @@ static cpumask_t __read_mostly tracing_buffer_mask; static int tracing_disabled = 1; +/* + * ftrace_dump_on_oops - variable to dump ftrace buffer on oops + * + * If there is an oops (or kernel panic) and the ftrace_dump_on_oops + * is set, then ftrace_dump is called. This will output the contents + * of the ftrace buffers to the console. This is very useful for + * capturing traces that lead to crashes and outputing it to a + * serial console. + * + * It is default off, but you can enable it with either specifying + * "ftrace_dump_on_oops" in the kernel command line, or setting + * /proc/sys/kernel/ftrace_dump_on_oops to true. + */ +int ftrace_dump_on_oops; + +static int __init set_ftrace_dump_on_oops(char *str) +{ + ftrace_dump_on_oops = 1; + return 1; +} +__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); + long ns2usecs(cycle_t nsec) { @@ -3021,7 +3043,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk); static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); return NOTIFY_OK; } @@ -3037,7 +3060,8 @@ static int trace_die_handler(struct notifier_block *self, { switch (val) { case DIE_OOPS: - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); break; default: break; @@ -3078,7 +3102,6 @@ trace_printk_seq(struct trace_seq *s) trace_seq_reset(s); } - void ftrace_dump(void) { static DEFINE_SPINLOCK(ftrace_dump_lock); -- cgit v0.10.2 From 17666f02b118099028522dfc3df00a235700e216 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Oct 2008 16:08:32 -0400 Subject: ftrace: nmi safe code modification Impact: fix crashes that can occur in NMI handlers, if their code is modified Modifying code is something that needs special care. On SMP boxes, if code that is being modified is also being executed on another CPU, that CPU will have undefined results. The dynamic ftrace uses kstop_machine to make the system act like a uniprocessor system. But this does not address NMIs, that can still run on other CPUs. One approach to handle this is to make all code that are used by NMIs not be traced. But NMIs can call notifiers that spread throughout the kernel and this will be very hard to maintain, and the chance of missing a function is very high. The approach that this patch takes is to have the NMIs modify the code if the modification is taking place. The way this works is that just writing to code executing on another CPU is not harmful if what is written is the same as what exists. Two buffers are used: an IP buffer and a "code" buffer. The steps that the patcher takes are: 1) Put in the instruction pointer into the IP buffer and the new code into the "code" buffer. 2) Set a flag that says we are modifying code 3) Wait for any running NMIs to finish. 4) Write the code 5) clear the flag. 6) Wait for any running NMIs to finish. If an NMI is executed, it will also write the pending code. Multiple writes are OK, because what is being written is the same. Then the patcher must wait for all running NMIs to finish before going to the next line that must be patched. This is basically the RCU approach to code modification. Thanks to Ingo Molnar for suggesting the idea, and to Arjan van de Ven for his guidence on what is safe and what is not. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 39c8bc1..d4c24a7 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -1,6 +1,11 @@ #ifndef _ASM_ARM_FTRACE #define _ASM_ARM_FTRACE +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index b298f7a..7652755 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -1,6 +1,11 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index 3aed362..cdf2cb0 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -2,6 +2,11 @@ #define __ASM_SH_FTRACE_H #ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif + +#ifndef __ASSEMBLY__ extern void mcount(void); #endif diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index d27716c..33a95fe 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -1,6 +1,11 @@ #ifndef _ASM_SPARC64_FTRACE #define _ASM_SPARC64_FTRACE +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif + #ifdef CONFIG_MCOUNT #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9e8bc29..f2ed6b7 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,6 +17,21 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_nmi_enter(void); +extern void ftrace_nmi_exit(void); +#else +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif +#endif + +#else /* CONFIG_FUNCTION_TRACER */ + +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 50ea0ac..fe5f859 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -56,6 +56,111 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } +/* + * Modifying code must take extra care. On an SMP machine, if + * the code being modified is also being executed on another CPU + * that CPU will have undefined results and possibly take a GPF. + * We use kstop_machine to stop other CPUS from exectuing code. + * But this does not stop NMIs from happening. We still need + * to protect against that. We separate out the modification of + * the code to take care of this. + * + * Two buffers are added: An IP buffer and a "code" buffer. + * + * 1) Put in the instruction pointer into the IP buffer + * and the new code into the "code" buffer. + * 2) Set a flag that says we are modifying code + * 3) Wait for any running NMIs to finish. + * 4) Write the code + * 5) clear the flag. + * 6) Wait for any running NMIs to finish. + * + * If an NMI is executed, the first thing it does is to call + * "ftrace_nmi_enter". This will check if the flag is set to write + * and if it is, it will write what is in the IP and "code" buffers. + * + * The trick is, it does not matter if everyone is writing the same + * content to the code location. Also, if a CPU is executing code + * it is OK to write to that code location if the contents being written + * are the same as what exists. + */ + +static atomic_t in_nmi; +static int mod_code_status; +static int mod_code_write; +static void *mod_code_ip; +static void *mod_code_newcode; + +static void ftrace_mod_code(void) +{ + /* + * Yes, more than one CPU process can be writing to mod_code_status. + * (and the code itself) + * But if one were to fail, then they all should, and if one were + * to succeed, then they all should. + */ + mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, + MCOUNT_INSN_SIZE); + +} + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); + /* Must have in_nmi seen before reading write flag */ + smp_mb(); + if (mod_code_write) + ftrace_mod_code(); +} + +void ftrace_nmi_exit(void) +{ + /* Finish all executions before clearing in_nmi */ + smp_wmb(); + atomic_dec(&in_nmi); +} + +static void wait_for_nmi(void) +{ + while (atomic_read(&in_nmi)) + cpu_relax(); +} + +static int +do_ftrace_mod_code(unsigned long ip, void *new_code) +{ + mod_code_ip = (void *)ip; + mod_code_newcode = new_code; + + /* The buffers need to be visible before we let NMIs write them */ + smp_wmb(); + + mod_code_write = 1; + + /* Make sure write bit is visible before we wait on NMIs */ + smp_mb(); + + wait_for_nmi(); + + /* Make sure all running NMIs have finished before we write the code */ + smp_mb(); + + ftrace_mod_code(); + + /* Make sure the write happens before clearing the bit */ + smp_wmb(); + + mod_code_write = 0; + + /* make sure NMIs see the cleared bit */ + smp_mb(); + + wait_for_nmi(); + + return mod_code_status; +} + + int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) @@ -81,7 +186,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return -EINVAL; /* replace the text with the new text */ - if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) + if (do_ftrace_mod_code(ip, new_code)) return -EPERM; sync_core(); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 181006c..0087cb4 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -5,6 +5,7 @@ #include #include #include +#include #include /* @@ -161,7 +162,17 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0) -#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) +#define nmi_enter() \ + do { \ + ftrace_nmi_enter(); \ + lockdep_off(); \ + __irq_enter(); \ + } while (0) +#define nmi_exit() \ + do { \ + __irq_exit(); \ + lockdep_on(); \ + ftrace_nmi_exit(); \ + } while (0) #endif /* LINUX_HARDIRQ_H */ -- cgit v0.10.2 From b807c3d0f8e39ed7cbbbe6da162650e305e8de15 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Oct 2008 16:08:33 -0400 Subject: ftrace: nmi update statistics Impact: add more debug info to /debugfs/tracing/dyn_ftrace_total_info This patch adds dynamic ftrace NMI update statistics to the /debugfs/tracing/dyn_ftrace_total_info stat file. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fe5f859..6685b0f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -91,6 +91,19 @@ static int mod_code_write; static void *mod_code_ip; static void *mod_code_newcode; +static int nmi_wait_count; +static atomic_t nmi_update_count; + +int ftrace_arch_read_dyn_info(char *buf, int size) +{ + int r; + + r = snprintf(buf, size, "%u %u", + nmi_wait_count, + atomic_read(&nmi_update_count)); + return r; +} + static void ftrace_mod_code(void) { /* @@ -109,8 +122,10 @@ void ftrace_nmi_enter(void) atomic_inc(&in_nmi); /* Must have in_nmi seen before reading write flag */ smp_mb(); - if (mod_code_write) + if (mod_code_write) { ftrace_mod_code(); + atomic_inc(&nmi_update_count); + } } void ftrace_nmi_exit(void) @@ -122,8 +137,15 @@ void ftrace_nmi_exit(void) static void wait_for_nmi(void) { - while (atomic_read(&in_nmi)) + int waited = 0; + + while (atomic_read(&in_nmi)) { + waited = 1; cpu_relax(); + } + + if (waited) + nmi_wait_count++; } static int diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a610ca7..bc36feb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2815,22 +2815,39 @@ static struct file_operations tracing_mark_fops = { #ifdef CONFIG_DYNAMIC_FTRACE +#define DYN_INFO_BUF_SIZE 1023 +static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1]; +static DEFINE_MUTEX(dyn_info_mutex); + +int __weak ftrace_arch_read_dyn_info(char *buf, int size) +{ + return 0; +} + static ssize_t -tracing_read_long(struct file *filp, char __user *ubuf, +tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *p = filp->private_data; - char buf[64]; + char *buf = ftrace_dyn_info_buffer; int r; - r = sprintf(buf, "%ld\n", *p); + mutex_lock(&dyn_info_mutex); + r = sprintf(buf, "%ld ", *p); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r); + buf[r++] = '\n'; + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + mutex_unlock(&dyn_info_mutex); + + return r; } -static struct file_operations tracing_read_long_fops = { +static struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, - .read = tracing_read_long, + .read = tracing_read_dyn_info, }; #endif @@ -2939,7 +2956,7 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, - &tracing_read_long_fops); + &tracing_dyn_info_fops); if (!entry) pr_warning("Could not create debugfs " "'dyn_ftrace_total_info' entry\n"); -- cgit v0.10.2 From b3acf29afda06c76774dc6df6246c37ae707836b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Oct 2008 15:30:26 -0400 Subject: ftrace, kbuild: condense recordmcount.pl parameter code Impact: cleanup Sam Ravnborg pointed out that I could condense the code for the parameters of recordmcount.pl by using an $(if ...) condition. Signed-off-by: Steven Rostedt Acked-by: Sam Ravnborg Signed-off-by: Ingo Molnar diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 468fbc9..7a17677 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -198,16 +198,10 @@ cmd_modversions = \ fi; endif -ifdef CONFIG_64BIT -arch_bits = 64 -else -arch_bits = 32 -endif - ifdef CONFIG_FTRACE_MCOUNT_RECORD -cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl \ - "$(ARCH)" "$(arch_bits)" "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" \ - "$(NM)" "$(RM)" "$(MV)" "$(@)"; +cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ + "$(if $(CONFIG_64BIT),64,32)" \ + "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" "$(@)"; endif define rule_cc_o_c -- cgit v0.10.2 From a26a2a27396c0a0877aa701f8f92d08ba550a6c9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 00:03:22 -0400 Subject: ftrace: nmi safe code clean ups Impact: cleanup This patch cleans up the NMI safe code for dynamic ftrace as suggested by Andrew Morton. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index d4c24a7..3f3a1d1 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define _ASM_ARM_FTRACE #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 7652755..1cd7270 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define _ASM_POWERPC_FTRACE #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index cdf2cb0..31ada03 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define __ASM_SH_FTRACE_H #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifndef __ASSEMBLY__ diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 33a95fe..62055ac 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define _ASM_SPARC64_FTRACE #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifdef CONFIG_MCOUNT diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f2ed6b7..a234681 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -22,16 +22,16 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) -#endif +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif +#endif /* __ASSEMBLY__ */ #else /* CONFIG_FUNCTION_TRACER */ #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 6685b0f..6914933 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -67,7 +67,7 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * * Two buffers are added: An IP buffer and a "code" buffer. * - * 1) Put in the instruction pointer into the IP buffer + * 1) Put the instruction pointer into the IP buffer * and the new code into the "code" buffer. * 2) Set a flag that says we are modifying code * 3) Wait for any running NMIs to finish. @@ -85,14 +85,14 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * are the same as what exists. */ -static atomic_t in_nmi; -static int mod_code_status; -static int mod_code_write; -static void *mod_code_ip; -static void *mod_code_newcode; +static atomic_t in_nmi = ATOMIC_INIT(0); +static int mod_code_status; /* holds return value of text write */ +static int mod_code_write; /* set when NMI should do the write */ +static void *mod_code_ip; /* holds the IP to write to */ +static void *mod_code_newcode; /* holds the text to write to the IP */ -static int nmi_wait_count; -static atomic_t nmi_update_count; +static unsigned nmi_wait_count; +static atomic_t nmi_update_count = ATOMIC_INIT(0); int ftrace_arch_read_dyn_info(char *buf, int size) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 703eb53..22240df 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -74,6 +74,9 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +/* May be defined in arch */ +extern int ftrace_arch_read_dyn_info(char *buf, int size); + /** * ftrace_modify_code - modify code segment * @ip: the address of the code segment diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc36feb..7f86067 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2815,10 +2815,6 @@ static struct file_operations tracing_mark_fops = { #ifdef CONFIG_DYNAMIC_FTRACE -#define DYN_INFO_BUF_SIZE 1023 -static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1]; -static DEFINE_MUTEX(dyn_info_mutex); - int __weak ftrace_arch_read_dyn_info(char *buf, int size) { return 0; @@ -2828,14 +2824,17 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + static char ftrace_dyn_info_buffer[1024]; + static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; char *buf = ftrace_dyn_info_buffer; + int size = ARRAY_SIZE(ftrace_dyn_info_buffer); int r; mutex_lock(&dyn_info_mutex); r = sprintf(buf, "%ld ", *p); - r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r); + r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); buf[r++] = '\n'; r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -- cgit v0.10.2 From d9e540762f5cdd89f24e518ad1fd31142d0b9726 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 1 Nov 2008 19:57:37 +0100 Subject: ftrace: ftrace_dump_on_oops=[tracer] Impact: add new (optional) debug boot option In order to facilitate early boot trouble, allow one to specify a tracer on the kernel boot line. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1bbcaa8..4862284 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -765,6 +765,14 @@ and is between 256 and 4096 characters. It is defined in the file parameter will force ia64_sal_cache_flush to call ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. + ftrace=[tracer] + [ftrace] will set and start the specified tracer + as early as possible in order to facilitate early + boot debugging. + + ftrace_dump_on_oops + [ftrace] will dump the trace buffers on oops. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdb1df0..482583e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -79,6 +79,15 @@ static int tracing_disabled = 1; */ int ftrace_dump_on_oops; +static int tracing_set_tracer(char *buf); + +static int __init set_ftrace(char *str) +{ + tracing_set_tracer(str); + return 1; +} +__setup("ftrace", set_ftrace); + static int __init set_ftrace_dump_on_oops(char *str) { ftrace_dump_on_oops = 1; @@ -2394,29 +2403,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_set_trace_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_set_tracer(char *buf) { struct trace_array *tr = &global_trace; struct tracer *t; - char buf[max_tracer_type_len+1]; - int i; - size_t ret; - - ret = cnt; - - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + int ret = 0; mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { @@ -2440,6 +2431,33 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&trace_types_lock); + return ret; +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+1]; + int i; + size_t ret; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + ret = tracing_set_tracer(buf); + if (!ret) + ret = cnt; + if (ret > 0) filp->f_pos += ret; -- cgit v0.10.2 From 19dba33c43a2f0f2aa727ae075ec3b11330775ef Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Oct 2008 10:51:49 +0800 Subject: tracepoint: simplification for tracepoints using RCU Impact: simplify implementation Now, unused memory is handled by struct tp_probes. old code use these three field to handle unused memory. struct tracepoint_entry { ... struct rcu_head rcu; void *oldptr; unsigned char rcu_pending:1; ... }; in this way, unused memory is handled by struct tracepoint_entry. it bring reenter bug(it was fixed) and tracepoint.c is filled full of ".*rcu.*" code statements. this patch removes all these. and: rcu_barrier_sched() is removed. Do not need regain tracepoints_mutex after tracepoint_update_probes() several little cleanup. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index af8c856..3e22867 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex); */ #define TRACEPOINT_HASH_BITS 6 #define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : @@ -54,40 +55,40 @@ struct tracepoint_entry { struct hlist_node hlist; void **funcs; int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - unsigned char rcu_pending:1; char name[0]; }; -static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; +struct tp_probes { + struct rcu_head rcu; + void *probes[0]; +}; -static void free_old_closure(struct rcu_head *head) +static inline void *allocate_probes(int count) { - struct tracepoint_entry *entry = container_of(head, - struct tracepoint_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; + struct tp_probes *p = kmalloc(count * sizeof(void *) + + sizeof(struct tp_probes), GFP_KERNEL); + return p == NULL ? NULL : p->probes; } -static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +static void rcu_free_old_probes(struct rcu_head *head) { - if (!old) - return; - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); + kfree(container_of(head, struct tp_probes, rcu)); +} + +static inline void release_probes(void *old) +{ + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + call_rcu(&tp_probes->rcu, rcu_free_old_probes); + } } static void debug_print_probes(struct tracepoint_entry *entry) { int i; - if (!tracepoint_debug) + if (!tracepoint_debug || !entry->funcs) return; for (i = 0; entry->funcs[i]; i++) @@ -111,12 +112,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); if (old) memcpy(new, old, nr_probes * sizeof(void *)); new[nr_probes] = probe; + new[nr_probes + 1] = NULL; entry->refcount = nr_probes + 1; entry->funcs = new; debug_print_probes(entry); @@ -132,7 +134,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; if (!old) - return NULL; + return ERR_PTR(-ENOENT); debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ @@ -151,13 +153,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) int j = 0; /* N -> M, (N > 1, M > 0) */ /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes - nr_del + 1); if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i]; i++) if ((probe && old[i] != probe)) new[j++] = old[i]; + new[nr_probes - nr_del] = NULL; entry->refcount = nr_probes - nr_del; entry->funcs = new; } @@ -215,7 +217,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name) memcpy(&e->name[0], name, name_len); e->funcs = NULL; e->refcount = 0; - e->rcu_pending = 0; hlist_add_head(&e->hlist, head); return e; } @@ -224,32 +225,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name) * Remove the tracepoint from the tracepoint hash table. Must be called with * mutex_lock held. */ -static int remove_tracepoint(const char *name) +static inline void remove_tracepoint(struct tracepoint_entry *e) { - struct hlist_head *head; - struct hlist_node *node; - struct tracepoint_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->refcount) - return -EBUSY; hlist_del(&e->hlist); - /* Make sure the call_rcu_sched has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); kfree(e); - return 0; } /* @@ -343,25 +322,17 @@ int tracepoint_probe_register(const char *name, void *probe) goto end; } } - /* - * If we detect that a call_rcu_sched is pending for this tracepoint, - * make sure it's executed now. - */ - if (entry->rcu_pending) - rcu_barrier_sched(); old = tracepoint_entry_add_probe(entry, probe); if (IS_ERR(old)) { + if (!entry->refcount) + remove_tracepoint(entry); ret = PTR_ERR(old); goto end; } mutex_unlock(&tracepoints_mutex); tracepoint_update_probes(); /* may update entry */ - mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - WARN_ON(!entry); - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); + release_probes(old); + return 0; end: mutex_unlock(&tracepoints_mutex); return ret; @@ -388,25 +359,17 @@ int tracepoint_probe_unregister(const char *name, void *probe) entry = get_tracepoint(name); if (!entry) goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); old = tracepoint_entry_remove_probe(entry, probe); - if (!old) { - printk(KERN_WARNING "Warning: Trying to unregister a probe" - "that doesn't exist\n"); + if (IS_ERR(old)) { + ret = PTR_ERR(old); goto end; } + if (!entry->refcount) + remove_tracepoint(entry); mutex_unlock(&tracepoints_mutex); tracepoint_update_probes(); /* may update entry */ - mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); - remove_tracepoint(name); /* Ignore busy error message */ - ret = 0; + release_probes(old); + return 0; end: mutex_unlock(&tracepoints_mutex); return ret; -- cgit v0.10.2 From 127cafbb276266b1b8da967bfe25a062ab1d42ab Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Oct 2008 10:51:53 +0800 Subject: tracepoint: introduce *_noupdate APIs. Impact: add new tracepoint APIs to allow the batched registration of probes new APIs separate tracepoint_probe_register(), tracepoint_probe_unregister() into 2 steps. The first step of them is just update tracepoint_entry, not connect or disconnect. this patch introduces tracepoint_probe_update_all() for update all. these APIs are very useful for registering lots of probes but just updating once. Another very important thing is that *_noupdate APIs do not require module_mutex. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c5bb39c..63064e9 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -112,6 +112,10 @@ extern int tracepoint_probe_register(const char *name, void *probe); */ extern int tracepoint_probe_unregister(const char *name, void *probe); +extern int tracepoint_probe_register_noupdate(const char *name, void *probe); +extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe); +extern void tracepoint_probe_update_all(void); + struct tracepoint_iter { struct module *module; struct tracepoint *tracepoint; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 3e22867..e96590f 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -59,7 +59,10 @@ struct tracepoint_entry { }; struct tp_probes { - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct list_head list; + } u; void *probes[0]; }; @@ -72,7 +75,7 @@ static inline void *allocate_probes(int count) static void rcu_free_old_probes(struct rcu_head *head) { - kfree(container_of(head, struct tp_probes, rcu)); + kfree(container_of(head, struct tp_probes, u.rcu)); } static inline void release_probes(void *old) @@ -80,7 +83,7 @@ static inline void release_probes(void *old) if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - call_rcu(&tp_probes->rcu, rcu_free_old_probes); + call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); } } @@ -299,6 +302,23 @@ static void tracepoint_update_probes(void) module_update_tracepoints(); } +static void *tracepoint_add_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) { + entry = add_tracepoint(name); + if (IS_ERR(entry)) + return entry; + } + old = tracepoint_entry_add_probe(entry, probe); + if (IS_ERR(old) && !entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_register - Connect a probe to a tracepoint * @name: tracepoint name @@ -309,36 +329,36 @@ static void tracepoint_update_probes(void) */ int tracepoint_probe_register(const char *name, void *probe) { - struct tracepoint_entry *entry; - int ret = 0; void *old; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) { - entry = add_tracepoint(name); - if (IS_ERR(entry)) { - ret = PTR_ERR(entry); - goto end; - } - } - old = tracepoint_entry_add_probe(entry, probe); - if (IS_ERR(old)) { - if (!entry->refcount) - remove_tracepoint(entry); - ret = PTR_ERR(old); - goto end; - } + old = tracepoint_add_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ release_probes(old); return 0; -end: - mutex_unlock(&tracepoints_mutex); - return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); +static void *tracepoint_remove_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) + return ERR_PTR(-ENOENT); + old = tracepoint_entry_remove_probe(entry, probe); + if (IS_ERR(old)) + return old; + if (!entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @name: tracepoint name @@ -351,31 +371,105 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); */ int tracepoint_probe_unregister(const char *name, void *probe) { - struct tracepoint_entry *entry; void *old; - int ret = -ENOENT; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - old = tracepoint_entry_remove_probe(entry, probe); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } - if (!entry->refcount) - remove_tracepoint(entry); + old = tracepoint_remove_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ release_probes(old); return 0; -end: - mutex_unlock(&tracepoints_mutex); - return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +static LIST_HEAD(old_probes); +static int need_update; + +static void tracepoint_add_old_probes(void *old) +{ + need_update = 1; + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + list_add(&tp_probes->u.list, &old_probes); + } +} + +/** + * tracepoint_probe_register_noupdate - register a probe but not connect + * @name: tracepoint name + * @probe: probe handler + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_register_noupdate(const char *name, void *probe) +{ + void *old; + + mutex_lock(&tracepoints_mutex); + old = tracepoint_add_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); + mutex_unlock(&tracepoints_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); + +/** + * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect + * @name: tracepoint name + * @probe: probe function pointer + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_unregister_noupdate(const char *name, void *probe) +{ + void *old; + + mutex_lock(&tracepoints_mutex); + old = tracepoint_remove_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); + mutex_unlock(&tracepoints_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); + +/** + * tracepoint_probe_update_all - update tracepoints + */ +void tracepoint_probe_update_all(void) +{ + LIST_HEAD(release_probes); + struct tp_probes *pos, *next; + + mutex_lock(&tracepoints_mutex); + if (!need_update) { + mutex_unlock(&tracepoints_mutex); + return; + } + if (!list_empty(&old_probes)) + list_replace_init(&old_probes, &release_probes); + need_update = 0; + mutex_unlock(&tracepoints_mutex); + + tracepoint_update_probes(); + list_for_each_entry_safe(pos, next, &release_probes, u.list) { + list_del(&pos->u.list); + call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); + } +} +EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); + /** * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. * @tracepoint: current tracepoints (in), next tracepoint (out) -- cgit v0.10.2 From 7e5e26a3d8ac4bcadb380073dc9604c07a9a6198 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 09:36:38 -0400 Subject: ftrace: fix hardirq header for non ftrace archs Impact: build fix for non-ftrace architectures Not all archs implement ftrace, and therefore do not have an asm/ftrace.h. This patch corrects the problem. The ftrace_nmi_enter/exit now must be defined for all archs that implement dynamic ftrace. Currently, only x86 does. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 3f3a1d1..39c8bc1 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -1,11 +1,6 @@ #ifndef _ASM_ARM_FTRACE #define _ASM_ARM_FTRACE -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 1cd7270..b298f7a 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -1,11 +1,6 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index 31ada03..3aed362 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -2,11 +2,6 @@ #define __ASM_SH_FTRACE_H #ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - -#ifndef __ASSEMBLY__ extern void mcount(void); #endif diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 62055ac..d27716c 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -1,11 +1,6 @@ #ifndef _ASM_SPARC64_FTRACE #define _ASM_SPARC64_FTRACE -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - #ifdef CONFIG_MCOUNT #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index a234681..f8173ed 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,23 +17,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } - -#ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_nmi_enter(void); -extern void ftrace_nmi_exit(void); -#else -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif #endif /* __ASSEMBLY__ */ - -#else /* CONFIG_FUNCTION_TRACER */ - -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - #endif /* CONFIG_FUNCTION_TRACER */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e46a7b3..0ad1b48 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -44,7 +44,6 @@ static inline void ftrace_kill(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE - enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), @@ -105,6 +104,8 @@ extern void ftrace_release(void *start, unsigned long size); extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); +extern void ftrace_nmi_enter(void); +extern void ftrace_nmi_exit(void); #else # define skip_trace(ip) ({ 0; }) @@ -113,6 +114,8 @@ extern void ftrace_enable_daemon(void); # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) static inline void ftrace_release(void *start, unsigned long size) { } +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 0087cb4..ffc16ab 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -4,8 +4,8 @@ #include #include #include +#include #include -#include #include /* -- cgit v0.10.2 From 8f0a056fcb2f83a069fb5d60c2383304b7456687 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 3 Nov 2008 23:15:55 -0500 Subject: ftrace: introduce ftrace_preempt_disable()/enable() Impact: add new ftrace-plugin internal APIs Parts of the tracer needs to be careful about schedule recursion. If the NEED_RESCHED flag is set, a preempt_enable will call schedule. Inside the schedule function, the NEED_RESCHED flag is cleared. The problem arises when a trace happens in the schedule function but before NEED_RESCHED is cleared. The race is as follows: schedule() >> tracer called trace_function() preempt_disable() [ record trace ] preempt_enable() <<- here's the issue. [check NEED_RESCHED] schedule() [ Repeat the above, over and over again ] The naive approach is simply to use preempt_enable_no_schedule instead. The problem with that approach is that, although we solve the schedule recursion issue, we now might lose a preemption check when not in the schedule function. trace_function() preempt_disable() [ record trace ] [Interrupt comes in and sets NEED_RESCHED] preempt_enable_no_resched() [continue without scheduling] The way ftrace handles this problem is with the following approach: int resched; resched = need_resched(); preempt_disable_notrace(); [record trace] if (resched) preempt_enable_no_sched_notrace(); else preempt_enable_notrace(); This may seem like the opposite of what we want. If resched is set then we call the "no_sched" version?? The reason we do this is because if NEED_RESCHED is set before we disable preemption, there's two reasons for that: 1) we are in an atomic code path 2) we are already on our way to the schedule function, and maybe even in the schedule function, but have yet to clear the flag. Both the above cases we do not want to schedule. This solution has already been implemented within the ftrace infrastructure. But the problem is that it has been implemented several times. This patch encapsulates this code to two nice functions. resched = ftrace_preempt_disable(); [ record trace] ftrace_preempt_enable(resched); This way the tracers do not need to worry about getting it right. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8465ad0..10c6dae 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -419,4 +419,52 @@ enum trace_iterator_flags { extern struct tracer nop_trace; +/** + * ftrace_preempt_disable - disable preemption scheduler safe + * + * When tracing can happen inside the scheduler, there exists + * cases that the tracing might happen before the need_resched + * flag is checked. If this happens and the tracer calls + * preempt_enable (after a disable), a schedule might take place + * causing an infinite recursion. + * + * To prevent this, we read the need_recshed flag before + * disabling preemption. When we want to enable preemption we + * check the flag, if it is set, then we call preempt_enable_no_resched. + * Otherwise, we call preempt_enable. + * + * The rational for doing the above is that if need resched is set + * and we have yet to reschedule, we are either in an atomic location + * (where we do not need to check for scheduling) or we are inside + * the scheduler and do not want to resched. + */ +static inline int ftrace_preempt_disable(void) +{ + int resched; + + resched = need_resched(); + preempt_disable_notrace(); + + return resched; +} + +/** + * ftrace_preempt_enable - enable preemption scheduler safe + * @resched: the return value from ftrace_preempt_disable + * + * This is a scheduler safe way to enable preemption and not miss + * any preemption checks. The disabled saved the state of preemption. + * If resched is set, then we were either inside an atomic or + * are inside the scheduler (we would have already scheduled + * otherwise). In this case, we do not want to call normal + * preempt_enable, but preempt_enable_no_resched instead. + */ +static inline void ftrace_preempt_enable(int resched) +{ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v0.10.2 From 182e9f5f704ed6b9175142fe8da33c9ce0c52b52 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 3 Nov 2008 23:15:56 -0500 Subject: ftrace: insert in the ftrace_preempt_disable()/enable() functions Impact: use new, consolidated APIs in ftrace plugins This patch replaces the schedule safe preempt disable code with the ftrace_preempt_disable() and ftrace_preempt_enable() safe functions. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cedf4e2..151f6a7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -16,6 +16,8 @@ #include #include +#include "trace.h" + /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 @@ -1122,8 +1124,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, return NULL; /* If we are tracing schedule, we don't want to recurse */ - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1154,10 +1155,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, return event; out: - if (resched) - preempt_enable_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return NULL; } @@ -1199,12 +1197,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, /* * Only the last preempt count needs to restore preemption. */ - if (preempt_count() == 1) { - if (per_cpu(rb_need_resched, cpu)) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); - } else + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else preempt_enable_no_resched_notrace(); return 0; @@ -1237,8 +1232,7 @@ int ring_buffer_write(struct ring_buffer *buffer, if (atomic_read(&buffer->record_disabled)) return -EBUSY; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1264,10 +1258,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e4c40c8..3e7bf5e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -904,8 +904,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -915,10 +914,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3ae93f1..7bc4abf 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&data->disabled); - /* - * To prevent recursion from the scheduler, if the - * resched flag was set before we entered, then - * don't reschedule. - */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index be682b6..d39e8b7 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -107,8 +107,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -120,10 +119,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = -- cgit v0.10.2 From b2a866f9344cb30d7ddf5d67b5b8393daf8bef07 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 3 Nov 2008 23:15:57 -0500 Subject: ftrace: function tracer with irqs disabled Impact: disable interrupts during trace entry creation (as opposed to preempt) To help with performance, I set the ftracer to not disable interrupts, and only to disable preemption. If an interrupt occurred, it would not be traced, because the function tracer protects itself from recursion. This may be faster, but the trace output might miss some traces. This patch makes the fuction trace disable interrupts, but it also adds a runtime feature to disable preemption instead. It does this by having two different tracer functions. When the function tracer is enabled, it will check to see which version is requested (irqs disabled or preemption disabled). Then it will use the corresponding function as the tracer. Irq disabling is the default behaviour, but if the user wants better performance, with the chance of missing traces, then they can choose the preempt disabled version. Running hackbench 3 times with the irqs disabled and 3 times with the preempt disabled function tracer yielded: tracing type times entries recorded ------------ -------- ---------------- irq disabled 43.393 166433066 43.282 166172618 43.298 166256704 preempt disabled 38.969 159871710 38.943 159972935 39.325 161056510 Average: irqs disabled: 43.324 166287462 preempt disabled: 39.079 160300385 preempt is 10.8 percent faster than irqs disabled. I wrote a patch to count function trace recursion and reran hackbench. With irq disabled: 1,150 times the function tracer did not trace due to recursion. with preempt disabled: 5,117,718 times. The thousand times with irq disabled could be due to NMIs, or simply a case where it called a function that was not protected by notrace. But we also see that a large amount of the trace is lost with the preempt version. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3e7bf5e..d576dbd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -244,6 +244,7 @@ static const char *trace_options[] = { "stacktrace", "sched-tree", "ftrace_printk", + "ftrace_preempt", NULL }; @@ -891,7 +892,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) #ifdef CONFIG_FUNCTION_TRACER static void -function_trace_call(unsigned long ip, unsigned long parent_ip) +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -917,6 +918,37 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) ftrace_preempt_enable(resched); } +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); + } + + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, @@ -925,6 +957,12 @@ static struct ftrace_ops trace_ops __read_mostly = void tracing_start_function_trace(void) { ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + register_ftrace_function(&trace_ops); if (tracer_enabled) ftrace_function_enabled = 1; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10c6dae..bb547e9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -415,6 +415,7 @@ enum trace_iterator_flags { TRACE_ITER_STACKTRACE = 0x100, TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, + TRACE_ITER_PREEMPTONLY = 0x800, }; extern struct tracer nop_trace; -- cgit v0.10.2 From 69f698adcf43930a283f630395a1bb781962cfe6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 1 Nov 2008 19:53:34 +0100 Subject: ftrace: sysrq-z to dump the buffers Impact: add SysRq-z support to dump trace buffers Allows one to force an ftrace dump from sysrq Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index ce0d9da..94966ed 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -274,6 +274,22 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; +#ifdef CONFIG_TRACING +#include + +static void sysrq_ftrace_dump(int key, struct tty_struct *tty) +{ + ftrace_dump(); +} +static struct sysrq_key_op sysrq_ftrace_dump_op = { + .handler = sysrq_ftrace_dump, + .help_msg = "dumpZ-ftrace-buffer", + .action_msg = "Dump ftrace buffer", + .enable_mask = SYSRQ_ENABLE_DUMP, +}; +#else +#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)0) +#endif static void sysrq_handle_showmem(int key, struct tty_struct *tty) { @@ -406,7 +422,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = { NULL, /* x */ /* y: May be registered on sparc64 for global register dump */ NULL, /* y */ - NULL /* z */ + &sysrq_ftrace_dump_op, /* z */ }; /* key2index calculation, -1 on invalid index */ -- cgit v0.10.2 From 3299b4dd1180762da831be5eb6adc44553eaec26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Nov 2008 11:58:21 +0100 Subject: ftrace: sysctl typo Impact: fix sysctl name typo Steve must have needed more coffee ;-) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6b6b727..65d4a9b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -487,7 +487,7 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_TRACING { .ctl_name = CTL_UNNUMBERED, - .procname = "ftrace_dump_on_opps", + .procname = "ftrace_dump_on_oops", .data = &ftrace_dump_on_oops, .maxlen = sizeof(int), .mode = 0644, -- cgit v0.10.2 From 71566a0d161edec70361b7f90f6e54af6a6d5d05 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 31 Oct 2008 12:57:20 +0100 Subject: tracing/fastboot: Enable boot tracing only during initcalls Impact: modify boot tracer We used to disable the initcall tracing at a specified time (IE: end of builtin initcalls). But we don't need it anymore. It will be stopped when initcalls are finished. However we want two things: _Start this tracing only after pre-smp initcalls are finished. _Since we are planning to trace sched_switches at the same time, we want to enable them only during the initcall execution. For this purpose, this patch introduce two functions to enable/disable the sched_switch tracing during boot. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e46a7b3..4642959 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -234,6 +234,11 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } #endif +/* + * Structure which defines the trace of an initcall. + * You don't have to fill the func field since it is + * only used internally by the tracer. + */ struct boot_trace { pid_t caller; char func[KSYM_NAME_LEN]; @@ -244,13 +249,28 @@ struct boot_trace { }; #ifdef CONFIG_BOOT_TRACER +/* Append the trace on the ring-buffer */ extern void trace_boot(struct boot_trace *it, initcall_t fn); + +/* Tells the tracer that smp_pre_initcall is finished. + * So we can start the tracing + */ extern void start_boot_trace(void); -extern void stop_boot_trace(void); + +/* Resume the tracing of other necessary events + * such as sched switches + */ +extern void enable_boot_trace(void); + +/* Suspend this tracing. Actually, only sched_switches tracing have + * to be suspended. Initcalls doesn't need it.) + */ +extern void disable_boot_trace(void); #else static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } static inline void start_boot_trace(void) { } -static inline void stop_boot_trace(void) { } +static inline void enable_boot_trace(void) { } +static inline void disable_boot_trace(void) { } #endif diff --git a/init/main.c b/init/main.c index 7e117a2..4b03cd5 100644 --- a/init/main.c +++ b/init/main.c @@ -711,6 +711,7 @@ int do_one_initcall(initcall_t fn) it.caller = task_pid_nr(current); printk("calling %pF @ %i\n", fn, it.caller); it.calltime = ktime_get(); + enable_boot_trace(); } it.result = fn(); @@ -722,6 +723,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned %d after %Ld usecs\n", fn, it.result, it.duration); trace_boot(&it, fn); + disable_boot_trace(); } msgbuf[0] = 0; @@ -882,7 +884,7 @@ static int __init kernel_init(void * unused) * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ - stop_boot_trace(); + init_post(); return 0; } diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d0a5e50..d104d5b 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -13,23 +13,29 @@ #include "trace.h" static struct trace_array *boot_trace; -static int trace_boot_enabled; +static bool pre_initcalls_finished; - -/* Should be started after do_pre_smp_initcalls() in init/main.c */ +/* Tells the boot tracer that the pre_smp_initcalls are finished. + * So we are ready . + * It doesn't enable sched events tracing however. + * You have to call enable_boot_trace to do so. + */ void start_boot_trace(void) { - trace_boot_enabled = 1; + pre_initcalls_finished = true; +} + +void enable_boot_trace(void) +{ } -void stop_boot_trace(void) +void disable_boot_trace(void) { - trace_boot_enabled = 0; } void reset_boot_trace(struct trace_array *tr) { - stop_boot_trace(); + disable_boot_trace(); } static void boot_trace_init(struct trace_array *tr) @@ -37,8 +43,6 @@ static void boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; - trace_boot_enabled = 0; - for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); } @@ -46,9 +50,9 @@ static void boot_trace_init(struct trace_array *tr) static void boot_trace_ctrl_update(struct trace_array *tr) { if (tr->ctrl) - start_boot_trace(); + enable_boot_trace(); else - stop_boot_trace(); + disable_boot_trace(); } static enum print_line_t initcall_print_line(struct trace_iterator *iter) @@ -99,7 +103,7 @@ void trace_boot(struct boot_trace *it, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!trace_boot_enabled) + if (!pre_initcalls_finished) return; /* Get its name now since this function could -- cgit v0.10.2 From 07695fa04e8a3384b0c855398ce1f7885bd7dc3b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 31 Oct 2008 13:08:28 +0100 Subject: tracing/ftrace: fix a race condition in sched_switch tracer Impact: fix race condition in sched_switch tracer This patch fixes a race condition in the sched_switch tracer. If several tasks (IE: concurrent initcalls) are playing with tracing_start_cmdline_record() and tracing_stop_cmdline_record(), the following situation could happen: _ Task A and B are using the same tracepoint probe. Task A holds it. Task B is sleeping and doesn't hold it. _ Task A frees the sched tracer, then sched_ref is decremented to 0. _ Task A is preempted and hadn't yet unregistered its tracepoint probe, then B runs. _ B increments sched_ref, sees it's 1 and then guess it has to register its probe. But it has not been freed by task A. _ A lot of bad things can happen after that... Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index b8f56be..59de514 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -17,6 +17,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; static atomic_t sched_ref; +static DEFINE_MUTEX(tracepoint_mutex); static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, @@ -125,18 +126,22 @@ static void tracing_start_sched_switch(void) { long ref; + mutex_lock(&tracepoint_mutex); ref = atomic_inc_return(&sched_ref); if (ref == 1) tracing_sched_register(); + mutex_unlock(&tracepoint_mutex); } static void tracing_stop_sched_switch(void) { long ref; + mutex_lock(&tracepoint_mutex); ref = atomic_dec_and_test(&sched_ref); if (ref) tracing_sched_unregister(); + mutex_unlock(&tracepoint_mutex); } void tracing_start_cmdline_record(void) -- cgit v0.10.2 From e55f605c14679c30be41473e60b7ad26524cdc35 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 31 Oct 2008 13:14:28 +0100 Subject: tracing/ftrace: remove unused code in sched_switch tracer Impact: cleanup When init_sched_switch_trace() is called, it has no reason to start the sched tracer if the sched_ref is not zero. _ If this is non-zero, the tracer is already used, but we can register it to the tracing engine. There is already a security which avoid the tracer probes not to be resgistered twice. _ If this is zero, this block will not be used. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 59de514..96620c7 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -203,14 +203,6 @@ static struct tracer sched_switch_trace __read_mostly = __init static int init_sched_switch_trace(void) { - int ret = 0; - - if (atomic_read(&sched_ref)) - ret = tracing_sched_register(); - if (ret) { - pr_info("error registering scheduler trace\n"); - return ret; - } return register_tracer(&sched_switch_trace); } device_initcall(init_sched_switch_trace); -- cgit v0.10.2 From d7ad44b697c9d13e445ddc7d16f736fbac333249 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 31 Oct 2008 13:20:08 +0100 Subject: tracing/fastboot: use sched switch tracer from boot tracer Impact: enhance boot trace output with scheduling events Use the sched_switch tracer from the boot tracer. We also can trace schedule events inside the initcalls. Sched tracing is disabled after the initcall has finished and then reenabled before the next one is started. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e4c40c8..50d7018 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3251,6 +3251,8 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); #ifdef CONFIG_BOOT_TRACER + /* We don't want to launch sched_switch tracer yet */ + global_trace.ctrl = 0; register_tracer(&boot_tracer); current_trace = &boot_tracer; current_trace->init(&global_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8465ad0..9911277 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -49,6 +49,7 @@ struct ftrace_entry { unsigned long parent_ip; }; extern struct tracer boot_tracer; +extern struct tracer sched_switch_trace; /* Used by the boot tracer */ /* * Context switch trace entry - which task (and prio) we switched from/to: diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d104d5b..6bbc879 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -27,10 +27,14 @@ void start_boot_trace(void) void enable_boot_trace(void) { + if (pre_initcalls_finished) + tracing_start_cmdline_record(); } void disable_boot_trace(void) { + if (pre_initcalls_finished) + tracing_stop_cmdline_record(); } void reset_boot_trace(struct trace_array *tr) @@ -45,6 +49,8 @@ static void boot_trace_init(struct trace_array *tr) for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); + + sched_switch_trace.init(tr); } static void boot_trace_ctrl_update(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 96620c7..9d7bdac 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -127,6 +127,7 @@ static void tracing_start_sched_switch(void) long ref; mutex_lock(&tracepoint_mutex); + tracer_enabled = 1; ref = atomic_inc_return(&sched_ref); if (ref == 1) tracing_sched_register(); @@ -138,6 +139,7 @@ static void tracing_stop_sched_switch(void) long ref; mutex_lock(&tracepoint_mutex); + tracer_enabled = 0; ref = atomic_dec_and_test(&sched_ref); if (ref) tracing_sched_unregister(); @@ -158,12 +160,10 @@ static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); tracing_start_cmdline_record(); - tracer_enabled = 1; } static void stop_sched_trace(struct trace_array *tr) { - tracer_enabled = 0; tracing_stop_cmdline_record(); } @@ -190,7 +190,7 @@ static void sched_switch_trace_ctrl_update(struct trace_array *tr) stop_sched_trace(tr); } -static struct tracer sched_switch_trace __read_mostly = +struct tracer sched_switch_trace __read_mostly = { .name = "sched_switch", .init = sched_switch_trace_init, -- cgit v0.10.2 From efade6e7821c4219818e9da08f9315dfa617048b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 31 Oct 2008 13:28:58 +0100 Subject: tracing/ftrace: types and naming corrections for sched tracer Impact: cleanup This patch applies some corrections suggested by Steven Rostedt. Change the type of shed_ref into int since it is used into a Mutex, we don't need it anymore as an atomic variable in the sched_switch tracer. Also change the name of the register mutex. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9d7bdac..969953b 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,8 +16,8 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; -static atomic_t sched_ref; -static DEFINE_MUTEX(tracepoint_mutex); +static int sched_ref; +static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, @@ -28,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!atomic_read(&sched_ref)) + if (!sched_ref) return; tracing_record_cmdline(prev); @@ -124,26 +124,22 @@ static void tracing_sched_unregister(void) static void tracing_start_sched_switch(void) { - long ref; - - mutex_lock(&tracepoint_mutex); - tracer_enabled = 1; - ref = atomic_inc_return(&sched_ref); - if (ref == 1) + mutex_lock(&sched_register_mutex); + if (!(sched_ref++)) { + tracer_enabled = 1; tracing_sched_register(); - mutex_unlock(&tracepoint_mutex); + } + mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(void) { - long ref; - - mutex_lock(&tracepoint_mutex); - tracer_enabled = 0; - ref = atomic_dec_and_test(&sched_ref); - if (ref) + mutex_lock(&sched_register_mutex); + if (!(--sched_ref)) { tracing_sched_unregister(); - mutex_unlock(&tracepoint_mutex); + tracer_enabled = 0; + } + mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) -- cgit v0.10.2 From 79a9d461fd521f133f0e66485aa9ed09c21f5191 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 31 Oct 2008 13:34:45 +0100 Subject: tracing/ftrace: fix a bug when switch current tracer to sched tracer Impact: fix boot tracer + sched tracer coupling bug Fix a bug that made the sched_switch tracer unable to run if set as the current_tracer after the boot tracer. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 6bbc879..bd5046c 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -37,9 +37,9 @@ void disable_boot_trace(void) tracing_stop_cmdline_record(); } -void reset_boot_trace(struct trace_array *tr) +static void reset_boot_trace(struct trace_array *tr) { - disable_boot_trace(); + sched_switch_trace.reset(tr); } static void boot_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 969953b..888944d 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -173,7 +173,7 @@ static void sched_switch_trace_init(struct trace_array *tr) static void sched_switch_trace_reset(struct trace_array *tr) { - if (tr->ctrl) + if (tr->ctrl && sched_ref) stop_sched_trace(tr); } -- cgit v0.10.2 From 60a7ecf42661f2b22168751298592da6ee210c9e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Nov 2008 16:05:44 -0500 Subject: ftrace: add quick function trace stop Impact: quick start and stop of function tracer This patch adds a way to disable the function tracer quickly without the need to run kstop_machine. It adds a new variable called function_trace_stop which will stop the calls to functions from mcount when set. This is just an on/off switch and does not handle recursion like preempt_disable(). It's main purpose is to help other tracers/debuggers start and stop tracing fuctions without the need to call kstop_machine. The config option HAVE_FUNCTION_TRACE_MCOUNT_TEST is added for archs that implement the testing of the function_trace_stop in the mcount arch dependent code. Otherwise, the test is done in the C code. x86 is the only arch at the moment that supports this. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6f20718..d09e812 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,6 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597e..9134de8 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1157,6 +1157,9 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1180,6 +1183,9 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpl $ftrace_stub, ftrace_trace_function jnz trace .globl ftrace_stub diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332..08aa6b1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -68,6 +68,8 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub /* taken from glibc */ subq $0x38, %rsp @@ -103,6 +105,9 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpq $ftrace_stub, ftrace_trace_function jnz trace .globl ftrace_stub diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4642959..794ab90 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -23,6 +23,34 @@ struct ftrace_ops { struct ftrace_ops *next; }; +extern int function_trace_stop; + +/** + * ftrace_stop - stop function tracer. + * + * A quick way to stop the function tracer. Note this an on off switch, + * it is not something that is recursive like preempt_disable. + * This does not disable the calling of mcount, it only stops the + * calling of functions from mcount. + */ +static inline void ftrace_stop(void) +{ + function_trace_stop = 1; +} + +/** + * ftrace_start - start the function tracer. + * + * This function is the inverse of ftrace_stop. This does not enable + * the function tracing if the function tracer is disabled. This only + * sets the function tracer flag to continue calling the functions + * from mcount. + */ +static inline void ftrace_start(void) +{ + function_trace_stop = 0; +} + /* * The ftrace_ops must be a static and should also * be read_mostly. These functions do modify read_mostly variables @@ -41,6 +69,8 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1); # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) static inline void ftrace_kill(void) { } +static inline void ftrace_stop(void) { } +static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 33dbefd..fc4febc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,13 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool +config HAVE_FUNCTION_TRACE_MCOUNT_TEST + bool + help + This gets selected when the arch tests the function_trace_stop + variable at the mcount call site. Otherwise, this variable + is tested by the called function. + config HAVE_DYNAMIC_FTRACE bool diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a39d24..896c71f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -47,6 +47,9 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* Quick disabling of function tracer. */ +int function_trace_stop; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -63,6 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { @@ -88,8 +92,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; + __ftrace_trace_function = ftrace_stub; } +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST +/* + * For those archs that do not test ftrace_trace_stop in their + * mcount call site, we need to do it from C. + */ +static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +{ + if (function_trace_stop) + return; + + __ftrace_trace_function(ip, parent_ip); +} +#endif + static int __register_ftrace_function(struct ftrace_ops *ops) { /* should not be called from interrupt context */ @@ -110,10 +129,18 @@ static int __register_ftrace_function(struct ftrace_ops *ops) * For one func, simply call it directly. * For more than one func, call the chain. */ +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST if (ops->next == &ftrace_list_end) ftrace_trace_function = ops->func; else ftrace_trace_function = ftrace_list_func; +#else + if (ops->next == &ftrace_list_end) + __ftrace_trace_function = ops->func; + else + __ftrace_trace_function = ftrace_list_func; + ftrace_trace_function = ftrace_test_stop_func; +#endif } spin_unlock(&ftrace_lock); @@ -526,7 +553,7 @@ static void ftrace_run_update_code(int command) } static ftrace_func_t saved_ftrace_func; -static int ftrace_start; +static int ftrace_start_up; static DEFINE_MUTEX(ftrace_start_lock); static void ftrace_startup(void) @@ -537,8 +564,8 @@ static void ftrace_startup(void) return; mutex_lock(&ftrace_start_lock); - ftrace_start++; - if (ftrace_start == 1) + ftrace_start_up++; + if (ftrace_start_up == 1) command |= FTRACE_ENABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -562,8 +589,8 @@ static void ftrace_shutdown(void) return; mutex_lock(&ftrace_start_lock); - ftrace_start--; - if (!ftrace_start) + ftrace_start_up--; + if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -589,8 +616,8 @@ static void ftrace_startup_sysctl(void) mutex_lock(&ftrace_start_lock); /* Force update next time */ saved_ftrace_func = NULL; - /* ftrace_start is true if we want ftrace running */ - if (ftrace_start) + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) command |= FTRACE_ENABLE_CALLS; ftrace_run_update_code(command); @@ -605,8 +632,8 @@ static void ftrace_shutdown_sysctl(void) return; mutex_lock(&ftrace_start_lock); - /* ftrace_start is true if ftrace is running */ - if (ftrace_start) + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) command |= FTRACE_DISABLE_CALLS; ftrace_run_update_code(command); @@ -1186,7 +1213,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftrace_start_lock); - if (iter->filtered && ftrace_start && ftrace_enabled) + if (iter->filtered && ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); -- cgit v0.10.2 From 0f04870148ecb825133bc2733f473b1c5773ac0b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Nov 2008 16:05:44 -0500 Subject: ftrace: soft tracing stop and start Impact: add way to quickly start stop tracing from the kernel This patch adds a soft stop and start to the trace. This simply disables function tracing via the ftrace_disabled flag, and disables the trace buffers to prevent recording. The tracing code may still be executed, but the trace will not be recorded. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 794ab90..7a75fc6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -216,6 +216,9 @@ static inline void __ftrace_enabled_restore(int enabled) #ifdef CONFIG_TRACING extern int ftrace_dump_on_oops; +extern void tracing_start(void); +extern void tracing_stop(void); + extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); @@ -246,6 +249,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); +static inline void tracing_start(void) { } +static inline void tracing_stop(void) { } static inline int ftrace_printk(const char *fmt, ...) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29ab40a..113aea9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -43,6 +43,15 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; + +/* + * Kill all tracing for good (never come back). + * It is initialized to 1 but will turn to zero if the initialization + * of the tracer is successful. But that is the only place that sets + * this back to zero. + */ +int tracing_disabled = 1; + static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) @@ -62,8 +71,6 @@ static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu_mask(cpu, tracing_buffer_mask) -static int tracing_disabled = 1; - /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * @@ -613,6 +620,76 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } +static int trace_stop_count; +static DEFINE_SPINLOCK(tracing_start_lock); + +/** + * tracing_start - quick start of the tracer + * + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. + */ +void tracing_start(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + spin_lock_irqsave(&tracing_start_lock, flags); + if (--trace_stop_count) + goto out; + + if (trace_stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + trace_stop_count = 0; + goto out; + } + + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + ftrace_start(); + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. + */ +void tracing_stop(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + ftrace_stop(); + spin_lock_irqsave(&tracing_start_lock, flags); + if (trace_stop_count++) + goto out; + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + void trace_stop_cmdline_recording(void); static void trace_save_cmdline(struct task_struct *tsk) -- cgit v0.10.2 From 9036990d462e09366f7297a2d1da6582c3e6b1d3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Nov 2008 16:05:44 -0500 Subject: ftrace: restructure tracing start/stop infrastructure Impact: change where tracing is started up and stopped Currently, when a new tracer is selected via echo'ing a tracer name into the current_tracer file, the startup is only done if tracing_enabled is set to one. If tracing_enabled is changed to zero (by echo'ing 0 into the tracing_enabled file) a full shutdown is performed. The full startup and shutdown of a tracer can be expensive and the user can lose out traces when echo'ing in 0 to the tracing_enabled file, because the process takes too long. There can also be places that the user would like to start and stop the tracer several times and doing the full startup and shutdown of a tracer might be too expensive. This patch performs the full startup and shutdown when a tracer is selected. It also adds a way to do a quick start or stop of a tracer. The quick version is just a flag that prevents the tracing from taking place, but the overhead of the code is still there. For example, the startup of a tracer may enable tracepoints, or enable the function tracer. The stop and start will just set a flag to have the tracer ignore the calls when the tracepoint or function trace is called. The overhead of the tracer may still be present when the tracer is stopped, but no tracing will occur. Setting the tracer to the 'nop' tracer (or any other tracer) will perform the shutdown of the tracer which will disable the tracepoint or disable the function tracer. The tracing_enabled file will simply start or stop tracing. This change is all internal. The end result for the user should be the same as before. If tracing_enabled is not set, no trace will happen. If tracing_enabled is set, then the trace will happen. The tracing_enabled variable is static between tracers. Enabling tracing_enabled and going to another tracer will keep tracing_enabled enabled. Same is true with disabling tracing_enabled. This patch will now provide a fast start/stop method to the users for enabling or disabling tracing. Note: There were two methods to the struct tracer that were never used: The methods start and stop. These were to be used as a hook to the reading of the trace output, but ended up not being necessary. These two methods are now used to enable the start and stop of each tracer, in case the tracer needs to do more than just not write into the buffer. For example, the irqsoff tracer must stop recording max latencies when tracing is stopped. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 113aea9..ff1e9ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -150,6 +150,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; +/** + * tracing_is_enabled - return tracer_enabled status + * + * This function is used by other tracers to know the status + * of the tracer_enabled flag. Tracers may use this function + * to know if it should enable their features when starting + * up. See irqsoff tracer for an example (start_irqsoff_tracer). + */ +int tracing_is_enabled(void) +{ + return tracer_enabled; +} + /* function tracing enabled */ int ftrace_function_enabled; @@ -1041,8 +1054,7 @@ void tracing_start_function_trace(void) trace_ops.func = function_trace_call; register_ftrace_function(&trace_ops); - if (tracer_enabled) - ftrace_function_enabled = 1; + ftrace_function_enabled = 1; } void tracing_stop_function_trace(void) @@ -1189,10 +1201,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) atomic_inc(&trace_record_cmdline_disabled); - /* let the tracer grab locks here if needed */ - if (current_trace->start) - current_trace->start(iter); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -1219,14 +1227,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) static void s_stop(struct seq_file *m, void *p) { - struct trace_iterator *iter = m->private; - atomic_dec(&trace_record_cmdline_disabled); - - /* let the tracer release locks here if needed */ - if (current_trace && current_trace == iter->trace && iter->trace->stop) - iter->trace->stop(iter); - mutex_unlock(&trace_types_lock); } @@ -2056,10 +2057,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) m->private = iter; /* stop the trace while dumping */ - if (iter->tr->ctrl) { - tracer_enabled = 0; - ftrace_function_enabled = 0; - } + tracing_stop(); if (iter->trace && iter->trace->open) iter->trace->open(iter); @@ -2104,14 +2102,7 @@ int tracing_release(struct inode *inode, struct file *file) iter->trace->close(iter); /* reenable tracing if it was previously enabled */ - if (iter->tr->ctrl) { - tracer_enabled = 1; - /* - * It is safe to enable function tracing even if it - * isn't used - */ - ftrace_function_enabled = 1; - } + tracing_start(); mutex_unlock(&trace_types_lock); seq_release(inode, file); @@ -2449,11 +2440,10 @@ static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = filp->private_data; char buf[64]; int r; - r = sprintf(buf, "%ld\n", tr->ctrl); + r = sprintf(buf, "%u\n", tracer_enabled); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2481,16 +2471,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, val = !!val; mutex_lock(&trace_types_lock); - if (tr->ctrl ^ val) { - if (val) + if (tracer_enabled ^ val) { + if (val) { tracer_enabled = 1; - else + if (current_trace->start) + current_trace->start(tr); + tracing_start(); + } else { tracer_enabled = 0; - - tr->ctrl = val; - - if (current_trace && current_trace->ctrl_update) - current_trace->ctrl_update(tr); + tracing_stop(); + if (current_trace->stop) + current_trace->stop(tr); + } } mutex_unlock(&trace_types_lock); @@ -3372,7 +3364,7 @@ __init static int tracer_alloc_buffers(void) #endif /* All seems OK, enable tracing */ - global_trace.ctrl = tracer_enabled; + global_trace.ctrl = 1; tracing_disabled = 0; atomic_notifier_chain_register(&panic_notifier_list, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc14a6b..3422489 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -237,11 +237,11 @@ struct tracer { const char *name; void (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); + void (*start)(struct trace_array *tr); + void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); - void (*start)(struct trace_iterator *iter); - void (*stop)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); @@ -282,6 +282,7 @@ struct trace_iterator { long idx; }; +int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); int tracing_open_generic(struct inode *inode, struct file *filp); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0f85a64..9f1b0de 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -62,11 +62,17 @@ static void function_trace_ctrl_update(struct trace_array *tr) stop_function_trace(tr); } +static void function_trace_start(struct trace_array *tr) +{ + function_reset(tr); +} + static struct tracer function_trace __read_mostly = { .name = "function", .init = function_trace_init, .reset = function_trace_reset, + .start = function_trace_start, .ctrl_update = function_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9c74071..a87a20f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ +/* + * save_tracer_enabled is used to save the state of the tracer_enabled + * variable when we disable it when we open a trace output file. + */ +static int save_tracer_enabled; + static void start_irqsoff_tracer(struct trace_array *tr) { register_ftrace_function(&trace_ops); - tracer_enabled = 1; + if (tracing_is_enabled()) { + tracer_enabled = 1; + save_tracer_enabled = 1; + } else { + tracer_enabled = 0; + save_tracer_enabled = 0; + } } static void stop_irqsoff_tracer(struct trace_array *tr) { tracer_enabled = 0; + save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); } @@ -389,17 +402,29 @@ static void irqsoff_tracer_ctrl_update(struct trace_array *tr) stop_irqsoff_tracer(tr); } +static void irqsoff_tracer_start(struct trace_array *tr) +{ + irqsoff_tracer_reset(tr); + tracer_enabled = 1; + save_tracer_enabled = 1; +} + +static void irqsoff_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; + save_tracer_enabled = 0; +} + static void irqsoff_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_irqsoff_tracer(iter->tr); + tracer_enabled = 0; } static void irqsoff_tracer_close(struct trace_iterator *iter) { - if (iter->tr->ctrl) - start_irqsoff_tracer(iter->tr); + /* restart tracing */ + tracer_enabled = save_tracer_enabled; } #ifdef CONFIG_IRQSOFF_TRACER @@ -414,6 +439,8 @@ static struct tracer irqsoff_tracer __read_mostly = .name = "irqsoff", .init = irqsoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, @@ -440,6 +467,8 @@ static struct tracer preemptoff_tracer __read_mostly = .name = "preemptoff", .init = preemptoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, @@ -468,6 +497,8 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .name = "preemptirqsoff", .init = preemptirqsoff_tracer_init, .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 888944d..91c699b 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -186,11 +186,24 @@ static void sched_switch_trace_ctrl_update(struct trace_array *tr) stop_sched_trace(tr); } +static void sched_switch_trace_start(struct trace_array *tr) +{ + sched_switch_reset(tr); + tracing_start_sched_switch(); +} + +static void sched_switch_trace_stop(struct trace_array *tr) +{ + tracing_stop_sched_switch(); +} + struct tracer sched_switch_trace __read_mostly = { .name = "sched_switch", .init = sched_switch_trace_init, .reset = sched_switch_trace_reset, + .start = sched_switch_trace_start, + .stop = sched_switch_trace_stop, .ctrl_update = sched_switch_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sched_switch, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7bc4abf..240577b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -262,6 +262,12 @@ out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } +/* + * save_tracer_enabled is used to save the state of the tracer_enabled + * variable when we disable it when we open a trace output file. + */ +static int save_tracer_enabled; + static void start_wakeup_tracer(struct trace_array *tr) { int ret; @@ -300,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr) register_ftrace_function(&trace_ops); - tracer_enabled = 1; + if (tracing_is_enabled()) { + tracer_enabled = 1; + save_tracer_enabled = 1; + } else { + tracer_enabled = 0; + save_tracer_enabled = 0; + } return; fail_deprobe_wake_new: @@ -312,6 +324,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; + save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); @@ -343,18 +356,32 @@ static void wakeup_tracer_ctrl_update(struct trace_array *tr) stop_wakeup_tracer(tr); } +static void wakeup_tracer_start(struct trace_array *tr) +{ + wakeup_reset(tr); + tracer_enabled = 1; + save_tracer_enabled = 1; +} + +static void wakeup_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; + save_tracer_enabled = 0; +} + static void wakeup_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ - if (iter->tr->ctrl) - stop_wakeup_tracer(iter->tr); + tracer_enabled = 0; } static void wakeup_tracer_close(struct trace_iterator *iter) { /* forget about any processes we were recording */ - if (iter->tr->ctrl) - start_wakeup_tracer(iter->tr); + if (save_tracer_enabled) { + wakeup_reset(iter->tr); + tracer_enabled = 1; + } } static struct tracer wakeup_tracer __read_mostly = @@ -362,6 +389,8 @@ static struct tracer wakeup_tracer __read_mostly = .name = "wakeup", .init = wakeup_tracer_init, .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, .open = wakeup_tracer_open, .close = wakeup_tracer_close, .ctrl_update = wakeup_tracer_ctrl_update, -- cgit v0.10.2 From 3e03fb7f1da2e691644526c0d6df42d778716349 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 6 Nov 2008 00:09:43 -0500 Subject: ring-buffer: convert to raw spinlocks Impact: no lockdep debugging of ring buffer The problem with running lockdep on the ring buffer is that the ring buffer is the core infrastructure of ftrace. What happens is that the tracer will start tracing the lockdep code while lockdep is testing the ring buffers locks. This can cause lockdep to fail due to testing cases that have not fully finished their locking transition. This patch converts the spin locks used by the ring buffer back into raw spin locks which lockdep does not check. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 151f6a7..a2dea50 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -154,7 +154,7 @@ static inline int test_time_stamp(u64 delta) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t lock; + raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head pages; struct buffer_page *head_page; /* read from head */ @@ -291,7 +291,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->lock); + cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), @@ -854,7 +854,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (write > BUF_PAGE_SIZE) { struct buffer_page *next_page = tail_page; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); rb_inc_page(cpu_buffer, &next_page); @@ -930,7 +931,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_set_commit_to_write(cpu_buffer); } - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -953,7 +955,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; out_unlock: - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return NULL; } @@ -1524,7 +1527,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) struct buffer_page *reader = NULL; unsigned long flags; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); again: reader = cpu_buffer->reader_page; @@ -1574,7 +1578,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return reader; } @@ -1815,9 +1820,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); ring_buffer_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); return iter; } @@ -1903,11 +1910,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpu_isset(cpu, buffer->cpumask)) return; - spin_lock_irqsave(&cpu_buffer->lock, flags); + local_irq_save(flags); + __raw_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); - spin_unlock_irqrestore(&cpu_buffer->lock, flags); + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); } /** -- cgit v0.10.2 From 6a60dd121c5b6c2d827e99b38c1326f2600c3891 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 6 Nov 2008 15:55:21 -0500 Subject: ftrace: split out hardirq ftrace code into own header Impact: moving of function prototypes into own header file ftrace.h is too big of a file for hardirq.h, and some archs will fail to build because of the include dependencies not being met. This patch pulls out the required prototypes for hardirq.h into a smaller and safer ftrace_irq.h file. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0ad1b48..1b340e3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -104,9 +104,6 @@ extern void ftrace_release(void *start, unsigned long size); extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); -extern void ftrace_nmi_enter(void); -extern void ftrace_nmi_exit(void); - #else # define skip_trace(ip) ({ 0; }) # define ftrace_force_update() ({ 0; }) @@ -114,8 +111,6 @@ extern void ftrace_nmi_exit(void); # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) static inline void ftrace_release(void *start, unsigned long size) { } -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h new file mode 100644 index 0000000..b1299d6 --- /dev/null +++ b/include/linux/ftrace_irq.h @@ -0,0 +1,13 @@ +#ifndef _LINUX_FTRACE_IRQ_H +#define _LINUX_FTRACE_IRQ_H + + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_nmi_enter(void); +extern void ftrace_nmi_exit(void); +#else +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } +#endif + +#endif /* _LINUX_FTRACE_IRQ_H */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index ffc16ab..89a56d7 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include -- cgit v0.10.2 From 0183fb1c94b74862b073590fc52c56b7364b7bad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: fix set_ftrace_filter Impact: fix of output of set_ftrace_filter Commit ftrace: do not show freed records in available_filter_functions Removed a bit too much from the set_ftrace_filter code, where we now see all functions in the set_ftrace_filter file even when we set a filter. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 896c71f..4d2e751 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -765,6 +765,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) ((iter->flags & FTRACE_ITER_FAILURES) && !(rec->flags & FTRACE_FL_FAILED)) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))) { rec = NULL; -- cgit v0.10.2 From 75f5c47da386445ba0c5a8b7e3ca0c906e763369 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: fix boot trace sched startup Impact: boot tracer startup modified The boot tracer calls into some of the schedule tracing private functions that should not be exported. This patch cleans it up, and makes way for further changes in the ftrace infrastructure. This patch adds a api to assign a tracer array to the schedule context switch tracer. Signed-off-by: Steven Rostedt Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3422489..db12e16 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -49,7 +49,6 @@ struct ftrace_entry { unsigned long parent_ip; }; extern struct tracer boot_tracer; -extern struct tracer sched_switch_trace; /* Used by the boot tracer */ /* * Context switch trace entry - which task (and prio) we switched from/to: @@ -325,6 +324,7 @@ void trace_function(struct trace_array *tr, void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); +void tracing_cmdline_assign_trace(struct trace_array *tr); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index bd5046c..662cb91 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -39,7 +39,12 @@ void disable_boot_trace(void) static void reset_boot_trace(struct trace_array *tr) { - sched_switch_trace.reset(tr); + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); } static void boot_trace_init(struct trace_array *tr) @@ -50,7 +55,7 @@ static void boot_trace_init(struct trace_array *tr) for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); - sched_switch_trace.init(tr); + tracing_cmdline_assign_trace(tr); } static void boot_trace_ctrl_update(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 91c699b..fbf05df 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -152,6 +152,19 @@ void tracing_stop_cmdline_record(void) tracing_stop_sched_switch(); } +/** + * tracing_cmdline_assign_trace - assign a trace array for ctx switch + * @tr: trace array pointer to assign + * + * Some tracers might want to record the context switches in their + * trace. This function lets those tracers assign the trace array + * to use. + */ +void tracing_cmdline_assign_trace(struct trace_array *tr) +{ + ctx_trace = tr; +} + static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); @@ -197,7 +210,7 @@ static void sched_switch_trace_stop(struct trace_array *tr) tracing_stop_sched_switch(); } -struct tracer sched_switch_trace __read_mostly = +static struct tracer sched_switch_trace __read_mostly = { .name = "sched_switch", .init = sched_switch_trace_init, -- cgit v0.10.2 From e168e0516e476070faa9e8e7b23dfcba79b76d82 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: fix sched_switch API Impact: fix for sched_switch that broke dynamic ftrace startup The commit: tracing/fastboot: use sched switch tracer from boot tracer broke the API of the sched_switch trace. The use of the tracing_start/stop_cmdline record is for only recording the cmdline, NOT recording the schedule switches themselves. Seeing that the boot tracer broke the API to do something that it wanted, this patch adds a new interface for the API while puting back the original interface of the old API. Signed-off-by: Steven Rostedt Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index db12e16..25abfc4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -324,7 +324,9 @@ void trace_function(struct trace_array *tr, void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); -void tracing_cmdline_assign_trace(struct trace_array *tr); +void tracing_sched_switch_assign_trace(struct trace_array *tr); +void tracing_stop_sched_switch_record(void); +void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 662cb91..0203c10 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -28,13 +28,13 @@ void start_boot_trace(void) void enable_boot_trace(void) { if (pre_initcalls_finished) - tracing_start_cmdline_record(); + tracing_start_sched_switch_record(); } void disable_boot_trace(void) { if (pre_initcalls_finished) - tracing_stop_cmdline_record(); + tracing_stop_sched_switch_record(); } static void reset_boot_trace(struct trace_array *tr) @@ -55,7 +55,7 @@ static void boot_trace_init(struct trace_array *tr) for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); - tracing_cmdline_assign_trace(tr); + tracing_sched_switch_assign_trace(tr); } static void boot_trace_ctrl_update(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index fbf05df..79410db 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -125,20 +125,16 @@ static void tracing_sched_unregister(void) static void tracing_start_sched_switch(void) { mutex_lock(&sched_register_mutex); - if (!(sched_ref++)) { - tracer_enabled = 1; + if (!(sched_ref++)) tracing_sched_register(); - } mutex_unlock(&sched_register_mutex); } static void tracing_stop_sched_switch(void) { mutex_lock(&sched_register_mutex); - if (!(--sched_ref)) { + if (!(--sched_ref)) tracing_sched_unregister(); - tracer_enabled = 0; - } mutex_unlock(&sched_register_mutex); } @@ -153,14 +149,48 @@ void tracing_stop_cmdline_record(void) } /** - * tracing_cmdline_assign_trace - assign a trace array for ctx switch + * tracing_start_sched_switch_record - start tracing context switches + * + * Turns on context switch tracing for a tracer. + */ +void tracing_start_sched_switch_record(void) +{ + if (unlikely(!ctx_trace)) { + WARN_ON(1); + return; + } + + tracing_start_sched_switch(); + + mutex_lock(&sched_register_mutex); + tracer_enabled++; + mutex_unlock(&sched_register_mutex); +} + +/** + * tracing_stop_sched_switch_record - start tracing context switches + * + * Turns off context switch tracing for a tracer. + */ +void tracing_stop_sched_switch_record(void) +{ + mutex_lock(&sched_register_mutex); + tracer_enabled--; + WARN_ON(tracer_enabled < 0); + mutex_unlock(&sched_register_mutex); + + tracing_stop_sched_switch(); +} + +/** + * tracing_sched_switch_assign_trace - assign a trace array for ctx switch * @tr: trace array pointer to assign * * Some tracers might want to record the context switches in their * trace. This function lets those tracers assign the trace array * to use. */ -void tracing_cmdline_assign_trace(struct trace_array *tr) +void tracing_sched_switch_assign_trace(struct trace_array *tr) { ctx_trace = tr; } @@ -168,12 +198,12 @@ void tracing_cmdline_assign_trace(struct trace_array *tr) static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); - tracing_start_cmdline_record(); + tracing_start_sched_switch_record(); } static void stop_sched_trace(struct trace_array *tr) { - tracing_stop_cmdline_record(); + tracing_stop_sched_switch_record(); } static void sched_switch_trace_init(struct trace_array *tr) -- cgit v0.10.2 From 451931702017951f74624ddc4f7f02e4641b0e20 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: irqsoff tracer incorrect reset Impact: fix to irqsoff tracer output In converting to the new start / stop ftrace handling, the irqsoff tracer start called the irqsoff reset function. irqsoff tracer is not the same as the other traces, and it resets the buffers while searching for the longest latency. The reset that the irqsoff stop method calls disables the function tracing. That means that, by starting the tracer, the function tracer is disabled incorrectly. This patch simply removes the call to reset which keeps the function tracing enabled. Reset is not needed for the irqsoff stop method. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a87a20f..3509086 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -404,7 +404,6 @@ static void irqsoff_tracer_ctrl_update(struct trace_array *tr) static void irqsoff_tracer_start(struct trace_array *tr) { - irqsoff_tracer_reset(tr); tracer_enabled = 1; save_tracer_enabled = 1; } -- cgit v0.10.2 From 49833fc232bd6a5076496994d855f601354501d7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: enable trace_printk by default Impact: have the ftrace_printk enabled on startup It is confusing to have to "echo trace_printk > /debug/tracing/iter_ctrl" after adding ftrace_printk in the kernel. Currently the trace_printk is set to off by default. ftrace_printk should only be in open kernel code when used for debugging, and thus it should be enabled by default. It may also be used to record data within a tracer, but those ftrace_printks should be within wrappers that are either enabled by trace_points or have a variable protecting the code path from being entered when the tracer is disabled. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d55ccfc..dbbdacf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -205,7 +205,7 @@ static DEFINE_MUTEX(trace_types_lock); static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds iter_ctrl options */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK; /** * trace_wake_up - wake up tasks waiting for trace input -- cgit v0.10.2 From bbf5b1a0cecb56de6236db8b01c5bfb7ab8ba8b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: remove ctrl_update method Impact: Remove the ctrl_update tracer method With the new quick start/stop method of tracing, the ctrl_update method is out of date. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dbbdacf..9e83188 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3357,8 +3357,6 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); #ifdef CONFIG_BOOT_TRACER - /* We don't want to launch sched_switch tracer yet */ - global_trace.ctrl = 0; register_tracer(&boot_tracer); current_trace = &boot_tracer; current_trace->init(&global_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 25abfc4..e481eda 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -244,7 +244,6 @@ struct tracer { ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); - void (*ctrl_update)(struct trace_array *tr); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, struct trace_array *tr); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 0203c10..8f71915 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -58,14 +58,6 @@ static void boot_trace_init(struct trace_array *tr) tracing_sched_switch_assign_trace(tr); } -static void boot_trace_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - enable_boot_trace(); - else - disable_boot_trace(); -} - static enum print_line_t initcall_print_line(struct trace_iterator *iter) { int ret; @@ -102,7 +94,6 @@ struct tracer boot_tracer __read_mostly = .name = "initcall", .init = boot_trace_init, .reset = reset_boot_trace, - .ctrl_update = boot_trace_ctrl_update, .print_line = initcall_print_line, }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 9f1b0de..e980b87 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -54,14 +54,6 @@ static void function_trace_reset(struct trace_array *tr) stop_function_trace(tr); } -static void function_trace_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - start_function_trace(tr); - else - stop_function_trace(tr); -} - static void function_trace_start(struct trace_array *tr) { function_reset(tr); @@ -73,7 +65,6 @@ static struct tracer function_trace __read_mostly = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .ctrl_update = function_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 3509086..ffdf592 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -394,14 +394,6 @@ static void irqsoff_tracer_reset(struct trace_array *tr) stop_irqsoff_tracer(tr); } -static void irqsoff_tracer_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - start_irqsoff_tracer(tr); - else - stop_irqsoff_tracer(tr); -} - static void irqsoff_tracer_start(struct trace_array *tr) { tracer_enabled = 1; @@ -442,7 +434,6 @@ static struct tracer irqsoff_tracer __read_mostly = .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -470,7 +461,6 @@ static struct tracer preemptoff_tracer __read_mostly = .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -500,7 +490,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .stop = irqsoff_tracer_stop, .open = irqsoff_tracer_open, .close = irqsoff_tracer_close, - .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index f284846..fa9354e 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -49,15 +49,10 @@ static void mmio_trace_reset(struct trace_array *tr) mmio_trace_array = NULL; } -static void mmio_trace_ctrl_update(struct trace_array *tr) +static void mmio_trace_start(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } else { - disable_mmiotrace(); - } + mmio_reset_data(tr); } static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) @@ -298,10 +293,10 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, + .start = mmio_trace_start, .pipe_open = mmio_pipe_open, .close = mmio_close, .read = mmio_read, - .ctrl_update = mmio_trace_ctrl_update, .print_line = mmio_print_line, }; diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 4592b48..e3c5fbf 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -42,21 +42,11 @@ static void nop_trace_reset(struct trace_array *tr) stop_nop_trace(tr); } -static void nop_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_nop_trace(tr); - else - stop_nop_trace(tr); -} - struct tracer nop_trace __read_mostly = { .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, - .ctrl_update = nop_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 79410db..7b73fd1 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -220,15 +220,6 @@ static void sched_switch_trace_reset(struct trace_array *tr) stop_sched_trace(tr); } -static void sched_switch_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_sched_trace(tr); - else - stop_sched_trace(tr); -} - static void sched_switch_trace_start(struct trace_array *tr) { sched_switch_reset(tr); @@ -247,7 +238,6 @@ static struct tracer sched_switch_trace __read_mostly = .reset = sched_switch_trace_reset, .start = sched_switch_trace_start, .stop = sched_switch_trace_stop, - .ctrl_update = sched_switch_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sched_switch, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 240577b..23e54d4 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -348,14 +348,6 @@ static void wakeup_tracer_reset(struct trace_array *tr) } } -static void wakeup_tracer_ctrl_update(struct trace_array *tr) -{ - if (tr->ctrl) - start_wakeup_tracer(tr); - else - stop_wakeup_tracer(tr); -} - static void wakeup_tracer_start(struct trace_array *tr) { wakeup_reset(tr); @@ -393,7 +385,6 @@ static struct tracer wakeup_tracer __read_mostly = .stop = wakeup_tracer_stop, .open = wakeup_tracer_open, .close = wakeup_tracer_close, - .ctrl_update = wakeup_tracer_ctrl_update, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 90bc752..7469343 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -134,13 +134,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); ftrace_enabled = 0; /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); /* we should only have one item */ if (!ret && count != 1) { @@ -148,6 +148,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ret = -1; goto out; } + out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; @@ -185,13 +186,13 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); ftrace_enabled = 0; /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -232,13 +233,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) udelay(100); local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) ret = trace_test_buffer(&max_tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -269,13 +270,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) udelay(100); preempt_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) ret = trace_test_buffer(&max_tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -312,27 +313,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); - if (ret) + if (ret) { + tracing_start(); goto out; + } ret = trace_test_buffer(&max_tr, &count); - if (ret) + if (ret) { + tracing_start(); goto out; + } if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; + tracing_start(); goto out; } /* do the test by disabling interrupts first this time */ tracing_max_latency = 0; - tr->ctrl = 1; - trace->ctrl_update(tr); + tracing_start(); preempt_disable(); local_irq_disable(); udelay(100); @@ -341,8 +345,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * local_irq_enable(); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (ret) @@ -358,6 +361,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * out: trace->reset(tr); + tracing_start(); tracing_max_latency = save_max; return ret; @@ -448,8 +452,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); if (!ret) @@ -457,6 +460,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) trace->reset(tr); + tracing_start(); tracing_max_latency = save_max; @@ -485,11 +489,11 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -513,11 +517,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ - tr->ctrl = 0; - trace->ctrl_update(tr); + tracing_stop(); /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); + tracing_start(); return ret; } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 9587d3b..8097430 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -275,21 +275,11 @@ static void stack_trace_reset(struct trace_array *tr) stop_stack_trace(tr); } -static void stack_trace_ctrl_update(struct trace_array *tr) -{ - /* When starting a new trace, reset the buffers */ - if (tr->ctrl) - start_stack_trace(tr); - else - stop_stack_trace(tr); -} - static struct tracer stack_trace __read_mostly = { .name = "sysprof", .init = stack_trace_init, .reset = stack_trace_reset, - .ctrl_update = stack_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sysprof, #endif -- cgit v0.10.2 From c76f06945be50564f925799ddfb6235ee4c26aa0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: remove trace array ctrl Impact: remove obsolete variable in trace_array structure With the new start / stop method of ftrace, the ctrl variable in the trace_array structure is now obsolete. Remove it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e83188..5843541 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -537,7 +537,6 @@ int register_tracer(struct tracer *type) if (type->selftest) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; - int saved_ctrl = tr->ctrl; int i; /* * Run a selftest on this tracer. @@ -550,13 +549,11 @@ int register_tracer(struct tracer *type) tracing_reset(tr, i); } current_trace = type; - tr->ctrl = 0; /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); /* the test is responsible for resetting too */ current_trace = saved_tracer; - tr->ctrl = saved_ctrl; if (ret) { printk(KERN_CONT "FAILED!\n"); goto out; @@ -966,7 +963,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) int cpu; int pc; - if (tracing_disabled || !tr->ctrl) + if (tracing_disabled) return; pc = preempt_count(); @@ -2820,7 +2817,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, unsigned long val; char buf[64]; int ret; - struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2840,12 +2836,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); - if (tr->ctrl) { - cnt = -EBUSY; - pr_info("ftrace: please disable tracing" - " before modifying buffer size\n"); - goto out; - } + tracing_stop(); if (val != global_trace.entries) { ret = ring_buffer_resize(global_trace.buffer, val); @@ -2878,6 +2869,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) cnt = -ENOMEM; out: + tracing_start(); max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); @@ -2900,9 +2892,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, { char *buf; char *end; - struct trace_array *tr = &global_trace; - if (!tr->ctrl || tracing_disabled) + if (tracing_disabled) return -EINVAL; if (cnt > TRACE_BUF_SIZE) @@ -3131,7 +3122,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) unsigned long flags, irq_flags; int cpu, len = 0, size, pc; - if (!tr->ctrl || tracing_disabled) + if (tracing_disabled) return 0; pc = preempt_count(); @@ -3365,7 +3356,6 @@ __init static int tracer_alloc_buffers(void) #endif /* All seems OK, enable tracing */ - global_trace.ctrl = 1; tracing_disabled = 0; atomic_notifier_chain_register(&panic_notifier_list, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e481eda..cfda9d2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -172,7 +172,6 @@ struct trace_iterator; struct trace_array { struct ring_buffer *buffer; unsigned long entries; - long ctrl; int cpu; cycle_t time_start; struct task_struct *waiter; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e980b87..8693b7a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -44,14 +44,12 @@ static void stop_function_trace(struct trace_array *tr) static void function_trace_init(struct trace_array *tr) { - if (tr->ctrl) - start_function_trace(tr); + start_function_trace(tr); } static void function_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_function_trace(tr); + stop_function_trace(tr); } static void function_trace_start(struct trace_array *tr) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ffdf592..d919d4e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -383,15 +383,12 @@ static void __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - - if (tr->ctrl) - start_irqsoff_tracer(tr); + start_irqsoff_tracer(tr); } static void irqsoff_tracer_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_irqsoff_tracer(tr); + stop_irqsoff_tracer(tr); } static void irqsoff_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fa9354e..51bcf37 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -34,17 +34,16 @@ static void mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; - if (tr->ctrl) { - mmio_reset_data(tr); - enable_mmiotrace(); - } + + mmio_reset_data(tr); + enable_mmiotrace(); } static void mmio_trace_reset(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) - disable_mmiotrace(); + + disable_mmiotrace(); mmio_reset_data(tr); mmio_trace_array = NULL; } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index e3c5fbf..2ef1d22 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -32,14 +32,12 @@ static void nop_trace_init(struct trace_array *tr) for_each_online_cpu(cpu) tracing_reset(tr, cpu); - if (tr->ctrl) - start_nop_trace(tr); + start_nop_trace(tr); } static void nop_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_nop_trace(tr); + stop_nop_trace(tr); } struct tracer nop_trace __read_mostly = diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 7b73fd1..be35bdf 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -209,14 +209,12 @@ static void stop_sched_trace(struct trace_array *tr) static void sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; - - if (tr->ctrl) - start_sched_trace(tr); + start_sched_trace(tr); } static void sched_switch_trace_reset(struct trace_array *tr) { - if (tr->ctrl && sched_ref) + if (sched_ref) stop_sched_trace(tr); } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 23e54d4..983f2b1 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -334,18 +334,14 @@ static void stop_wakeup_tracer(struct trace_array *tr) static void wakeup_tracer_init(struct trace_array *tr) { wakeup_trace = tr; - - if (tr->ctrl) - start_wakeup_tracer(tr); + start_wakeup_tracer(tr); } static void wakeup_tracer_reset(struct trace_array *tr) { - if (tr->ctrl) { - stop_wakeup_tracer(tr); - /* make sure we put back any tasks we are tracing */ - wakeup_reset(tr); - } + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); } static void wakeup_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 7469343..ea4e5d3 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -110,7 +110,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ - tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ @@ -181,7 +180,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 1; tracer_enabled = 1; - tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ msleep(100); @@ -224,7 +222,6 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* reset the max latency */ tracing_max_latency = 0; @@ -261,7 +258,6 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* reset the max latency */ tracing_max_latency = 0; @@ -298,7 +294,6 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * int ret; /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* reset the max latency */ @@ -427,7 +422,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wait_for_completion(&isrt); /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* reset the max latency */ tracing_max_latency = 0; @@ -484,7 +478,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr int ret; /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ msleep(100); @@ -512,7 +505,6 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ msleep(100); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 8097430..05f7534 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -265,14 +265,12 @@ static void stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; - if (tr->ctrl) - start_stack_trace(tr); + start_stack_trace(tr); } static void stack_trace_reset(struct trace_array *tr) { - if (tr->ctrl) - stop_stack_trace(tr); + stop_stack_trace(tr); } static struct tracer stack_trace __read_mostly = -- cgit v0.10.2 From 7d5222a6afa4e429f55df8c086adb747837cbdf5 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 7 Nov 2008 13:26:25 +0000 Subject: ftrace: align __mcount_loc sections Impact: add alignment option for recordmcount.pl script Align the __mcount_loc sections so that architectures with strict alignment requirements need not worry about performing unaligned accesses. This fixes an issue where I was seeing unaligned accesses, which are not supported on our architecture (the results of an unaligned access are undefined). Signed-off-by: Matt Fleming Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 6b9fe3e..eeac71c 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -134,6 +134,7 @@ my $section_regex; # Find the start of a section my $function_regex; # Find the name of a function # (return offset and func name) my $mcount_regex; # Find the call site to mcount (return offset) +my $alignment; # The .align value to use for $mcount_section if ($arch eq "x86") { if ($bits == 64) { @@ -148,6 +149,7 @@ if ($arch eq "x86_64") { $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; $type = ".quad"; + $alignment = 8; # force flags for this arch $ld .= " -m elf_x86_64"; @@ -160,6 +162,7 @@ if ($arch eq "x86_64") { $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; $type = ".long"; + $alignment = 4; # force flags for this arch $ld .= " -m elf_i386"; @@ -288,6 +291,7 @@ sub update_funcs open(FILE, ">$mcount_s") || die "can't create $mcount_s\n"; $opened = 1; print FILE "\t.section $mcount_section,\"a\",\@progbits\n"; + print FILE "\t.align $alignment\n"; } printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset; } -- cgit v0.10.2 From 769c48eb2530c5c1a393e2c82063f4f050571d24 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: force pass of preemptoff selftest Impact: preemptoff not tested in selftest Due to the BKL not being preemptable anymore, the selftest of the preemptoff code can not be tested. It requires that it is called with preemption enabled, but since the BKL is held, that is no longer the case. This patch simply skips those tests if it detects that the context is not preemptable. The following will now show up in the tests: Testing tracer preemptoff: can not test ... force PASSED Testing tracer preemptirqsoff: can not test ... force PASSED When the BKL is removed, or it becomes preemptable once again, then the tests will be performed. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index ea4e5d3..0728a10 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -257,6 +257,19 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) unsigned long count; int ret; + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + /* start the tracing */ trace->init(tr); /* reset the max latency */ @@ -293,6 +306,19 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * unsigned long count; int ret; + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + /* start the tracing */ trace->init(tr); -- cgit v0.10.2 From a309720c876d7ad2e224bfd1982c92ae4364c82e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: display start of CPU buffer in trace output Impact: change in trace output Because the trace buffers are per cpu ring buffers, the start of the trace can be confusing. If one CPU is very active at the end of the trace, its history will not go as far back as the other CPU traces. This means that output for a particular CPU may not appear for the first part of a trace. To help annotate what is happening, and to prevent any more confusion, this patch adds a line that annotates the start of a CPU buffer output. For example: automount-3495 [001] 184.596443: dnotify_parent <-vfs_write [...] automount-3495 [001] 184.596449: dput <-path_put automount-3496 [002] 184.596450: down_read_trylock <-do_page_fault [...] sshd-3497 [001] 184.597069: up_read <-do_page_fault -0 [000] 184.597074: __exit_idle <-exit_idle [...] automount-3496 [002] 184.597257: filemap_fault <-__do_fault -0 [003] 184.597261: exit_idle <-smp_apic_timer_interrupt Note, parsers of a trace output should always ignore any lines that start with a '#'. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5843541..f147f19 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1478,6 +1478,17 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) trace_seq_putc(s, '\n'); } +static void test_cpu_buff_start(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + + if (cpu_isset(iter->cpu, iter->started)) + return; + + cpu_set(iter->cpu, iter->started); + trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); +} + static enum print_line_t print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { @@ -1497,6 +1508,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; + test_cpu_buff_start(iter); + next_entry = find_next_entry(iter, NULL, &next_ts); if (!next_entry) next_ts = iter->ts; @@ -1612,6 +1625,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; + test_cpu_buff_start(iter); + comm = trace_find_cmdline(iter->ent->pid); t = ns2usecs(iter->ts); @@ -2631,6 +2646,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return -ENOMEM; mutex_lock(&trace_types_lock); + + /* trace pipe does not show start of buffer */ + cpus_setall(iter->started); + iter->tr = &global_trace; iter->trace = current_trace; filp->private_data = iter; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cfda9d2..9781450 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -277,6 +277,8 @@ struct trace_iterator { unsigned long iter_flags; loff_t pos; long idx; + + cpumask_t started; }; int tracing_is_enabled(void); -- cgit v0.10.2 From 5aa1ba6a6c710e747838a22d798ac97a8b248745 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 23:07:30 -0500 Subject: ftrace: prevent ftrace_special from recursion Impact: stop ftrace_special from recursion The ftrace_special is used to help debug areas of the kernel. Because of this, if it is put in certain locations, the fact that it allows recursion can become a problem if the kernel developer using does not realize that. This patch changes ftrace_special to not allow recursion into itself to make it more robust. It also changes from preempt disable interrupts disable to prevent any loss of trace entries. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0c22fe2..216bbe7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -960,6 +960,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; + unsigned long flags; int cpu; int pc; @@ -967,14 +968,15 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) return; pc = preempt_count(); - preempt_disable_notrace(); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (likely(!atomic_read(&data->disabled))) + if (likely(atomic_inc_return(&data->disabled) == 1)) ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); - preempt_enable_notrace(); + atomic_dec(&data->disabled); + local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_TRACER -- cgit v0.10.2 From f536aafc5a2e6f0c8f1577a155e6f93db5e469f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Nov 2008 23:07:30 -0500 Subject: ring-buffer: replace most bug ons with warn on and disable buffer This patch replaces most of the BUG_ONs in the ring_buffer code with RB_WARN_ON variants. It adds some more variants as needed for the replacement. This lets the buffer die nicely and still warn the user. One BUG_ON remains in the code, and that is because it detects a bad pointer passed in by the calling function, and not a bug by the ring buffer code itself. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ee9b93d..a6b8f9d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -188,6 +188,7 @@ struct ring_buffer_iter { u64 read_stamp; }; +/* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(buffer, cond) \ do { \ if (unlikely(cond)) { \ @@ -201,10 +202,28 @@ struct ring_buffer_iter { if (unlikely(cond)) { \ atomic_inc(&buffer->record_disabled); \ WARN_ON(1); \ + return; \ + } \ + } while (0) + +#define RB_WARN_ON_RET_INT(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ return -1; \ } \ } while (0) +#define RB_WARN_ON_RET_NULL(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return NULL; \ + } \ + } while (0) + #define RB_WARN_ON_ONCE(buffer, cond) \ do { \ static int once; \ @@ -215,6 +234,17 @@ struct ring_buffer_iter { } \ } while (0) +/* buffer must be ring_buffer not per_cpu */ +#define RB_WARN_ON_UNLOCK(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + mutex_unlock(&buffer->mutex); \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return -1; \ + } \ + } while (0) + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -227,13 +257,13 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = &cpu_buffer->pages; struct buffer_page *page, *tmp; - RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); - RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); + RB_WARN_ON_RET_INT(cpu_buffer, head->next->prev != head); + RB_WARN_ON_RET_INT(cpu_buffer, head->prev->next != head); list_for_each_entry_safe(page, tmp, head, list) { - RB_WARN_ON_RET(cpu_buffer, + RB_WARN_ON_RET_INT(cpu_buffer, page->list.next->prev != &page->list); - RB_WARN_ON_RET(cpu_buffer, + RB_WARN_ON_RET_INT(cpu_buffer, page->list.prev->next != &page->list); } @@ -440,13 +470,13 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - BUG_ON(list_empty(&cpu_buffer->pages)); + RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages)); p = cpu_buffer->pages.next; page = list_entry(p, struct buffer_page, list); list_del_init(&page->list); free_buffer_page(page); } - BUG_ON(list_empty(&cpu_buffer->pages)); + RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages)); rb_reset_cpu(cpu_buffer); @@ -468,7 +498,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, synchronize_sched(); for (i = 0; i < nr_pages; i++) { - BUG_ON(list_empty(pages)); + RB_WARN_ON_RET(cpu_buffer, list_empty(pages)); p = pages->next; page = list_entry(p, struct buffer_page, list); list_del_init(&page->list); @@ -523,7 +553,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size < buffer_size) { /* easy case, just free pages */ - BUG_ON(nr_pages >= buffer->pages); + RB_WARN_ON_UNLOCK(buffer, nr_pages >= buffer->pages); rm_pages = buffer->pages - nr_pages; @@ -542,7 +572,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. Otherwise we just free * them all and return -ENOMEM; */ - BUG_ON(nr_pages <= buffer->pages); + RB_WARN_ON_UNLOCK(buffer, nr_pages <= buffer->pages); + new_pages = nr_pages - buffer->pages; for_each_buffer_cpu(buffer, cpu) { @@ -565,7 +596,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - BUG_ON(!list_empty(&pages)); + RB_WARN_ON_UNLOCK(buffer, !list_empty(&pages)); out: buffer->pages = nr_pages; @@ -653,7 +684,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) head += rb_event_length(event)) { event = __rb_page_index(cpu_buffer->head_page, head); - BUG_ON(rb_null_event(event)); + RB_WARN_ON_RET(cpu_buffer, rb_null_event(event)); /* Only count data entries */ if (event->type != RINGBUF_TYPE_DATA) continue; @@ -940,7 +971,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ - BUG_ON(write > BUF_PAGE_SIZE); + RB_WARN_ON_RET_NULL(cpu_buffer, write > BUF_PAGE_SIZE); event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); @@ -1621,7 +1652,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) reader = rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ - BUG_ON(!reader); + RB_WARN_ON_RET(cpu_buffer, !reader); event = rb_reader_event(cpu_buffer); @@ -1648,7 +1679,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - BUG_ON(iter->head_page == cpu_buffer->commit_page); + RB_WARN_ON_RET(buffer, + iter->head_page == cpu_buffer->commit_page); rb_inc_iter(iter); return; } @@ -1661,8 +1693,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * This should not be called to advance the header if we are * at the tail of the buffer. */ - BUG_ON((iter->head_page == cpu_buffer->commit_page) && - (iter->head + length > rb_commit_index(cpu_buffer))); + RB_WARN_ON_RET(cpu_buffer, + (iter->head_page == cpu_buffer->commit_page) && + (iter->head + length > rb_commit_index(cpu_buffer))); rb_update_iter_read_stamp(iter, event); -- cgit v0.10.2 -- cgit v0.10.2 From caf4b323b02a16c92fba449952ac6515ddc76d7a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 07:03:45 +0100 Subject: tracing, x86: add low level support for ftrace return tracing Impact: add infrastructure for function-return tracing Add low level support for ftrace return tracing. This plug-in stores return addresses on the thread_info structure of the current task. The index of the current return address is initialized when the task is the first one (init) and when a process forks (the child). It is not needed when a task does a sys_execve because after this syscall, it still needs to return on the kernel functions it called. Note that the code of return_to_handler has been suggested by Steven Rostedt as almost all of the ideas of improvements in this V3. For purpose of security, arch/x86/kernel/process_32.c is not traced because __switch_to() changes the current task during its execution. That could cause inconsistency in the stored return address of this function even if I didn't have any crash after testing with tracing on this function enabled. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 27b8a3a..ca91e50 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -11,6 +11,7 @@ config 64BIT config X86_32 def_bool !64BIT + select HAVE_FUNCTION_RET_TRACER config X86_64 def_bool 64BIT diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f8173ed..9b6a1fa 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -20,4 +20,30 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_RET_TRACER +#define FTRACE_RET_STACK_SIZE 20 + +#ifndef __ASSEMBLY__ + +/* + * Stack of return addresses for functions + * of a thread. + * Used in struct thread_info + */ +struct ftrace_ret_stack { + unsigned long ret; + unsigned long func; + unsigned long long calltime; +}; + +/* + * Primary handler of a function return. + * It relays on ftrace_return_to_handler. + * Defined in entry32.S + */ +extern void return_to_handler(void); + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_FUNCTION_RET_TRACER */ + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379..a711583 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -20,6 +20,7 @@ struct task_struct; struct exec_domain; #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -38,8 +39,30 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif + +#ifdef CONFIG_FUNCTION_RET_TRACER + /* Index of current stored adress in ret_stack */ + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; +#endif }; +#ifdef CONFIG_FUNCTION_RET_TRACER +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ + .curr_ret_stack = -1,\ +} +#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -52,6 +75,7 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ } +#endif #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e489ff9..1d8ed95 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif +ifdef CONFIG_FUNCTION_RET_TRACER +# Don't trace __switch_to() but let it for function tracer +CFLAGS_REMOVE_process_32.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: @@ -65,6 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9134de8..9a0ac85 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,6 +1188,10 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_RET_TRACER + cmpl $ftrace_stub, ftrace_function_return + jnz trace_return +#endif .globl ftrace_stub ftrace_stub: ret @@ -1206,8 +1210,37 @@ trace: popl %edx popl %ecx popl %eax + jmp ftrace_stub +#ifdef CONFIG_FUNCTION_RET_TRACER +trace_return: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + pushl %eax + lea 0x4(%ebp), %eax + pushl %eax + call prepare_ftrace_return + addl $8, %esp + popl %edx + popl %ecx + popl %eax jmp ftrace_stub + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif /* CONFIG_FUNCTION_RET_TRACER */ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 6914933..d68033b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -18,10 +18,173 @@ #include #include +#include #include +#include -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +#ifdef CONFIG_FUNCTION_RET_TRACER + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} + +/* + * Synchronize accesses to return adresses stack with + * interrupts. + */ +static raw_spinlock_t ret_stack_lock; + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func) +{ + int index; + struct thread_info *ti; + unsigned long flags; + int err = 0; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + /* The return trace stack is full */ + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { + err = -EBUSY; + goto out; + } + + index = ++ti->curr_ret_stack; + ti->ret_stack[index].ret = ret; + ti->ret_stack[index].func = func; + ti->ret_stack[index].calltime = time; + +out: + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); + return err; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(unsigned long *ret, unsigned long long *time, + unsigned long *func) +{ + struct thread_info *ti; + int index; + unsigned long flags; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + index = ti->curr_ret_stack; + *ret = ti->ret_stack[index].ret; + *func = ti->ret_stack[index].func; + *time = ti->ret_stack[index].calltime; + ti->curr_ret_stack--; + + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_retfunc trace; + pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_function_return(&trace); + + return trace.ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +asmlinkage +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (atomic_read(&in_nmi)) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: movl (%[parent_old]), %[old]\n" + "2: movl %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + ".section __ex_table, \"a\"\n" + " .long 1b, 3b\n" + " .long 2b, 3b\n" + ".previous\n" + + : [parent_replaced] "=rm" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (WARN_ON(faulted)) { + unregister_ftrace_return(); + return; + } + + if (WARN_ON(!__kernel_text_address(old))) { + unregister_ftrace_return(); + *parent = old; + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, self_addr) == -EBUSY) + *parent = old; +} + +static int __init init_ftrace_function_return(void) +{ + ret_stack_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + return 0; +} +device_initcall(init_ftrace_function_return); + + +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -31,17 +194,11 @@ union ftrace_code_union { } __attribute__((packed)); }; - static int ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) -{ - return ftrace_nop; -} - unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -183,6 +340,15 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) } + + +static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop; +} + int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) @@ -292,3 +458,4 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } +#endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f5608c..dcbbf72 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -268,6 +268,26 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } /* + * Structure that defines a return function trace. + */ +struct ftrace_retfunc { + unsigned long ret; /* Return address */ + unsigned long func; /* Current function */ + unsigned long long calltime; + unsigned long long rettime; +}; + +#ifdef CONFIG_FUNCTION_RET_TRACER +/* Type of a callback handler of tracing return function */ +typedef void (*trace_function_return_t)(struct ftrace_retfunc *); + +extern void register_ftrace_return(trace_function_return_t func); +/* The current handler in use */ +extern trace_function_return_t ftrace_function_return; +extern void unregister_ftrace_return(void); +#endif + +/* * Structure which defines the trace of an initcall. * You don't have to fill the func field since it is * only used internally by the tracer. diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index b1299d6..0b4df55 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,7 @@ #define _LINUX_FTRACE_IRQ_H -#ifdef CONFIG_DYNAMIC_FTRACE +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 295b7c7..df77abe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2005,6 +2005,17 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; + +#ifdef CONFIG_FUNCTION_RET_TRACER + /* + * When fork() creates a child process, this function is called. + * But the child task may not inherit the return adresses traced + * by the return function tracer because it will directly execute + * in userspace and will not return to kernel functions its parent + * used. + */ + task_thread_info(p)->curr_ret_stack = -1; +#endif } static inline unsigned long *end_of_stack(struct task_struct *p) diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66..af3be57 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,10 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +ifdef CONFIG_FUNCTION_RET_TRACER +CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() +CFLAGS_REMOVE_module.o = -pg # For __module_text_address() +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o -- cgit v0.10.2 From 15e6cb3673ea6277999642802406a764b49391b0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 07:14:25 +0100 Subject: tracing: add a tracer to catch execution time of kernel functions Impact: add new tracing plugin which can trace full (entry+exit) function calls This tracer uses the low level function return ftrace plugin to measure the execution time of the kernel functions. The first field is the caller of the function, the second is the measured function, and the last one is the execution time in nanoseconds. - v3: - HAVE_FUNCTION_RET_TRACER have been added. Each arch that support ftrace return should enable it. - ftrace_return_stub becomes ftrace_stub. - CONFIG_FUNCTION_RET_TRACER depends now on CONFIG_FUNCTION_TRACER - Return traces printing can be used for other tracers on trace.c - Adapt to the new tracing API (no more ctrl_update callback) - Correct the check of "disabled" during insertion. - Minor changes... Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fc4febc..d986216 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool +config HAVE_FUNCTION_RET_TRACER + bool + config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help @@ -54,6 +57,17 @@ config FUNCTION_TRACER (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config FUNCTION_RET_TRACER + bool "Kernel Function return Tracer" + depends on !DYNAMIC_FTRACE + depends on HAVE_FUNCTION_RET_TRACER + depends on FUNCTION_TRACER + help + Enable the kernel to trace a function at its return. + It's first purpose is to trace the duration of functions. + This is done by setting the current return address on the thread + info structure of the current task. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c8228b1..3e1f361 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -24,5 +24,6 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o +obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d2e751..f03fe74 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1484,3 +1484,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, return ret; } +#ifdef CONFIG_FUNCTION_RET_TRACER +trace_function_return_t ftrace_function_return = + (trace_function_return_t)ftrace_stub; +void register_ftrace_return(trace_function_return_t func) +{ + ftrace_function_return = func; +} + +void unregister_ftrace_return(void) +{ + ftrace_function_return = (trace_function_return_t)ftrace_stub; +} +#endif + + + diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 216bbe7..a3f7ae9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -244,13 +244,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } -/* - * TRACE_ITER_SYM_MASK masks the options in trace_flags that - * control the output of kernel symbols. - */ -#define TRACE_ITER_SYM_MASK \ - (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) - /* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { "print-parent", @@ -810,6 +803,35 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } +#ifdef CONFIG_FUNCTION_RET_TRACER +static void __trace_function_return(struct trace_array *tr, + struct trace_array_cpu *data, + struct ftrace_retfunc *trace, + unsigned long flags, + int pc) +{ + struct ring_buffer_event *event; + struct ftrace_ret_entry *entry; + unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_FN_RET; + entry->ip = trace->func; + entry->parent_ip = trace->ret; + entry->rettime = trace->rettime; + entry->calltime = trace->calltime; + ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); +} +#endif + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -1038,6 +1060,29 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) raw_local_irq_restore(flags); } +#ifdef CONFIG_FUNCTION_RET_TRACER +void trace_function_return(struct ftrace_retfunc *trace) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_function_return(tr, data, trace, flags, pc); + } + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} +#endif /* CONFIG_FUNCTION_RET_TRACER */ + static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, @@ -1285,7 +1330,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -static int +int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { int ret; @@ -1738,6 +1783,10 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_seq_print_cont(s, iter); break; } + case TRACE_FN_RET: { + return print_return_function(iter); + break; + } } return TRACE_TYPE_HANDLED; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9781450..e40ce0c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,7 @@ enum trace_type { TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BOOT, + TRACE_FN_RET, __TRACE_LAST_TYPE }; @@ -48,6 +49,15 @@ struct ftrace_entry { unsigned long ip; unsigned long parent_ip; }; + +/* Function return entry */ +struct ftrace_ret_entry { + struct trace_entry ent; + unsigned long ip; + unsigned long parent_ip; + unsigned long long calltime; + unsigned long long rettime; +}; extern struct tracer boot_tracer; /* @@ -218,6 +228,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ + IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET); \ __ftrace_bad_type(); \ } while (0) @@ -321,6 +332,8 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void +trace_function_return(struct ftrace_retfunc *trace); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -393,6 +406,10 @@ extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); extern void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter); + +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); @@ -400,6 +417,17 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; +/* Standard output formatting function used for function return traces */ +#ifdef CONFIG_FUNCTION_RET_TRACER +extern enum print_line_t print_return_function(struct trace_iterator *iter); +#else +static inline enum print_line_t +print_return_function(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} +#endif + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. @@ -422,6 +450,13 @@ enum trace_iterator_flags { TRACE_ITER_PREEMPTONLY = 0x800, }; +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. + */ +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + extern struct tracer nop_trace; /** diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c new file mode 100644 index 0000000..7680b21 --- /dev/null +++ b/kernel/trace/trace_functions_return.c @@ -0,0 +1,82 @@ +/* + * + * Function return tracer. + * Copyright (c) 2008 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * + */ +#include +#include +#include +#include + +#include "trace.h" + + +static void start_return_trace(struct trace_array *tr) +{ + register_ftrace_return(&trace_function_return); +} + +static void stop_return_trace(struct trace_array *tr) +{ + unregister_ftrace_return(); +} + +static void return_trace_init(struct trace_array *tr) +{ + int cpu; + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + start_return_trace(tr); +} + +static void return_trace_reset(struct trace_array *tr) +{ + stop_return_trace(tr); +} + + +enum print_line_t +print_return_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct ftrace_ret_entry *field; + int ret; + + if (entry->type == TRACE_FN_RET) { + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + ret = seq_print_ip_sym(s, field->ip, + trace_flags & TRACE_ITER_SYM_MASK); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " (%llu ns)\n", + field->rettime - field->calltime); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + else + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer return_trace __read_mostly = +{ + .name = "return", + .init = return_trace_init, + .reset = return_trace_reset, + .print_line = print_return_function +}; + +static __init int init_return_trace(void) +{ + return register_tracer(&return_trace); +} + +device_initcall(init_return_trace); -- cgit v0.10.2 From f1c4be5edad3756212cbbbeab39428fe90c27109 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 10:22:36 +0100 Subject: tracing, x86: clean up FUNCTION_RET_TRACER Kconfig Impact: cleanup move FUNCTION_RET_TRACER to the X86 select section, where we have all the other options. Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ca91e50..0de793c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -11,7 +11,6 @@ config 64BIT config X86_32 def_bool !64BIT - select HAVE_FUNCTION_RET_TRACER config X86_64 def_bool 64BIT @@ -30,6 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_RET_TRACER if X86_32 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER -- cgit v0.10.2 From 867f7fb3ebb831970847b179e7df5a9ab10da16d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 11:18:14 +0100 Subject: tracing, x86: function return tracer, fix assembly constraints fix: arch/x86/kernel/ftrace.c: Assembler messages: arch/x86/kernel/ftrace.c:140: Error: missing ')' arch/x86/kernel/ftrace.c:140: Error: junk `(%ebp))' after expression arch/x86/kernel/ftrace.c:141: Error: missing ')' arch/x86/kernel/ftrace.c:141: Error: junk `(%ebp))' after expression the [parent_replaced] is used in an =rm fashion, so that constraint is correct in isolation - but [parent_old] aliases register %0 and uses it in an addressing mode that is only valid with registers - so change the constraint from =rm to =r. This fixes the build failure. Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d68033b..9b2325a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -151,7 +151,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) " .long 2b, 3b\n" ".previous\n" - : [parent_replaced] "=rm" (parent), [old] "=r" (old), + : [parent_replaced] "=r" (parent), [old] "=r" (old), [faulted] "=r" (faulted) : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) : "memory" -- cgit v0.10.2 From 19b3e9671c5a219b8c34da2cc66e0ce7c3a501ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 11:57:02 +0100 Subject: tracing: function return tracer, build fix fix: arch/x86/kernel/ftrace.c: In function 'ftrace_return_to_handler': arch/x86/kernel/ftrace.c:112: error: implicit declaration of function 'cpu_clock' cpu_clock() is implicitly included via a number of ways, but its real location is sched.h. (Build failure is triggerable if enough other kernel components are turned off.) Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9b2325a..16a571d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include -- cgit v0.10.2 From f83c9d0fe42a7544b4d4ffcebb2e6716fcfd95c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Nov 2008 18:47:44 +0100 Subject: ring-buffer: add reader lock Impact: serialize reader accesses to individual CPU ring buffers The code in the ring buffer expects only one reader at a time, but currently it puts that requirement on the caller. This is not strong enough, and this patch adds a "reader_lock" that serializes the access to the reader API of the ring buffer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a6b8f9d..17c2cce 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -154,6 +154,7 @@ static inline int test_time_stamp(u64 delta) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; + spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head pages; @@ -321,6 +322,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; + spin_lock_init(&cpu_buffer->reader_lock); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); @@ -1476,6 +1478,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { @@ -1489,6 +1494,8 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->time_stamp; + + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -1707,17 +1714,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) rb_advance_iter(iter); } -/** - * ring_buffer_peek - peek at the next event to be read - * @buffer: The ring buffer to read - * @cpu: The cpu to peak at - * @ts: The timestamp counter of this event. - * - * This will return the event that will be read next, but does - * not consume the data. - */ -struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +static struct ring_buffer_event * +rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -1779,16 +1777,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) return NULL; } -/** - * ring_buffer_iter_peek - peek at the next event to be read - * @iter: The ring buffer iterator - * @ts: The timestamp counter of this event. - * - * This will return the event that will be read next, but does - * not increment the iterator. - */ -struct ring_buffer_event * -ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +static struct ring_buffer_event * +rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; @@ -1850,6 +1840,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) } /** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not consume the data. + */ +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_buffer_peek(buffer, cpu, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return event; +} + +/** + * ring_buffer_iter_peek - peek at the next event to be read + * @iter: The ring buffer iterator + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not increment the iterator. + */ +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + struct ring_buffer_event *event; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return event; +} + +/** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from * @@ -1860,19 +1895,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event * ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; + unsigned long flags; if (!cpu_isset(cpu, buffer->cpumask)) return NULL; - event = ring_buffer_peek(buffer, cpu, ts); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + event = rb_buffer_peek(buffer, cpu, ts); if (!event) - return NULL; + goto out; - cpu_buffer = buffer->buffers[cpu]; rb_advance_reader(cpu_buffer); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + return event; } @@ -1909,11 +1949,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); - local_irq_save(flags); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); ring_buffer_iter_reset(iter); __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return iter; } @@ -1945,12 +1985,17 @@ struct ring_buffer_event * ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) { struct ring_buffer_event *event; + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; - event = ring_buffer_iter_peek(iter, ts); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); if (!event) - return NULL; + goto out; rb_advance_iter(iter); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -1999,13 +2044,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpu_isset(cpu, buffer->cpumask)) return; - local_irq_save(flags); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + __raw_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** -- cgit v0.10.2 From 3e89c7bb92fc92bb964734341487798c8d497bae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Nov 2008 15:28:41 -0500 Subject: ring-buffer: clean up warn ons Impact: Restructure WARN_ONs in ring_buffer.c The current WARN_ON macros in ring_buffer.c are quite ugly. This patch cleans them up and uses a single RB_WARN_ON that returns the value of the condition. This allows the caller to abort the function if the condition is true. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 17c2cce..8c5cacb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -191,60 +191,14 @@ struct ring_buffer_iter { /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ atomic_inc(&buffer->record_disabled); \ WARN_ON(1); \ } \ - } while (0) - -#define RB_WARN_ON_RET(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return; \ - } \ - } while (0) - -#define RB_WARN_ON_RET_INT(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return -1; \ - } \ - } while (0) - -#define RB_WARN_ON_RET_NULL(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return NULL; \ - } \ - } while (0) - -#define RB_WARN_ON_ONCE(buffer, cond) \ - do { \ - static int once; \ - if (unlikely(cond) && !once) { \ - once++; \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - } \ - } while (0) - -/* buffer must be ring_buffer not per_cpu */ -#define RB_WARN_ON_UNLOCK(buffer, cond) \ - do { \ - if (unlikely(cond)) { \ - mutex_unlock(&buffer->mutex); \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - return -1; \ - } \ - } while (0) + _____ret; \ + }) /** * check_pages - integrity check of buffer pages @@ -258,14 +212,18 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = &cpu_buffer->pages; struct buffer_page *page, *tmp; - RB_WARN_ON_RET_INT(cpu_buffer, head->next->prev != head); - RB_WARN_ON_RET_INT(cpu_buffer, head->prev->next != head); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) + return -1; + if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) + return -1; list_for_each_entry_safe(page, tmp, head, list) { - RB_WARN_ON_RET_INT(cpu_buffer, - page->list.next->prev != &page->list); - RB_WARN_ON_RET_INT(cpu_buffer, - page->list.prev->next != &page->list); + if (RB_WARN_ON(cpu_buffer, + page->list.next->prev != &page->list)) + return -1; + if (RB_WARN_ON(cpu_buffer, + page->list.prev->next != &page->list)) + return -1; } return 0; @@ -472,13 +430,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + return; p = cpu_buffer->pages.next; page = list_entry(p, struct buffer_page, list); list_del_init(&page->list); free_buffer_page(page); } - RB_WARN_ON_RET(cpu_buffer, list_empty(&cpu_buffer->pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + return; rb_reset_cpu(cpu_buffer); @@ -500,7 +460,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, synchronize_sched(); for (i = 0; i < nr_pages; i++) { - RB_WARN_ON_RET(cpu_buffer, list_empty(pages)); + if (RB_WARN_ON(cpu_buffer, list_empty(pages))) + return; p = pages->next; page = list_entry(p, struct buffer_page, list); list_del_init(&page->list); @@ -555,7 +516,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size < buffer_size) { /* easy case, just free pages */ - RB_WARN_ON_UNLOCK(buffer, nr_pages >= buffer->pages); + if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { + mutex_unlock(&buffer->mutex); + return -1; + } rm_pages = buffer->pages - nr_pages; @@ -574,7 +538,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. Otherwise we just free * them all and return -ENOMEM; */ - RB_WARN_ON_UNLOCK(buffer, nr_pages <= buffer->pages); + if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { + mutex_unlock(&buffer->mutex); + return -1; + } new_pages = nr_pages - buffer->pages; @@ -598,7 +565,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - RB_WARN_ON_UNLOCK(buffer, !list_empty(&pages)); + if (RB_WARN_ON(buffer, !list_empty(&pages))) { + mutex_unlock(&buffer->mutex); + return -1; + } out: buffer->pages = nr_pages; @@ -686,7 +656,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) head += rb_event_length(event)) { event = __rb_page_index(cpu_buffer->head_page, head); - RB_WARN_ON_RET(cpu_buffer, rb_null_event(event)); + if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) + return; /* Only count data entries */ if (event->type != RINGBUF_TYPE_DATA) continue; @@ -739,8 +710,9 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, addr &= PAGE_MASK; while (cpu_buffer->commit_page->page != (void *)addr) { - RB_WARN_ON(cpu_buffer, - cpu_buffer->commit_page == cpu_buffer->tail_page); + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->commit_page == cpu_buffer->tail_page)) + return; cpu_buffer->commit_page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); @@ -896,7 +868,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, reader_page = cpu_buffer->reader_page; /* we grabbed the lock before incrementing */ - RB_WARN_ON(cpu_buffer, next_page == reader_page); + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_unlock; /* * If for some reason, we had an interrupt storm that made @@ -973,7 +946,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ - RB_WARN_ON_RET_NULL(cpu_buffer, write > BUF_PAGE_SIZE); + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); @@ -1072,10 +1046,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * storm or we have something buggy. * Bail! */ - if (unlikely(++nr_loops > 1000)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - } ts = ring_buffer_time_stamp(cpu_buffer->cpu); @@ -1591,8 +1563,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * a case where we will loop three times. There should be no * reason to loop four times (that I know of). */ - if (unlikely(++nr_loops > 3)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { reader = NULL; goto out; } @@ -1604,8 +1575,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* Never should we have an index greater than the size */ - RB_WARN_ON(cpu_buffer, - cpu_buffer->reader_page->read > rb_page_size(reader)); + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader))) + goto out; /* check if we caught up to the tail */ reader = NULL; @@ -1659,7 +1631,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) reader = rb_get_reader_page(cpu_buffer); /* This function should not be called when buffer is empty */ - RB_WARN_ON_RET(cpu_buffer, !reader); + if (RB_WARN_ON(cpu_buffer, !reader)) + return; event = rb_reader_event(cpu_buffer); @@ -1686,8 +1659,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - RB_WARN_ON_RET(buffer, - iter->head_page == cpu_buffer->commit_page); + if (RB_WARN_ON(buffer, + iter->head_page == cpu_buffer->commit_page)) + return; rb_inc_iter(iter); return; } @@ -1700,9 +1674,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * This should not be called to advance the header if we are * at the tail of the buffer. */ - RB_WARN_ON_RET(cpu_buffer, + if (RB_WARN_ON(cpu_buffer, (iter->head_page == cpu_buffer->commit_page) && - (iter->head + length > rb_commit_index(cpu_buffer))); + (iter->head + length > rb_commit_index(cpu_buffer)))) + return; rb_update_iter_read_stamp(iter, event); @@ -1736,10 +1711,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * can have. Nesting 10 deep of interrupts is clearly * an anomaly. */ - if (unlikely(++nr_loops > 10)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; - } reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -1800,10 +1773,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) * can have. Nesting 10 deep of interrupts is clearly * an anomaly. */ - if (unlikely(++nr_loops > 10)) { - RB_WARN_ON(cpu_buffer, 1); + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; - } if (rb_per_cpu_empty(cpu_buffer)) return NULL; -- cgit v0.10.2 From 3f5ec13696fd4a33bde42f385406cbb1d3cc96fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 23:21:31 +0100 Subject: tracing/fastboot: move boot tracer structs and funcs into their own header. Impact: Cleanups on the boot tracer and ftrace This patch bring some cleanups about the boot tracer headers. The functions and structures of this tracer have nothing related to ftrace and should have so their own header file. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index dcbbf72..4fbc4a8 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -287,45 +287,4 @@ extern trace_function_return_t ftrace_function_return; extern void unregister_ftrace_return(void); #endif -/* - * Structure which defines the trace of an initcall. - * You don't have to fill the func field since it is - * only used internally by the tracer. - */ -struct boot_trace { - pid_t caller; - char func[KSYM_NAME_LEN]; - int result; - unsigned long long duration; /* usecs */ - ktime_t calltime; - ktime_t rettime; -}; - -#ifdef CONFIG_BOOT_TRACER -/* Append the trace on the ring-buffer */ -extern void trace_boot(struct boot_trace *it, initcall_t fn); - -/* Tells the tracer that smp_pre_initcall is finished. - * So we can start the tracing - */ -extern void start_boot_trace(void); - -/* Resume the tracing of other necessary events - * such as sched switches - */ -extern void enable_boot_trace(void); - -/* Suspend this tracing. Actually, only sched_switches tracing have - * to be suspended. Initcalls doesn't need it.) - */ -extern void disable_boot_trace(void); -#else -static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } -static inline void start_boot_trace(void) { } -static inline void enable_boot_trace(void) { } -static inline void disable_boot_trace(void) { } -#endif - - - #endif /* _LINUX_FTRACE_H */ diff --git a/include/trace/boot.h b/include/trace/boot.h new file mode 100644 index 0000000..4cbe64e --- /dev/null +++ b/include/trace/boot.h @@ -0,0 +1,43 @@ +#ifndef _LINUX_TRACE_BOOT_H +#define _LINUX_TRACE_BOOT_H + +/* + * Structure which defines the trace of an initcall. + * You don't have to fill the func field since it is + * only used internally by the tracer. + */ +struct boot_trace { + pid_t caller; + char func[KSYM_NAME_LEN]; + int result; + unsigned long long duration; /* usecs */ + ktime_t calltime; + ktime_t rettime; +}; + +#ifdef CONFIG_BOOT_TRACER +/* Append the trace on the ring-buffer */ +extern void trace_boot(struct boot_trace *it, initcall_t fn); + +/* Tells the tracer that smp_pre_initcall is finished. + * So we can start the tracing + */ +extern void start_boot_trace(void); + +/* Resume the tracing of other necessary events + * such as sched switches + */ +extern void enable_boot_trace(void); + +/* Suspend this tracing. Actually, only sched_switches tracing have + * to be suspended. Initcalls doesn't need it.) + */ +extern void disable_boot_trace(void); +#else +static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } +static inline void start_boot_trace(void) { } +static inline void enable_boot_trace(void) { } +static inline void disable_boot_trace(void) { } +#endif /* CONFIG_BOOT_TRACER */ + +#endif /* __LINUX_TRACE_BOOT_H */ diff --git a/init/main.c b/init/main.c index 4b03cd5..16ca1ee 100644 --- a/init/main.c +++ b/init/main.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e40ce0c..f69a519 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -8,6 +8,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, -- cgit v0.10.2 From 74239072830ef3f1398edeb1bc1076fc330fd4a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 23:24:42 +0100 Subject: tracing/fastboot: Use the ring-buffer timestamp for initcall entries Impact: Split the boot tracer entries in two parts: call and return Now that we are using the sched tracer from the boot tracer, we want to use the same timestamp than the ring-buffer to have consistent time captures between sched events and initcall events. So we get rid of the old time capture by the boot tracer and split the initcall events in two parts: call and return. This way we have the ring buffer timestamp of both. An example trace: [ 27.904149584] calling net_ns_init+0x0/0x1c0 @ 1 [ 27.904429624] initcall net_ns_init+0x0/0x1c0 returned 0 after 0 msecs [ 27.904575926] calling reboot_init+0x0/0x20 @ 1 [ 27.904655399] initcall reboot_init+0x0/0x20 returned 0 after 0 msecs [ 27.904800228] calling sysctl_init+0x0/0x30 @ 1 [ 27.905142914] initcall sysctl_init+0x0/0x30 returned 0 after 0 msecs [ 27.905287211] calling ksysfs_init+0x0/0xb0 @ 1 ##### CPU 0 buffer started #### init-1 [000] 27.905395: 1:120:R + [001] 11:115:S ##### CPU 1 buffer started #### -0 [001] 27.905425: 0:140:R ==> [001] 11:115:R init-1 [000] 27.905426: 1:120:D ==> [000] 0:140:R -0 [000] 27.905431: 0:140:R + [000] 4:115:S -0 [000] 27.905451: 0:140:R ==> [000] 4:115:R ksoftirqd/0-4 [000] 27.905456: 4:115:S ==> [000] 0:140:R udevd-11 [001] 27.905458: 11:115:R + [001] 14:115:R -0 [000] 27.905459: 0:140:R + [000] 4:115:S -0 [000] 27.905462: 0:140:R ==> [000] 4:115:R udevd-11 [001] 27.905462: 11:115:R ==> [001] 14:115:R ksoftirqd/0-4 [000] 27.905467: 4:115:S ==> [000] 0:140:R -0 [000] 27.905470: 0:140:R + [000] 4:115:S -0 [000] 27.905473: 0:140:R ==> [000] 4:115:R ksoftirqd/0-4 [000] 27.905476: 4:115:S ==> [000] 0:140:R -0 [000] 27.905479: 0:140:R + [000] 4:115:S -0 [000] 27.905482: 0:140:R ==> [000] 4:115:R ksoftirqd/0-4 [000] 27.905486: 4:115:S ==> [000] 0:140:R udevd-14 [001] 27.905499: 14:120:X ==> [001] 11:115:R udevd-11 [001] 27.905506: 11:115:R + [000] 1:120:D -0 [000] 27.905515: 0:140:R ==> [000] 1:120:R udevd-11 [001] 27.905517: 11:115:S ==> [001] 0:140:R [ 27.905557107] initcall ksysfs_init+0x0/0xb0 returned 0 after 3906 msecs [ 27.905705736] calling init_jiffies_clocksource+0x0/0x10 @ 1 [ 27.905779239] initcall init_jiffies_clocksource+0x0/0x10 returned 0 after 0 msecs [ 27.906769814] calling pm_init+0x0/0x30 @ 1 [ 27.906853627] initcall pm_init+0x0/0x30 returned 0 after 0 msecs [ 27.906997803] calling pm_disk_init+0x0/0x20 @ 1 [ 27.907076946] initcall pm_disk_init+0x0/0x20 returned 0 after 0 msecs [ 27.907222556] calling swsusp_header_init+0x0/0x30 @ 1 [ 27.907294325] initcall swsusp_header_init+0x0/0x30 returned 0 after 0 msecs [ 27.907439620] calling stop_machine_init+0x0/0x50 @ 1 init-1 [000] 27.907485: 1:120:R + [000] 2:115:S init-1 [000] 27.907490: 1:120:D ==> [000] 2:115:R kthreadd-2 [000] 27.907507: 2:115:R + [001] 15:115:R -0 [001] 27.907517: 0:140:R ==> [001] 15:115:R kthreadd-2 [000] 27.907517: 2:115:D ==> [000] 0:140:R -0 [000] 27.907521: 0:140:R + [000] 4:115:S -0 [000] 27.907524: 0:140:R ==> [000] 4:115:R udevd-15 [001] 27.907527: 15:115:D + [000] 2:115:D ksoftirqd/0-4 [000] 27.907537: 4:115:S ==> [000] 2:115:R udevd-15 [001] 27.907537: 15:115:D ==> [001] 0:140:R kthreadd-2 [000] 27.907546: 2:115:R + [000] 1:120:D kthreadd-2 [000] 27.907550: 2:115:S ==> [000] 1:120:R init-1 [000] 27.907584: 1:120:R + [000] 15: 0:D init-1 [000] 27.907589: 1:120:R + [000] 2:115:S init-1 [000] 27.907593: 1:120:D ==> [000] 15: 0:R udevd-15 [000] 27.907601: 15: 0:S ==> [000] 2:115:R ##### CPU 0 buffer started #### kthreadd-2 [000] 27.907616: 2:115:R + [001] 16:115:R ##### CPU 1 buffer started #### -0 [001] 27.907620: 0:140:R ==> [001] 16:115:R kthreadd-2 [000] 27.907621: 2:115:D ==> [000] 0:140:R udevd-16 [001] 27.907625: 16:115:D + [000] 2:115:D -0 [000] 27.907628: 0:140:R + [000] 4:115:S udevd-16 [001] 27.907629: 16:115:D ==> [001] 0:140:R -0 [000] 27.907631: 0:140:R ==> [000] 4:115:R ksoftirqd/0-4 [000] 27.907636: 4:115:S ==> [000] 2:115:R kthreadd-2 [000] 27.907644: 2:115:R + [000] 1:120:D kthreadd-2 [000] 27.907647: 2:115:S ==> [000] 1:120:R init-1 [000] 27.907657: 1:120:R + [001] 16: 0:D -0 [001] 27.907666: 0:140:R ==> [001] 16: 0:R [ 27.907703862] initcall stop_machine_init+0x0/0x50 returned 0 after 0 msecs [ 27.907850704] calling filelock_init+0x0/0x30 @ 1 [ 27.907926573] initcall filelock_init+0x0/0x30 returned 0 after 0 msecs [ 27.908071327] calling init_script_binfmt+0x0/0x10 @ 1 [ 27.908165195] initcall init_script_binfmt+0x0/0x10 returned 0 after 0 msecs [ 27.908309461] calling init_elf_binfmt+0x0/0x10 @ 1 Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/trace/boot.h b/include/trace/boot.h index 4cbe64e..6b54537 100644 --- a/include/trace/boot.h +++ b/include/trace/boot.h @@ -2,22 +2,30 @@ #define _LINUX_TRACE_BOOT_H /* - * Structure which defines the trace of an initcall. + * Structure which defines the trace of an initcall + * while it is called. * You don't have to fill the func field since it is * only used internally by the tracer. */ -struct boot_trace { +struct boot_trace_call { pid_t caller; char func[KSYM_NAME_LEN]; - int result; - unsigned long long duration; /* usecs */ - ktime_t calltime; - ktime_t rettime; +}; + +/* + * Structure which defines the trace of an initcall + * while it returns. + */ +struct boot_trace_ret { + char func[KSYM_NAME_LEN]; + int result; + unsigned long long duration; /* nsecs */ }; #ifdef CONFIG_BOOT_TRACER -/* Append the trace on the ring-buffer */ -extern void trace_boot(struct boot_trace *it, initcall_t fn); +/* Append the traces on the ring-buffer */ +extern void trace_boot_call(struct boot_trace_call *bt, initcall_t fn); +extern void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn); /* Tells the tracer that smp_pre_initcall is finished. * So we can start the tracing @@ -34,7 +42,12 @@ extern void enable_boot_trace(void); */ extern void disable_boot_trace(void); #else -static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } +static inline +void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { } + +static inline +void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { } + static inline void start_boot_trace(void) { } static inline void enable_boot_trace(void) { } static inline void disable_boot_trace(void) { } diff --git a/init/main.c b/init/main.c index 16ca1ee..e810196 100644 --- a/init/main.c +++ b/init/main.c @@ -704,33 +704,35 @@ core_param(initcall_debug, initcall_debug, bool, 0644); int do_one_initcall(initcall_t fn) { int count = preempt_count(); - ktime_t delta; + ktime_t calltime, delta, rettime; char msgbuf[64]; - struct boot_trace it; + struct boot_trace_call call; + struct boot_trace_ret ret; if (initcall_debug) { - it.caller = task_pid_nr(current); - printk("calling %pF @ %i\n", fn, it.caller); - it.calltime = ktime_get(); + call.caller = task_pid_nr(current); + printk("calling %pF @ %i\n", fn, call.caller); + calltime = ktime_get(); + trace_boot_call(&call, fn); enable_boot_trace(); } - it.result = fn(); + ret.result = fn(); if (initcall_debug) { - it.rettime = ktime_get(); - delta = ktime_sub(it.rettime, it.calltime); - it.duration = (unsigned long long) delta.tv64 >> 10; - printk("initcall %pF returned %d after %Ld usecs\n", fn, - it.result, it.duration); - trace_boot(&it, fn); disable_boot_trace(); + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + ret.duration = (unsigned long long) delta.tv64 >> 10; + trace_boot_ret(&ret, fn); + printk("initcall %pF returned %d after %Ld usecs\n", fn, + ret.result, ret.duration); } msgbuf[0] = 0; - if (it.result && it.result != -ENODEV && initcall_debug) - sprintf(msgbuf, "error code %d ", it.result); + if (ret.result && ret.result != -ENODEV && initcall_debug) + sprintf(msgbuf, "error code %d ", ret.result); if (preempt_count() != count) { strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); @@ -744,7 +746,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned with %s\n", fn, msgbuf); } - return it.result; + return ret.result; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f69a519..b5f91f1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,7 +22,8 @@ enum trace_type { TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, - TRACE_BOOT, + TRACE_BOOT_CALL, + TRACE_BOOT_RET, TRACE_FN_RET, __TRACE_LAST_TYPE @@ -123,9 +124,14 @@ struct trace_mmiotrace_map { struct mmiotrace_map map; }; -struct trace_boot { +struct trace_boot_call { struct trace_entry ent; - struct boot_trace initcall; + struct boot_trace_call boot_call; +}; + +struct trace_boot_ret { + struct trace_entry ent; + struct boot_trace_ret boot_ret; }; /* @@ -228,8 +234,9 @@ extern void __ftrace_bad_type(void); TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ - IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET); \ + IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ + IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ + IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 8f71915..cb333b7 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -58,35 +58,71 @@ static void boot_trace_init(struct trace_array *tr) tracing_sched_switch_assign_trace(tr); } -static enum print_line_t initcall_print_line(struct trace_iterator *iter) +static enum print_line_t +initcall_call_print_line(struct trace_iterator *iter) { + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct trace_boot_call *field; + struct boot_trace_call *call; + u64 ts; + unsigned long nsec_rem; int ret; + + trace_assign_type(field, entry); + call = &field->boot_call; + ts = iter->ts; + nsec_rem = do_div(ts, 1000000000); + + ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", + (unsigned long)ts, nsec_rem, call->func, call->caller); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + else + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +initcall_ret_print_line(struct trace_iterator *iter) +{ struct trace_entry *entry = iter->ent; - struct trace_boot *field = (struct trace_boot *)entry; - struct boot_trace *it = &field->initcall; struct trace_seq *s = &iter->seq; - struct timespec calltime = ktime_to_timespec(it->calltime); - struct timespec rettime = ktime_to_timespec(it->rettime); - - if (entry->type == TRACE_BOOT) { - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - calltime.tv_sec, - calltime.tv_nsec, - it->func, it->caller); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %lld msecs\n", - rettime.tv_sec, - rettime.tv_nsec, - it->func, it->result, it->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + struct trace_boot_ret *field; + struct boot_trace_ret *init_ret; + u64 ts; + unsigned long nsec_rem; + int ret; + + trace_assign_type(field, entry); + init_ret = &field->boot_ret; + ts = iter->ts; + nsec_rem = do_div(ts, 1000000000); + + ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " + "returned %d after %llu msecs\n", + (unsigned long) ts, + nsec_rem, + init_ret->func, init_ret->result, init_ret->duration); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + else return TRACE_TYPE_HANDLED; +} + +static enum print_line_t initcall_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_BOOT_CALL: + return initcall_call_print_line(iter); + case TRACE_BOOT_RET: + return initcall_ret_print_line(iter); + default: + return TRACE_TYPE_UNHANDLED; } - return TRACE_TYPE_UNHANDLED; } struct tracer boot_tracer __read_mostly = @@ -97,11 +133,10 @@ struct tracer boot_tracer __read_mostly = .print_line = initcall_print_line, }; -void trace_boot(struct boot_trace *it, initcall_t fn) +void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { struct ring_buffer_event *event; - struct trace_boot *entry; - struct trace_array_cpu *data; + struct trace_boot_call *entry; unsigned long irq_flags; struct trace_array *tr = boot_trace; @@ -111,9 +146,37 @@ void trace_boot(struct boot_trace *it, initcall_t fn) /* Get its name now since this function could * disappear because it is in the .init section. */ - sprint_symbol(it->func, (unsigned long)fn); + sprint_symbol(bt->func, (unsigned long)fn); + preempt_disable(); + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_BOOT_CALL; + entry->boot_call = *bt; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} + +void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) +{ + struct ring_buffer_event *event; + struct trace_boot_ret *entry; + unsigned long irq_flags; + struct trace_array *tr = boot_trace; + + if (!pre_initcalls_finished) + return; + + sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - data = tr->data[smp_processor_id()]; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); @@ -121,8 +184,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_BOOT; - entry->initcall = *it; + entry->ent.type = TRACE_BOOT_RET; + entry->boot_ret = *bt; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); trace_wake_up(); -- cgit v0.10.2 From 642edba5f5c545772b89907cf96134c73d6073c7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 00:01:26 -0500 Subject: ring-buffer: fix deadlock from reader_lock in read_start Impact: deadlock fix in ring_buffer_read_start The ring_buffer_iter_reset was called from ring_buffer_read_start where both grabbed the reader_lock. This patch separates out the internals of ring_buffer_iter_reset to its own function so that both APIs may grab the reader_lock. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c04c433..86dc353 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1475,19 +1475,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) return overruns; } -/** - * ring_buffer_iter_reset - reset an iterator - * @iter: The iterator to reset - * - * Resets the iterator, so that it will start from the beginning - * again. - */ -void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - unsigned long flags; - - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { @@ -1501,7 +1491,22 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->time_stamp; +} +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. + */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_iter_reset(iter); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } @@ -1957,7 +1962,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); - ring_buffer_iter_reset(iter); + rb_iter_reset(iter); __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v0.10.2 From 1f0d69a9fc815db82f15722bf05227190b1d714d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 00:14:39 -0500 Subject: tracing: profile likely and unlikely annotations Impact: new unlikely/likely profiler Andrew Morton recently suggested having an in-kernel way to profile likely and unlikely macros. This patch achieves that goal. When configured, every(*) likely and unlikely macro gets a counter attached to it. When the condition is hit, the hit and misses of that condition are recorded. These numbers can later be retrieved by: /debugfs/tracing/profile_likely - All likely markers /debugfs/tracing/profile_unlikely - All unlikely markers. # cat /debug/tracing/profile_unlikely | head correct incorrect % Function File Line ------- --------- - -------- ---- ---- 2167 0 0 do_arch_prctl process_64.c 832 0 0 0 do_arch_prctl process_64.c 804 2670 0 0 IS_ERR err.h 34 71230 5693 7 __switch_to process_64.c 673 76919 0 0 __switch_to process_64.c 639 43184 33743 43 __switch_to process_64.c 624 12740 64181 83 __switch_to process_64.c 594 12740 64174 83 __switch_to process_64.c 590 # cat /debug/tracing/profile_unlikely | \ awk '{ if ($3 > 25) print $0; }' |head -20 44963 35259 43 __switch_to process_64.c 624 12762 67454 84 __switch_to process_64.c 594 12762 67447 84 __switch_to process_64.c 590 1478 595 28 syscall_get_error syscall.h 51 0 2821 100 syscall_trace_leave ptrace.c 1567 0 1 100 native_smp_prepare_cpus smpboot.c 1237 86338 265881 75 calc_delta_fair sched_fair.c 408 210410 108540 34 calc_delta_mine sched.c 1267 0 54550 100 sched_info_queued sched_stats.h 222 51899 66435 56 pick_next_task_fair sched_fair.c 1422 6 10 62 yield_task_fair sched_fair.c 982 7325 2692 26 rt_policy sched.c 144 0 1270 100 pre_schedule_rt sched_rt.c 1261 1268 48073 97 pick_next_task_rt sched_rt.c 884 0 45181 100 sched_info_dequeued sched_stats.h 177 0 15 100 sched_move_task sched.c 8700 0 15 100 sched_move_task sched.c 8690 53167 33217 38 schedule sched.c 4457 0 80208 100 sched_info_switch sched_stats.h 270 30585 49631 61 context_switch sched.c 2619 # cat /debug/tracing/profile_likely | awk '{ if ($3 > 25) print $0; }' 39900 36577 47 pick_next_task sched.c 4397 20824 15233 42 switch_mm mmu_context_64.h 18 0 7 100 __cancel_work_timer workqueue.c 560 617 66484 99 clocksource_adjust timekeeping.c 456 0 346340 100 audit_syscall_exit auditsc.c 1570 38 347350 99 audit_get_context auditsc.c 732 0 345244 100 audit_syscall_entry auditsc.c 1541 38 1017 96 audit_free auditsc.c 1446 0 1090 100 audit_alloc auditsc.c 862 2618 1090 29 audit_alloc auditsc.c 858 0 6 100 move_masked_irq migration.c 9 1 198 99 probe_sched_wakeup trace_sched_switch.c 58 2 2 50 probe_wakeup trace_sched_wakeup.c 227 0 2 100 probe_wakeup_sched_switch trace_sched_wakeup.c 144 4514 2090 31 __grab_cache_page filemap.c 2149 12882 228786 94 mapping_unevictable pagemap.h 50 4 11 73 __flush_cpu_slab slub.c 1466 627757 330451 34 slab_free slub.c 1731 2959 61245 95 dentry_lru_del_init dcache.c 153 946 1217 56 load_elf_binary binfmt_elf.c 904 102 82 44 disk_put_part genhd.h 206 1 1 50 dst_gc_task dst.c 82 0 19 100 tcp_mss_split_point tcp_output.c 1126 As you can see by the above, there's a bit of work to do in rethinking the use of some unlikelys and likelys. Note: the unlikely case had 71 hits that were more than 25%. Note: After submitting my first version of this patch, Andrew Morton showed me a version written by Daniel Walker, where I picked up the following ideas from: 1) Using __builtin_constant_p to avoid profiling fixed values. 2) Using __FILE__ instead of instruction pointers. 3) Using the preprocessor to stop all profiling of likely annotations from vsyscall_64.c. Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar for their feed back on this patch. (*) Not ever unlikely is recorded, those that are used by vsyscalls (a few of them) had to have profiling disabled. Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: Frederic Weisbecker Cc: Theodore Tso Cc: Arjan van de Ven Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0b8b6690..2f90202 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -17,6 +17,14 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +/* Protect userspace from profiling */ +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +# undef likely +# undef unlikely +# define likely(x) likely_notrace(x) +# define unlikely(x) unlikely_notrace(x) +#endif + #include #include #include diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 80744606..e10beb5 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -45,6 +45,17 @@ #define MCOUNT_REC() #endif +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \ + *(_ftrace_likely) \ + VMLINUX_SYMBOL(__stop_likely_profile) = .; \ + VMLINUX_SYMBOL(__start_unlikely_profile) = .; \ + *(_ftrace_unlikely) \ + VMLINUX_SYMBOL(__stop_unlikely_profile) = .; +#else +#define LIKELY_PROFILE() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -62,7 +73,8 @@ VMLINUX_SYMBOL(__stop___markers) = .; \ VMLINUX_SYMBOL(__start___tracepoints) = .; \ *(__tracepoints) \ - VMLINUX_SYMBOL(__stop___tracepoints) = .; + VMLINUX_SYMBOL(__stop___tracepoints) = .; \ + LIKELY_PROFILE() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 98115d9..935e30c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -59,8 +59,65 @@ extern void __chk_io_ptr(const volatile void __iomem *); * specific implementations come from the above header files */ -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) +#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +struct ftrace_likely_data { + const char *func; + const char *file; + unsigned line; + unsigned long correct; + unsigned long incorrect; +}; +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect); + +#define likely_notrace(x) __builtin_expect(!!(x), 1) +#define unlikely_notrace(x) __builtin_expect(!!(x), 0) + +#define likely_check(x) ({ \ + int ______r; \ + static struct ftrace_likely_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_likely"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______f.line = __LINE__; \ + ______r = likely_notrace(x); \ + ftrace_likely_update(&______f, ______r, 1); \ + ______r; \ + }) +#define unlikely_check(x) ({ \ + int ______r; \ + static struct ftrace_likely_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_unlikely"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______f.line = __LINE__; \ + ______r = unlikely_notrace(x); \ + ftrace_likely_update(&______f, ______r, 0); \ + ______r; \ + }) + +/* + * Using __builtin_constant_p(x) to ignore cases where the return + * value is always the same. This idea is taken from a similar patch + * written by Daniel Walker. + */ +# ifndef likely +# define likely(x) (__builtin_constant_p(x) ? !!(x) : likely_check(x)) +# endif +# ifndef unlikely +# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : unlikely_check(x)) +# endif +#else +# define likely(x) __builtin_expect(!!(x), 1) +# define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* Optimization barrier */ #ifndef barrier diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d986216..a604f24 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -159,6 +159,22 @@ config BOOT_TRACER selected, because the self-tests are an initcall as well and that would invalidate the boot trace. ) +config TRACE_UNLIKELY_PROFILE + bool "Trace likely/unlikely profiler" + depends on DEBUG_KERNEL + select TRACING + help + This tracer profiles all the the likely and unlikely macros + in the kernel. It will display the results in: + + /debugfs/tracing/profile_likely + /debugfs/tracing/profile_unlikely + + Note: this will add a significant overhead, only turn this + on if you need to profile the system's use of these macros. + + Say N if unsure. + config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 3e1f361..98e70ee 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -25,5 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o +obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c new file mode 100644 index 0000000..9493269 --- /dev/null +++ b/kernel/trace/trace_unlikely.c @@ -0,0 +1,164 @@ +/* + * unlikely profiler + * + * Copyright (C) 2008 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect) +{ + /* FIXME: Make this atomic! */ + if (val == expect) + f->correct++; + else + f->incorrect++; +} +EXPORT_SYMBOL(ftrace_likely_update); + +struct ftrace_pointer { + void *start; + void *stop; +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_pointer *f = m->private; + struct ftrace_likely_data *p = v; + + (*pos)++; + + if (v == (void *)1) + return f->start; + + ++p; + + if ((void *)p >= (void *)f->stop) + return NULL; + + return p; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + void *t = (void *)1; + loff_t l = 0; + + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_likely_data *p = v; + const char *f; + unsigned long percent; + + if (v == (void *)1) { + seq_printf(m, " correct incorrect %% " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; + } + + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') + f--; + f++; + + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 100 : 0; + + seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); + return 0; +} + +static struct seq_operations tracing_likely_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int tracing_likely_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &tracing_likely_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = (void *)inode->i_private; + } + + return ret; +} + +static struct file_operations tracing_likely_fops = { + .open = tracing_likely_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +extern unsigned long __start_likely_profile[]; +extern unsigned long __stop_likely_profile[]; +extern unsigned long __start_unlikely_profile[]; +extern unsigned long __stop_unlikely_profile[]; + +static struct ftrace_pointer ftrace_likely_pos = { + .start = __start_likely_profile, + .stop = __stop_likely_profile, +}; + +static struct ftrace_pointer ftrace_unlikely_pos = { + .start = __start_unlikely_profile, + .stop = __stop_unlikely_profile, +}; + +static __init int ftrace_unlikely_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("profile_likely", 0444, d_tracer, + &ftrace_likely_pos, + &tracing_likely_fops); + if (!entry) + pr_warning("Could not create debugfs 'profile_likely' entry\n"); + + entry = debugfs_create_file("profile_unlikely", 0444, d_tracer, + &ftrace_unlikely_pos, + &tracing_likely_fops); + if (!entry) + pr_warning("Could not create debugfs" + " 'profile_unlikely' entry\n"); + + return 0; +} + +device_initcall(ftrace_unlikely_init); -- cgit v0.10.2 From 52f232cb720a7babb752849cbc2cab2d24021209 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 00:14:40 -0500 Subject: tracing: likely/unlikely branch annotation tracer Impact: new likely/unlikely branch tracer This patch adds a way to record the instances of the likely() and unlikely() branch condition annotations. When "unlikely" is set in /debugfs/tracing/iter_ctrl the unlikely conditions will be added to any of the ftrace tracers. The change takes effect when a new tracer is passed into the current_tracer file. For example: bash-3471 [003] 357.014755: [INCORRECT] sched_info_dequeued:sched_stats.h:177 bash-3471 [003] 357.014756: [correct] update_curr:sched_fair.c:489 bash-3471 [003] 357.014758: [correct] calc_delta_fair:sched_fair.c:411 bash-3471 [003] 357.014759: [correct] account_group_exec_runtime:sched_stats.h:356 bash-3471 [003] 357.014761: [correct] update_curr:sched_fair.c:489 bash-3471 [003] 357.014763: [INCORRECT] calc_delta_fair:sched_fair.c:411 bash-3471 [003] 357.014765: [correct] calc_delta_mine:sched.c:1279 Which shows the normal tracer heading, as well as whether the condition was correct "[correct]" or was mistaken "[INCORRECT]", followed by the function, file name and line number. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a604f24..8abcaf8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -175,6 +175,28 @@ config TRACE_UNLIKELY_PROFILE Say N if unsure. +config TRACING_UNLIKELY + bool + help + Selected by tracers that will trace the likely and unlikely + conditions. This prevents the tracers themselves from being + profiled. Profiling the tracing infrastructure can only happen + when the likelys and unlikelys are not being traced. + +config UNLIKELY_TRACER + bool "Trace likely/unlikely instances" + depends on TRACE_UNLIKELY_PROFILE + select TRACING_UNLIKELY + help + This traces the events of likely and unlikely condition + calls in the kernel. The difference between this and the + "Trace likely/unlikely profiler" is that this is not a + histogram of the callers, but actually places the calling + events into a running trace buffer to see when and where the + events happened, as well as their results. + + Say N if unsure. + config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98e70ee..c938d03 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -10,6 +10,12 @@ CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif +# If unlikely tracing is enabled, do not trace these files +ifdef CONFIG_TRACING_UNLIKELY +KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)' +KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)' +endif + obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3f7ae9..83d3863 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -258,6 +258,9 @@ static const char *trace_options[] = { "sched-tree", "ftrace_printk", "ftrace_preempt", +#ifdef CONFIG_UNLIKELY_TRACER + "unlikely", +#endif NULL }; @@ -1648,6 +1651,18 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_seq_print_cont(s, iter); break; } + case TRACE_UNLIKELY: { + struct trace_unlikely *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? "correct" : "INCORRECT", + field->func, + field->file, + field->line); + break; + } default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1787,6 +1802,18 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) return print_return_function(iter); break; } + case TRACE_UNLIKELY: { + struct trace_unlikely *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? "correct" : "INCORRECT", + field->func, + field->file, + field->line); + break; + } } return TRACE_TYPE_HANDLED; } @@ -2592,6 +2619,7 @@ static int tracing_set_tracer(char *buf) if (t == current_trace) goto out; + trace_unlikely_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); @@ -2599,6 +2627,7 @@ static int tracing_set_tracer(char *buf) if (t->init) t->init(tr); + trace_unlikely_enable(tr); out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b5f91f1..9635aa2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,7 @@ enum trace_type { TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, + TRACE_UNLIKELY, TRACE_BOOT_CALL, TRACE_BOOT_RET, TRACE_FN_RET, @@ -134,6 +135,16 @@ struct trace_boot_ret { struct boot_trace_ret boot_ret; }; +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 +struct trace_unlikely { + struct trace_entry ent; + unsigned line; + char func[TRACE_FUNC_SIZE+1]; + char file[TRACE_FILE_SIZE+1]; + char correct; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -236,6 +247,7 @@ extern void __ftrace_bad_type(void); TRACE_MMIO_MAP); \ IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ + IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \ IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ __ftrace_bad_type(); \ } while (0) @@ -456,6 +468,9 @@ enum trace_iterator_flags { TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, TRACE_ITER_PREEMPTONLY = 0x800, +#ifdef CONFIG_UNLIKELY_TRACER + TRACE_ITER_UNLIKELY = 0x1000, +#endif }; /* @@ -515,4 +530,28 @@ static inline void ftrace_preempt_enable(int resched) preempt_enable_notrace(); } +#ifdef CONFIG_UNLIKELY_TRACER +extern int enable_unlikely_tracing(struct trace_array *tr); +extern void disable_unlikely_tracing(void); +static inline int trace_unlikely_enable(struct trace_array *tr) +{ + if (trace_flags & TRACE_ITER_UNLIKELY) + return enable_unlikely_tracing(tr); + return 0; +} +static inline void trace_unlikely_disable(void) +{ + /* due to races, always disable */ + disable_unlikely_tracing(); +} +#else +static inline int trace_unlikely_enable(struct trace_array *tr) +{ + return 0; +} +static inline void trace_unlikely_disable(void) +{ +} +#endif /* CONFIG_UNLIKELY_TRACER */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c index 9493269..7290e0e 100644 --- a/kernel/trace/trace_unlikely.c +++ b/kernel/trace/trace_unlikely.c @@ -15,8 +15,122 @@ #include #include "trace.h" +#ifdef CONFIG_UNLIKELY_TRACER + +static int unlikely_tracing_enabled __read_mostly; +static DEFINE_MUTEX(unlikely_tracing_mutex); +static struct trace_array *unlikely_tracer; + +static void +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) +{ + struct trace_array *tr = unlikely_tracer; + struct ring_buffer_event *event; + struct trace_unlikely *entry; + unsigned long flags, irq_flags; + int cpu, pc; + const char *p; + + /* + * I would love to save just the ftrace_likely_data pointer, but + * this code can also be used by modules. Ugly things can happen + * if the module is unloaded, and then we go and read the + * pointer. This is slower, but much safer. + */ + + if (unlikely(!tr)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + + pc = preempt_count(); + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_UNLIKELY; + + /* Strip off the path, only save the file */ + p = f->file + strlen(f->file); + while (p >= f->file && *p != '/') + p--; + p++; + + strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->file, p, TRACE_FILE_SIZE); + entry->func[TRACE_FUNC_SIZE] = 0; + entry->file[TRACE_FILE_SIZE] = 0; + entry->line = f->line; + entry->correct = val == expect; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(flags); +} + +static inline +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +{ + if (!unlikely_tracing_enabled) + return; + + probe_likely_condition(f, val, expect); +} + +int enable_unlikely_tracing(struct trace_array *tr) +{ + int ret = 0; + + mutex_lock(&unlikely_tracing_mutex); + unlikely_tracer = tr; + /* + * Must be seen before enabling. The reader is a condition + * where we do not need a matching rmb() + */ + smp_wmb(); + unlikely_tracing_enabled++; + mutex_unlock(&unlikely_tracing_mutex); + + return ret; +} + +void disable_unlikely_tracing(void) +{ + mutex_lock(&unlikely_tracing_mutex); + + if (!unlikely_tracing_enabled) + goto out_unlock; + + unlikely_tracing_enabled--; + + out_unlock: + mutex_unlock(&unlikely_tracing_mutex); +} +#else +static inline +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +{ +} +#endif /* CONFIG_UNLIKELY_TRACER */ + void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect) { + /* + * I would love to have a trace point here instead, but the + * trace point code is so inundated with unlikely and likely + * conditions that the recursive nightmare that exists is too + * much to try to get working. At least for now. + */ + trace_likely_condition(f, val, expect); + /* FIXME: Make this atomic! */ if (val == expect) f->correct++; -- cgit v0.10.2 From f88c4ae9f8c3939bee4337c75c7a673b5de7a8a7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 11:55:41 +0100 Subject: tracing: branch tracer, tweak output Impact: modify the tracer output, to make it a bit easier to read Change the output from: > bash-3471 [003] 357.014755: [INCORRECT] sched_info_dequeued:sched_stats.h:177 > bash-3471 [003] 357.014756: [correct] update_curr:sched_fair.c:489 > bash-3471 [003] 357.014758: [correct] calc_delta_fair:sched_fair.c:411 to: > bash-3471 [003] 357.014755: [ MISS ] sched_info_dequeued:sched_stats.h:177 > bash-3471 [003] 357.014756: [ .... ] update_curr:sched_fair.c:489 > bash-3471 [003] 357.014758: [ .... ] calc_delta_fair:sched_fair.c:411 it's good to have fields aligned vertically, and the only important information is a prediction miss, so display only that information. Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 83d3863..728a46e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1657,7 +1657,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? "correct" : "INCORRECT", + field->correct ? " .... " : " MISS ", field->func, field->file, field->line); @@ -1808,7 +1808,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? "correct" : "INCORRECT", + field->correct ? " .... " : " MISS ", field->func, field->file, field->line); -- cgit v0.10.2 From 2b7d0390a6d6d595f43ea3806639664afe5b9ebe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 13:17:38 +0100 Subject: tracing: branch tracer, fix vdso crash Impact: fix bootup crash the branch tracer missed arch/x86/vdso/vclock_gettime.c from disabling tracing, which caused such bootup crashes: [ 201.840097] init[1]: segfault at 7fffed3fe7c0 ip 00007fffed3fea2e sp 000077 also clean up the ugly ifdefs in arch/x86/kernel/vsyscall_64.c by creating DISABLE_UNLIKELY_PROFILE facility for code to turn off instrumentation on a per file basis. Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 2f90202..ece0293 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -17,13 +17,8 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ -/* Protect userspace from profiling */ -#ifdef CONFIG_TRACE_UNLIKELY_PROFILE -# undef likely -# undef unlikely -# define likely(x) likely_notrace(x) -# define unlikely(x) unlikely_notrace(x) -#endif +/* Disable profiling for userspace code: */ +#define DISABLE_UNLIKELY_PROFILE #include #include diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 1ef0f90..6e66763 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -9,6 +9,9 @@ * Also alternative() doesn't work. */ +/* Disable profiling for userspace code: */ +#define DISABLE_UNLIKELY_PROFILE + #include #include #include diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 935e30c..63b7d90 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -59,7 +59,11 @@ extern void __chk_io_ptr(const volatile void __iomem *); * specific implementations come from the above header files */ -#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +/* + * Note: DISABLE_UNLIKELY_PROFILE can be used by special lowlevel code + * to disable branch tracing on a per file basis. + */ +#if defined(CONFIG_TRACE_UNLIKELY_PROFILE) && !defined(DISABLE_UNLIKELY_PROFILE) struct ftrace_likely_data { const char *func; const char *file; -- cgit v0.10.2 From 68d119f0a66f7e3663304343b072e56a2693446b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 14:09:30 +0100 Subject: tracing: finetune branch-tracer output Steve suggested the to change the output from this: > bash-3471 [003] 357.014755: [ MISS ] sched_info_dequeued:sched_stats.h:177 > bash-3471 [003] 357.014756: [ .... ] update_curr:sched_fair.c:489 > bash-3471 [003] 357.014758: [ .... ] calc_delta_fair:sched_fair.c:411 to this: > bash-3471 [003] 357.014755: [ MISS ] sched_info_dequeued:sched_stats.h:177 > bash-3471 [003] 357.014756: [ ok ] update_curr:sched_fair.c:489 > bash-3471 [003] 357.014758: [ ok ] calc_delta_fair:sched_fair.c:411 as it makes it clearer to the user what it means exactly. Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 728a46e..d842db1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1657,7 +1657,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? " .... " : " MISS ", + field->correct ? " ok " : " MISS ", field->func, field->file, field->line); @@ -1808,7 +1808,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? " .... " : " MISS ", + field->correct ? " ok " : " MISS ", field->func, field->file, field->line); -- cgit v0.10.2 From 68f96c0c889b55bf62eee98e859cb686f8850188 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 12 Nov 2008 10:21:01 -0800 Subject: tracing/fastboot: fix perlcritic warning Impact: cleanup Fix the following warning from the perl syntax checking tool perlcritic. This tool is a lint like tool that checks for perl best practices. Loop iterator is not lexical at line 113, column 1. See page 108 of PBP. (Severity: 5) Signed-off-by: Stephen Hemminger Signed-off-by: Ingo Molnar diff --git a/scripts/bootgraph.pl b/scripts/bootgraph.pl index d2c61ef..a8635a1 100644 --- a/scripts/bootgraph.pl +++ b/scripts/bootgraph.pl @@ -109,8 +109,8 @@ my $stylecounter = 0; my %rows; my $rowscount = 1; my @initcalls = sort { $start{$a} <=> $start{$b} } keys(%start); -my $key; -foreach $key (@initcalls) { + +foreach my $key (@initcalls) { my $duration = $end{$key} - $start{$key}; if ($duration >= $threshold) { -- cgit v0.10.2 From 2ed84eeb8808cf3c9f039213ca137ffd7d753f0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 15:24:24 -0500 Subject: trace: rename unlikely profiler to branch profiler Impact: name change of unlikely tracer and profiler Ingo Molnar suggested changing the config from UNLIKELY_PROFILE to BRANCH_PROFILING. I never did like the "unlikely" name so I went one step farther, and renamed all the unlikely configurations to a "BRANCH" variant. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ece0293..6f3d3d4 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,7 +18,7 @@ */ /* Disable profiling for userspace code: */ -#define DISABLE_UNLIKELY_PROFILE +#define DISABLE_BRANCH_PROFILING #include #include diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6e66763..d9d3582 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -10,7 +10,7 @@ */ /* Disable profiling for userspace code: */ -#define DISABLE_UNLIKELY_PROFILE +#define DISABLE_BRANCH_PROFILING #include #include diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index e10beb5..a5e4ed9 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -45,7 +45,7 @@ #define MCOUNT_REC() #endif -#ifdef CONFIG_TRACE_UNLIKELY_PROFILE +#ifdef CONFIG_TRACE_BRANCH_PROFILING #define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \ *(_ftrace_likely) \ VMLINUX_SYMBOL(__stop_likely_profile) = .; \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 63b7d90..c7d804a 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -59,26 +59,27 @@ extern void __chk_io_ptr(const volatile void __iomem *); * specific implementations come from the above header files */ -/* - * Note: DISABLE_UNLIKELY_PROFILE can be used by special lowlevel code - * to disable branch tracing on a per file basis. - */ -#if defined(CONFIG_TRACE_UNLIKELY_PROFILE) && !defined(DISABLE_UNLIKELY_PROFILE) -struct ftrace_likely_data { +struct ftrace_branch_data { const char *func; const char *file; unsigned line; unsigned long correct; unsigned long incorrect; }; -void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect); + +/* + * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code + * to disable branch tracing on a per file basis. + */ +#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING) +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #define likely_notrace(x) __builtin_expect(!!(x), 1) #define unlikely_notrace(x) __builtin_expect(!!(x), 0) #define likely_check(x) ({ \ int ______r; \ - static struct ftrace_likely_data \ + static struct ftrace_branch_data \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_likely"))) \ ______f = { \ @@ -93,7 +94,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect); }) #define unlikely_check(x) ({ \ int ______r; \ - static struct ftrace_likely_data \ + static struct ftrace_branch_data \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_unlikely"))) \ ______f = { \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8abcaf8..9c89526 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -159,7 +159,7 @@ config BOOT_TRACER selected, because the self-tests are an initcall as well and that would invalidate the boot trace. ) -config TRACE_UNLIKELY_PROFILE +config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" depends on DEBUG_KERNEL select TRACING @@ -175,7 +175,7 @@ config TRACE_UNLIKELY_PROFILE Say N if unsure. -config TRACING_UNLIKELY +config TRACING_BRANCHES bool help Selected by tracers that will trace the likely and unlikely @@ -183,10 +183,10 @@ config TRACING_UNLIKELY profiled. Profiling the tracing infrastructure can only happen when the likelys and unlikelys are not being traced. -config UNLIKELY_TRACER +config BRANCH_TRACER bool "Trace likely/unlikely instances" - depends on TRACE_UNLIKELY_PROFILE - select TRACING_UNLIKELY + depends on TRACE_BRANCH_PROFILING + select TRACING_BRANCHES help This traces the events of likely and unlikely condition calls in the kernel. The difference between this and the diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c938d03..0087df7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -11,9 +11,8 @@ obj-y += trace_selftest_dynamic.o endif # If unlikely tracing is enabled, do not trace these files -ifdef CONFIG_TRACING_UNLIKELY -KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)' -KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)' +ifdef CONFIG_TRACING_BRANCHES +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o @@ -31,6 +30,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o -obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o +obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_unlikely.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d842db1..bad59d3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -258,7 +258,7 @@ static const char *trace_options[] = { "sched-tree", "ftrace_printk", "ftrace_preempt", -#ifdef CONFIG_UNLIKELY_TRACER +#ifdef CONFIG_BRANCH_TRACER "unlikely", #endif NULL diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9635aa2..dccae63 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -468,7 +468,7 @@ enum trace_iterator_flags { TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, TRACE_ITER_PREEMPTONLY = 0x800, -#ifdef CONFIG_UNLIKELY_TRACER +#ifdef CONFIG_BRANCH_TRACER TRACE_ITER_UNLIKELY = 0x1000, #endif }; @@ -530,7 +530,7 @@ static inline void ftrace_preempt_enable(int resched) preempt_enable_notrace(); } -#ifdef CONFIG_UNLIKELY_TRACER +#ifdef CONFIG_BRANCH_TRACER extern int enable_unlikely_tracing(struct trace_array *tr); extern void disable_unlikely_tracing(void); static inline int trace_unlikely_enable(struct trace_array *tr) @@ -552,6 +552,6 @@ static inline int trace_unlikely_enable(struct trace_array *tr) static inline void trace_unlikely_disable(void) { } -#endif /* CONFIG_UNLIKELY_TRACER */ +#endif /* CONFIG_BRANCH_TRACER */ #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c index 7290e0e..856eb3b 100644 --- a/kernel/trace/trace_unlikely.c +++ b/kernel/trace/trace_unlikely.c @@ -15,7 +15,7 @@ #include #include "trace.h" -#ifdef CONFIG_UNLIKELY_TRACER +#ifdef CONFIG_BRANCH_TRACER static int unlikely_tracing_enabled __read_mostly; static DEFINE_MUTEX(unlikely_tracing_mutex); @@ -119,7 +119,7 @@ static inline void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { } -#endif /* CONFIG_UNLIKELY_TRACER */ +#endif /* CONFIG_BRANCH_TRACER */ void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect) { -- cgit v0.10.2 From 9f029e83e968e5661d7be045bbcb620dbb909938 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 15:24:24 -0500 Subject: ftrace: rename unlikely iter_ctrl to branch Impact: rename of iter_ctrl unlikely to branch The unlikely name is ugly. This patch converts the iter_ctrl command "unlikely" and "nounlikely" to "branch" and "nobranch" respectively. It also renames a lot of internal functions to use "branch" instead of "unlikely". Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bad59d3..4bf070b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -259,7 +259,7 @@ static const char *trace_options[] = { "ftrace_printk", "ftrace_preempt", #ifdef CONFIG_BRANCH_TRACER - "unlikely", + "branch", #endif NULL }; @@ -1651,8 +1651,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_seq_print_cont(s, iter); break; } - case TRACE_UNLIKELY: { - struct trace_unlikely *field; + case TRACE_BRANCH: { + struct trace_branch *field; trace_assign_type(field, entry); @@ -1802,8 +1802,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) return print_return_function(iter); break; } - case TRACE_UNLIKELY: { - struct trace_unlikely *field; + case TRACE_BRANCH: { + struct trace_branch *field; trace_assign_type(field, entry); @@ -2619,7 +2619,7 @@ static int tracing_set_tracer(char *buf) if (t == current_trace) goto out; - trace_unlikely_disable(); + trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); @@ -2627,7 +2627,7 @@ static int tracing_set_tracer(char *buf) if (t->init) t->init(tr); - trace_unlikely_enable(tr); + trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dccae63..7fbf37b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,7 +22,7 @@ enum trace_type { TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, - TRACE_UNLIKELY, + TRACE_BRANCH, TRACE_BOOT_CALL, TRACE_BOOT_RET, TRACE_FN_RET, @@ -137,7 +137,7 @@ struct trace_boot_ret { #define TRACE_FUNC_SIZE 30 #define TRACE_FILE_SIZE 20 -struct trace_unlikely { +struct trace_branch { struct trace_entry ent; unsigned line; char func[TRACE_FUNC_SIZE+1]; @@ -247,7 +247,7 @@ extern void __ftrace_bad_type(void); TRACE_MMIO_MAP); \ IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ - IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \ + IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ __ftrace_bad_type(); \ } while (0) @@ -469,7 +469,7 @@ enum trace_iterator_flags { TRACE_ITER_PRINTK = 0x400, TRACE_ITER_PREEMPTONLY = 0x800, #ifdef CONFIG_BRANCH_TRACER - TRACE_ITER_UNLIKELY = 0x1000, + TRACE_ITER_BRANCH = 0x1000, #endif }; @@ -531,25 +531,25 @@ static inline void ftrace_preempt_enable(int resched) } #ifdef CONFIG_BRANCH_TRACER -extern int enable_unlikely_tracing(struct trace_array *tr); -extern void disable_unlikely_tracing(void); -static inline int trace_unlikely_enable(struct trace_array *tr) +extern int enable_branch_tracing(struct trace_array *tr); +extern void disable_branch_tracing(void); +static inline int trace_branch_enable(struct trace_array *tr) { - if (trace_flags & TRACE_ITER_UNLIKELY) - return enable_unlikely_tracing(tr); + if (trace_flags & TRACE_ITER_BRANCH) + return enable_branch_tracing(tr); return 0; } -static inline void trace_unlikely_disable(void) +static inline void trace_branch_disable(void) { /* due to races, always disable */ - disable_unlikely_tracing(); + disable_branch_tracing(); } #else -static inline int trace_unlikely_enable(struct trace_array *tr) +static inline int trace_branch_enable(struct trace_array *tr) { return 0; } -static inline void trace_unlikely_disable(void) +static inline void trace_branch_disable(void) { } #endif /* CONFIG_BRANCH_TRACER */ diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c index 856eb3b..e5d5969 100644 --- a/kernel/trace/trace_unlikely.c +++ b/kernel/trace/trace_unlikely.c @@ -17,16 +17,16 @@ #ifdef CONFIG_BRANCH_TRACER -static int unlikely_tracing_enabled __read_mostly; -static DEFINE_MUTEX(unlikely_tracing_mutex); -static struct trace_array *unlikely_tracer; +static int branch_tracing_enabled __read_mostly; +static DEFINE_MUTEX(branch_tracing_mutex); +static struct trace_array *branch_tracer; static void -probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) +probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { - struct trace_array *tr = unlikely_tracer; + struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; - struct trace_unlikely *entry; + struct trace_branch *entry; unsigned long flags, irq_flags; int cpu, pc; const char *p; @@ -54,7 +54,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) pc = preempt_count(); entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_UNLIKELY; + entry->ent.type = TRACE_BRANCH; /* Strip off the path, only save the file */ p = f->file + strlen(f->file); @@ -77,51 +77,51 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) } static inline -void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) { - if (!unlikely_tracing_enabled) + if (!branch_tracing_enabled) return; probe_likely_condition(f, val, expect); } -int enable_unlikely_tracing(struct trace_array *tr) +int enable_branch_tracing(struct trace_array *tr) { int ret = 0; - mutex_lock(&unlikely_tracing_mutex); - unlikely_tracer = tr; + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; /* * Must be seen before enabling. The reader is a condition * where we do not need a matching rmb() */ smp_wmb(); - unlikely_tracing_enabled++; - mutex_unlock(&unlikely_tracing_mutex); + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); return ret; } -void disable_unlikely_tracing(void) +void disable_branch_tracing(void) { - mutex_lock(&unlikely_tracing_mutex); + mutex_lock(&branch_tracing_mutex); - if (!unlikely_tracing_enabled) + if (!branch_tracing_enabled) goto out_unlock; - unlikely_tracing_enabled--; + branch_tracing_enabled--; out_unlock: - mutex_unlock(&unlikely_tracing_mutex); + mutex_unlock(&branch_tracing_mutex); } #else static inline -void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) { } #endif /* CONFIG_BRANCH_TRACER */ -void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect) +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) { /* * I would love to have a trace point here instead, but the @@ -148,7 +148,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_pointer *f = m->private; - struct ftrace_likely_data *p = v; + struct ftrace_branch_data *p = v; (*pos)++; @@ -180,7 +180,7 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { - struct ftrace_likely_data *p = v; + struct ftrace_branch_data *p = v; const char *f; unsigned long percent; @@ -252,7 +252,7 @@ static struct ftrace_pointer ftrace_unlikely_pos = { .stop = __stop_unlikely_profile, }; -static __init int ftrace_unlikely_init(void) +static __init int ftrace_branch_init(void) { struct dentry *d_tracer; struct dentry *entry; @@ -275,4 +275,4 @@ static __init int ftrace_unlikely_init(void) return 0; } -device_initcall(ftrace_unlikely_init); +device_initcall(ftrace_branch_init); -- cgit v0.10.2 From 80e5ea4506791af206266c5921c97f11d3b17866 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 15:24:24 -0500 Subject: ftrace: add tracer called branch Impact: added new branch tracer Currently the tracing of branch profiling (unlikelys and likelys hit) is only activated by the iter_ctrl. This patch adds a tracer called "branch" that will just trace the branch profiling. The advantage of adding this tracer is that it can be added to the ftrace selftests on startup. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7fbf37b..9e015f5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -420,6 +420,8 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_branch(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0728a10..24e6e07 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_STACK: case TRACE_PRINT: case TRACE_SPECIAL: + case TRACE_BRANCH: return 1; } return 0; @@ -544,3 +545,25 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) return ret; } #endif /* CONFIG_SYSPROF_TRACER */ + +#ifdef CONFIG_BRANCH_TRACER +int +trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + return ret; +} +#endif /* CONFIG_BRANCH_TRACER */ diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c index e5d5969..8526555 100644 --- a/kernel/trace/trace_unlikely.c +++ b/kernel/trace/trace_unlikely.c @@ -114,6 +114,48 @@ void disable_branch_tracing(void) out_unlock: mutex_unlock(&branch_tracing_mutex); } + +static void start_branch_trace(struct trace_array *tr) +{ + enable_branch_tracing(tr); +} + +static void stop_branch_trace(struct trace_array *tr) +{ + disable_branch_tracing(); +} + +static void branch_trace_init(struct trace_array *tr) +{ + int cpu; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + start_branch_trace(tr); +} + +static void branch_trace_reset(struct trace_array *tr) +{ + stop_branch_trace(tr); +} + +struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif +}; + +__init static int init_branch_trace(void) +{ + return register_tracer(&branch_trace); +} + +device_initcall(init_branch_trace); #else static inline void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) -- cgit v0.10.2 From 94b80ffd650b22e1fd493ccf6bad7efda4b8ea85 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 16:18:45 -0500 Subject: ftrace: rename trace_unlikely.c file Impact: File name change of trace_unlikely.c The "unlikely" name for the tracer is quite ugly. We renamed all the parts of it to "branch" and now it is time to rename the file too. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0087df7..1a8c925 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -30,6 +30,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o -obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_unlikely.o +obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c new file mode 100644 index 0000000..8526555 --- /dev/null +++ b/kernel/trace/trace_branch.c @@ -0,0 +1,320 @@ +/* + * unlikely profiler + * + * Copyright (C) 2008 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +#ifdef CONFIG_BRANCH_TRACER + +static int branch_tracing_enabled __read_mostly; +static DEFINE_MUTEX(branch_tracing_mutex); +static struct trace_array *branch_tracer; + +static void +probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + struct trace_array *tr = branch_tracer; + struct ring_buffer_event *event; + struct trace_branch *entry; + unsigned long flags, irq_flags; + int cpu, pc; + const char *p; + + /* + * I would love to save just the ftrace_likely_data pointer, but + * this code can also be used by modules. Ugly things can happen + * if the module is unloaded, and then we go and read the + * pointer. This is slower, but much safer. + */ + + if (unlikely(!tr)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + + pc = preempt_count(); + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_BRANCH; + + /* Strip off the path, only save the file */ + p = f->file + strlen(f->file); + while (p >= f->file && *p != '/') + p--; + p++; + + strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->file, p, TRACE_FILE_SIZE); + entry->func[TRACE_FUNC_SIZE] = 0; + entry->file[TRACE_FILE_SIZE] = 0; + entry->line = f->line; + entry->correct = val == expect; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(flags); +} + +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + if (!branch_tracing_enabled) + return; + + probe_likely_condition(f, val, expect); +} + +int enable_branch_tracing(struct trace_array *tr) +{ + int ret = 0; + + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; + /* + * Must be seen before enabling. The reader is a condition + * where we do not need a matching rmb() + */ + smp_wmb(); + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); + + return ret; +} + +void disable_branch_tracing(void) +{ + mutex_lock(&branch_tracing_mutex); + + if (!branch_tracing_enabled) + goto out_unlock; + + branch_tracing_enabled--; + + out_unlock: + mutex_unlock(&branch_tracing_mutex); +} + +static void start_branch_trace(struct trace_array *tr) +{ + enable_branch_tracing(tr); +} + +static void stop_branch_trace(struct trace_array *tr) +{ + disable_branch_tracing(); +} + +static void branch_trace_init(struct trace_array *tr) +{ + int cpu; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + start_branch_trace(tr); +} + +static void branch_trace_reset(struct trace_array *tr) +{ + stop_branch_trace(tr); +} + +struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif +}; + +__init static int init_branch_trace(void) +{ + return register_tracer(&branch_trace); +} + +device_initcall(init_branch_trace); +#else +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +{ + /* + * I would love to have a trace point here instead, but the + * trace point code is so inundated with unlikely and likely + * conditions that the recursive nightmare that exists is too + * much to try to get working. At least for now. + */ + trace_likely_condition(f, val, expect); + + /* FIXME: Make this atomic! */ + if (val == expect) + f->correct++; + else + f->incorrect++; +} +EXPORT_SYMBOL(ftrace_likely_update); + +struct ftrace_pointer { + void *start; + void *stop; +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_pointer *f = m->private; + struct ftrace_branch_data *p = v; + + (*pos)++; + + if (v == (void *)1) + return f->start; + + ++p; + + if ((void *)p >= (void *)f->stop) + return NULL; + + return p; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + void *t = (void *)1; + loff_t l = 0; + + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + unsigned long percent; + + if (v == (void *)1) { + seq_printf(m, " correct incorrect %% " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; + } + + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') + f--; + f++; + + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 100 : 0; + + seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); + return 0; +} + +static struct seq_operations tracing_likely_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int tracing_likely_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &tracing_likely_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = (void *)inode->i_private; + } + + return ret; +} + +static struct file_operations tracing_likely_fops = { + .open = tracing_likely_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +extern unsigned long __start_likely_profile[]; +extern unsigned long __stop_likely_profile[]; +extern unsigned long __start_unlikely_profile[]; +extern unsigned long __stop_unlikely_profile[]; + +static struct ftrace_pointer ftrace_likely_pos = { + .start = __start_likely_profile, + .stop = __stop_likely_profile, +}; + +static struct ftrace_pointer ftrace_unlikely_pos = { + .start = __start_unlikely_profile, + .stop = __stop_unlikely_profile, +}; + +static __init int ftrace_branch_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("profile_likely", 0444, d_tracer, + &ftrace_likely_pos, + &tracing_likely_fops); + if (!entry) + pr_warning("Could not create debugfs 'profile_likely' entry\n"); + + entry = debugfs_create_file("profile_unlikely", 0444, d_tracer, + &ftrace_unlikely_pos, + &tracing_likely_fops); + if (!entry) + pr_warning("Could not create debugfs" + " 'profile_unlikely' entry\n"); + + return 0; +} + +device_initcall(ftrace_branch_init); diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c deleted file mode 100644 index 8526555..0000000 --- a/kernel/trace/trace_unlikely.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - * unlikely profiler - * - * Copyright (C) 2008 Steven Rostedt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "trace.h" - -#ifdef CONFIG_BRANCH_TRACER - -static int branch_tracing_enabled __read_mostly; -static DEFINE_MUTEX(branch_tracing_mutex); -static struct trace_array *branch_tracer; - -static void -probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) -{ - struct trace_array *tr = branch_tracer; - struct ring_buffer_event *event; - struct trace_branch *entry; - unsigned long flags, irq_flags; - int cpu, pc; - const char *p; - - /* - * I would love to save just the ftrace_likely_data pointer, but - * this code can also be used by modules. Ugly things can happen - * if the module is unloaded, and then we go and read the - * pointer. This is slower, but much safer. - */ - - if (unlikely(!tr)) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); - if (!event) - goto out; - - pc = preempt_count(); - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_BRANCH; - - /* Strip off the path, only save the file */ - p = f->file + strlen(f->file); - while (p >= f->file && *p != '/') - p--; - p++; - - strncpy(entry->func, f->func, TRACE_FUNC_SIZE); - strncpy(entry->file, p, TRACE_FILE_SIZE); - entry->func[TRACE_FUNC_SIZE] = 0; - entry->file[TRACE_FILE_SIZE] = 0; - entry->line = f->line; - entry->correct = val == expect; - - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(flags); -} - -static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) -{ - if (!branch_tracing_enabled) - return; - - probe_likely_condition(f, val, expect); -} - -int enable_branch_tracing(struct trace_array *tr) -{ - int ret = 0; - - mutex_lock(&branch_tracing_mutex); - branch_tracer = tr; - /* - * Must be seen before enabling. The reader is a condition - * where we do not need a matching rmb() - */ - smp_wmb(); - branch_tracing_enabled++; - mutex_unlock(&branch_tracing_mutex); - - return ret; -} - -void disable_branch_tracing(void) -{ - mutex_lock(&branch_tracing_mutex); - - if (!branch_tracing_enabled) - goto out_unlock; - - branch_tracing_enabled--; - - out_unlock: - mutex_unlock(&branch_tracing_mutex); -} - -static void start_branch_trace(struct trace_array *tr) -{ - enable_branch_tracing(tr); -} - -static void stop_branch_trace(struct trace_array *tr) -{ - disable_branch_tracing(); -} - -static void branch_trace_init(struct trace_array *tr) -{ - int cpu; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - - start_branch_trace(tr); -} - -static void branch_trace_reset(struct trace_array *tr) -{ - stop_branch_trace(tr); -} - -struct tracer branch_trace __read_mostly = -{ - .name = "branch", - .init = branch_trace_init, - .reset = branch_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_branch, -#endif -}; - -__init static int init_branch_trace(void) -{ - return register_tracer(&branch_trace); -} - -device_initcall(init_branch_trace); -#else -static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) -{ -} -#endif /* CONFIG_BRANCH_TRACER */ - -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) -{ - /* - * I would love to have a trace point here instead, but the - * trace point code is so inundated with unlikely and likely - * conditions that the recursive nightmare that exists is too - * much to try to get working. At least for now. - */ - trace_likely_condition(f, val, expect); - - /* FIXME: Make this atomic! */ - if (val == expect) - f->correct++; - else - f->incorrect++; -} -EXPORT_SYMBOL(ftrace_likely_update); - -struct ftrace_pointer { - void *start; - void *stop; -}; - -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct ftrace_pointer *f = m->private; - struct ftrace_branch_data *p = v; - - (*pos)++; - - if (v == (void *)1) - return f->start; - - ++p; - - if ((void *)p >= (void *)f->stop) - return NULL; - - return p; -} - -static void *t_start(struct seq_file *m, loff_t *pos) -{ - void *t = (void *)1; - loff_t l = 0; - - for (; t && l < *pos; t = t_next(m, t, &l)) - ; - - return t; -} - -static void t_stop(struct seq_file *m, void *p) -{ -} - -static int t_show(struct seq_file *m, void *v) -{ - struct ftrace_branch_data *p = v; - const char *f; - unsigned long percent; - - if (v == (void *)1) { - seq_printf(m, " correct incorrect %% " - " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); - return 0; - } - - /* Only print the file, not the path */ - f = p->file + strlen(p->file); - while (f >= p->file && *f != '/') - f--; - f++; - - if (p->correct) { - percent = p->incorrect * 100; - percent /= p->correct + p->incorrect; - } else - percent = p->incorrect ? 100 : 0; - - seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent); - seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); - return 0; -} - -static struct seq_operations tracing_likely_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, -}; - -static int tracing_likely_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = seq_open(file, &tracing_likely_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = (void *)inode->i_private; - } - - return ret; -} - -static struct file_operations tracing_likely_fops = { - .open = tracing_likely_open, - .read = seq_read, - .llseek = seq_lseek, -}; - -extern unsigned long __start_likely_profile[]; -extern unsigned long __stop_likely_profile[]; -extern unsigned long __start_unlikely_profile[]; -extern unsigned long __stop_unlikely_profile[]; - -static struct ftrace_pointer ftrace_likely_pos = { - .start = __start_likely_profile, - .stop = __stop_likely_profile, -}; - -static struct ftrace_pointer ftrace_unlikely_pos = { - .start = __start_unlikely_profile, - .stop = __stop_unlikely_profile, -}; - -static __init int ftrace_branch_init(void) -{ - struct dentry *d_tracer; - struct dentry *entry; - - d_tracer = tracing_init_dentry(); - - entry = debugfs_create_file("profile_likely", 0444, d_tracer, - &ftrace_likely_pos, - &tracing_likely_fops); - if (!entry) - pr_warning("Could not create debugfs 'profile_likely' entry\n"); - - entry = debugfs_create_file("profile_unlikely", 0444, d_tracer, - &ftrace_unlikely_pos, - &tracing_likely_fops); - if (!entry) - pr_warning("Could not create debugfs" - " 'profile_unlikely' entry\n"); - - return 0; -} - -device_initcall(ftrace_branch_init); -- cgit v0.10.2 From 62d59d17a5f98edb48b171742dfa531488802f07 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 12 Nov 2008 22:47:54 +0100 Subject: tracing/function-return-tracer: make the function return tracer lockless Impact: remove spinlocks and irq disabling in function return tracer. I've tried to figure out all of the race condition that could happen when the tracer pushes or pops a return address trace to/from the current thread_info. Theory: _ One thread can only execute on one cpu at a time. So this code doesn't need to be SMP-safe. Just drop the spinlock. _ The only race could happen between the current thread and an interrupt. If an interrupt is raised, it will increase the index of the return stack storage and then execute until the end of the tracing to finally free the index it used. We don't need to disable irqs. This is theorical. In practice, I've tested it with a two-core SMP and had no problem at all. Perhaps -tip testing could confirm it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 16a571d..1db0e12 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -44,62 +44,37 @@ void ftrace_nmi_exit(void) atomic_dec(&in_nmi); } -/* - * Synchronize accesses to return adresses stack with - * interrupts. - */ -static raw_spinlock_t ret_stack_lock; - /* Add a function return address to the trace stack on thread info.*/ static int push_return_trace(unsigned long ret, unsigned long long time, unsigned long func) { int index; - struct thread_info *ti; - unsigned long flags; - int err = 0; - - raw_local_irq_save(flags); - __raw_spin_lock(&ret_stack_lock); + struct thread_info *ti = current_thread_info(); - ti = current_thread_info(); /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { - err = -EBUSY; - goto out; - } + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) + return -EBUSY; index = ++ti->curr_ret_stack; ti->ret_stack[index].ret = ret; ti->ret_stack[index].func = func; ti->ret_stack[index].calltime = time; -out: - __raw_spin_unlock(&ret_stack_lock); - raw_local_irq_restore(flags); - return err; + return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ static void pop_return_trace(unsigned long *ret, unsigned long long *time, unsigned long *func) { - struct thread_info *ti; int index; - unsigned long flags; - - raw_local_irq_save(flags); - __raw_spin_lock(&ret_stack_lock); - ti = current_thread_info(); + struct thread_info *ti = current_thread_info(); index = ti->curr_ret_stack; *ret = ti->ret_stack[index].ret; *func = ti->ret_stack[index].func; *time = ti->ret_stack[index].calltime; ti->curr_ret_stack--; - - __raw_spin_unlock(&ret_stack_lock); - raw_local_irq_restore(flags); } /* @@ -175,14 +150,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) *parent = old; } -static int __init init_ftrace_function_return(void) -{ - ret_stack_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - return 0; -} -device_initcall(init_ftrace_function_return); - - #endif #ifdef CONFIG_DYNAMIC_FTRACE -- cgit v0.10.2 From 1dc1c6adf38bc5799d1594681645ced40ced4b6b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 12 Nov 2008 22:49:23 +0100 Subject: tracing/function-return-tracer: call prepare_ftrace_return by registers Impact: Optimize a bit the function return tracer This patch changes the calling convention of prepare_ftrace_return to pass its arguments by register. This will optimize it a bit and prepare it to support dynamic tracing. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9a0ac85..f976211 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1217,12 +1217,9 @@ trace_return: pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %eax - pushl %eax + movl 0xc(%esp), %edx lea 0x4(%ebp), %eax - pushl %eax call prepare_ftrace_return - addl $8, %esp popl %edx popl %ecx popl %eax diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1db0e12..fe83273 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -95,7 +95,6 @@ unsigned long ftrace_return_to_handler(void) * Hook the return address and push it in the stack of return addrs * in current thread info. */ -asmlinkage void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) { unsigned long old; -- cgit v0.10.2 From a94c80e78bc9f4493ffc25a02d5d7bcd93c399d0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 17:52:36 -0500 Subject: ftrace: rename trace_entries to buffer_size_kb Impact: rename of debugfs file trace_entries to buffer_size_kb The original ftrace had fixed size entries, and the number of entries was shown and modified via the file called trace_entries. By converting to the unified trace buffer, we now allow for variable size entries which makes the meaning of trace_entries pointless. Since trace_size might be confused to the size of the trace, this patch names it "buffer_size_kb" (thanks to Arjan van de Ven for this idea). [ mingo@elte.hu: changed from buffer_size to buffer_size_kb ] ( Note, the units are still bytes - the next patch changes that, to keep the wide rename patch separate from the unit-change patch. ) Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 9cc4d68..a1b5877 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -94,7 +94,7 @@ of ftrace. Here is a list of some of the key files: only be recorded if the latency is greater than the value in this file. (in microseconds) - trace_entries: This sets or displays the number of bytes each CPU + buffer_size_kb: This sets or displays the number of bytes each CPU buffer can hold. The tracer buffers are the same size for each CPU. The displayed number is the size of the CPU buffer and not total size of all buffers. The @@ -1299,13 +1299,13 @@ trace entries ------------- Having too much or not enough data can be troublesome in diagnosing -an issue in the kernel. The file trace_entries is used to modify +an issue in the kernel. The file buffer_size_kb is used to modify the size of the internal trace buffers. The number listed is the number of entries that can be recorded per CPU. To know the full size, multiply the number of possible CPUS with the number of entries. - # cat /debug/tracing/trace_entries + # cat /debug/tracing/buffer_size_kb 65620 Note, to modify this, you must have tracing completely disabled. To do that, @@ -1313,8 +1313,8 @@ echo "nop" into the current_tracer. If the current_tracer is not set to "nop", an EINVAL error will be returned. # echo nop > /debug/tracing/current_tracer - # echo 100000 > /debug/tracing/trace_entries - # cat /debug/tracing/trace_entries + # echo 100000 > /debug/tracing/buffer_size_kb + # cat /debug/tracing/buffer_size_kb 100045 @@ -1323,8 +1323,8 @@ are held in individual pages. It allocates the number of pages it takes to fulfill the request. If more entries may fit on the last page then they will be added. - # echo 1 > /debug/tracing/trace_entries - # cat /debug/tracing/trace_entries + # echo 1 > /debug/tracing/buffer_size_kb + # cat /debug/tracing/buffer_size_kb 85 This shows us that 85 entries can fit in a single page. @@ -1332,8 +1332,8 @@ This shows us that 85 entries can fit in a single page. The number of pages which will be allocated is limited to a percentage of available memory. Allocating too much will produce an error. - # echo 1000000000000 > /debug/tracing/trace_entries + # echo 1000000000000 > /debug/tracing/buffer_size_kb -bash: echo: write error: Cannot allocate memory - # cat /debug/tracing/trace_entries + # cat /debug/tracing/buffer_size_kb 85 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4bf070b..b42d420 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3198,11 +3198,11 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_pipe' entry\n"); - entry = debugfs_create_file("trace_entries", 0644, d_tracer, + entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); if (!entry) pr_warning("Could not create debugfs " - "'trace_entries' entry\n"); + "'buffer_size_kb' entry\n"); entry = debugfs_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); -- cgit v0.10.2 From 1696b2b0f44a8d42f3e6b1ea90c21790871c04d9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Nov 2008 00:09:35 -0500 Subject: ftrace: show buffer size in kilobytes Impact: change the units of buffer_size_kb to kilobytes This patch changes the units of the buffer_size_kb file to kilobytes. Reading and writing to the file uses kilobytes as units. To help users to know what units are used, the output of the file now looks like: # cat /debug/tracing/buffer_size_kb 1408 Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index a1b5877..6d3fe4c 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -94,10 +94,10 @@ of ftrace. Here is a list of some of the key files: only be recorded if the latency is greater than the value in this file. (in microseconds) - buffer_size_kb: This sets or displays the number of bytes each CPU + buffer_size_kb: This sets or displays the number of kilobytes each CPU buffer can hold. The tracer buffers are the same size for each CPU. The displayed number is the size of the - CPU buffer and not total size of all buffers. The + CPU buffer and not total size of all buffers. The trace buffers are allocated in pages (blocks of memory that the kernel uses for allocation, usually 4 KB in size). If the last page allocated has room for more bytes @@ -1306,28 +1306,16 @@ the full size, multiply the number of possible CPUS with the number of entries. # cat /debug/tracing/buffer_size_kb -65620 +1408 (units kilobytes) Note, to modify this, you must have tracing completely disabled. To do that, echo "nop" into the current_tracer. If the current_tracer is not set to "nop", an EINVAL error will be returned. # echo nop > /debug/tracing/current_tracer - # echo 100000 > /debug/tracing/buffer_size_kb + # echo 10000 > /debug/tracing/buffer_size_kb # cat /debug/tracing/buffer_size_kb -100045 - - -Notice that we echoed in 100,000 but the size is 100,045. The entries -are held in individual pages. It allocates the number of pages it takes -to fulfill the request. If more entries may fit on the last page -then they will be added. - - # echo 1 > /debug/tracing/buffer_size_kb - # cat /debug/tracing/buffer_size_kb -85 - -This shows us that 85 entries can fit in a single page. +10000 (units kilobytes) The number of pages which will be allocated is limited to a percentage of available memory. Allocating too much will produce an error. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b42d420..d664aae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2905,7 +2905,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, char buf[64]; int r; - r = sprintf(buf, "%lu\n", tr->entries); + r = sprintf(buf, "%lu\n", tr->entries >> 10); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2945,6 +2945,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, atomic_inc(&max_tr.data[cpu]->disabled); } + /* value is in KB */ + val <<= 10; + if (val != global_trace.entries) { ret = ring_buffer_resize(global_trace.buffer, val); if (ret < 0) { -- cgit v0.10.2 From ee6bce52276c0717ed3e63296e5d9465d339e923 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 17:52:37 -0500 Subject: ftrace: rename iter_ctrl to trace_options Impact: rename file /debug/tracing/iter_ctrl to /debug/tracing/trace_options The original ftrace had a file called "iter_ctrl" that would control the way the output was iterated. But this file grew into a catch all for different trace options. This patch renames the file from iter_ctrl to trace_options to reflect this change. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 6d3fe4c..753f4de 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -82,7 +82,7 @@ of ftrace. Here is a list of some of the key files: tracer is not adding more data, they will display the same information every time they are read. - iter_ctrl: This file lets the user control the amount of data + trace_options: This file lets the user control the amount of data that is displayed in one of the above output files. @@ -316,23 +316,23 @@ The above is mostly meaningful for kernel developers. The rest is the same as the 'trace' file. -iter_ctrl ---------- +trace_options +------------- -The iter_ctrl file is used to control what gets printed in the trace +The trace_options file is used to control what gets printed in the trace output. To see what is available, simply cat the file: - cat /debug/tracing/iter_ctrl + cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ noblock nostacktrace nosched-tree To disable one of the options, echo in the option prepended with "no". - echo noprint-parent > /debug/tracing/iter_ctrl + echo noprint-parent > /debug/tracing/trace_options To enable an option, leave off the "no". - echo sym-offset > /debug/tracing/iter_ctrl + echo sym-offset > /debug/tracing/trace_options Here are the available options: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d664aae..240423a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -204,7 +204,7 @@ static DEFINE_MUTEX(trace_types_lock); /* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); -/* trace_flags holds iter_ctrl options */ +/* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK; /** @@ -2411,7 +2411,7 @@ static struct file_operations tracing_cpumask_fops = { }; static ssize_t -tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, +tracing_trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { char *buf; @@ -2448,7 +2448,7 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, } static ssize_t -tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, +tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; @@ -2493,8 +2493,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, static struct file_operations tracing_iter_fops = { .open = tracing_open_generic, - .read = tracing_iter_ctrl_read, - .write = tracing_iter_ctrl_write, + .read = tracing_trace_options_read, + .write = tracing_trace_options_write, }; static const char readme_msg[] = @@ -2508,9 +2508,9 @@ static const char readme_msg[] = "# echo sched_switch > /debug/tracing/current_tracer\n" "# cat /debug/tracing/current_tracer\n" "sched_switch\n" - "# cat /debug/tracing/iter_ctrl\n" + "# cat /debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /debug/tracing/iter_ctrl\n" + "# echo print-parent > /debug/tracing/trace_options\n" "# echo 1 > /debug/tracing/tracing_enabled\n" "# cat /debug/tracing/trace > /tmp/trace.txt\n" "echo 0 > /debug/tracing/tracing_enabled\n" @@ -3148,10 +3148,10 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); - entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, + entry = debugfs_create_file("trace_options", 0644, d_tracer, NULL, &tracing_iter_fops); if (!entry) - pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + pr_warning("Could not create debugfs 'trace_options' entry\n"); entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, NULL, &tracing_cpumask_fops); -- cgit v0.10.2 From 12ef7d448613ead2babd41c3ebfa1fe03c20edef Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 17:52:38 -0500 Subject: ftrace: CPU buffer start annotation clean ups Impact: better handling of CPU buffer start annotation Because of the confusion with the per CPU buffers wrapping where one CPU might be more active at the end of the trace than the other CPUs causing that one CPU to have a shorter history. Kernel developers were confused by the "missing" data of that one CPU at the beginning of the trace output. An annotation was added to the trace output to show that the buffer had started: # tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | ##### CPU 3 buffer started #### -0 [003] 158.192959: smp_apic_timer_interrupt [...] -0 [003] 161.556520: default_idle ##### CPU 1 buffer started #### -0 [001] 161.592494: hrtimer_force_reprogram [etc] But this annotation gets a bit messy when tracers do not fill the buffers. This patch does a couple of things: One) it adds a flag to trace_options to disable these annotations Two) it does not annotate if the tracer did not overflow its buffer. This makes the output much cleaner. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 240423a..4a90462 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -205,7 +205,8 @@ static DEFINE_MUTEX(trace_types_lock); static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK; +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | + TRACE_ITER_ANNOTATE; /** * trace_wake_up - wake up tasks waiting for trace input @@ -261,6 +262,7 @@ static const char *trace_options[] = { #ifdef CONFIG_BRANCH_TRACER "branch", #endif + "annotate", NULL }; @@ -1113,6 +1115,7 @@ void tracing_stop_function_trace(void) enum trace_file_type { TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, }; static void trace_iterator_increment(struct trace_iterator *iter, int cpu) @@ -1532,6 +1535,12 @@ static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; + if (!(trace_flags & TRACE_ITER_ANNOTATE)) + return; + + if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) + return; + if (cpu_isset(iter->cpu, iter->started)) return; @@ -2132,6 +2141,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) iter->trace = current_trace; iter->pos = -1; + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->tr->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9e015f5..790ea8c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -473,6 +473,7 @@ enum trace_iterator_flags { #ifdef CONFIG_BRANCH_TRACER TRACE_ITER_BRANCH = 0x1000, #endif + TRACE_ITER_ANNOTATE = 0x2000, }; /* -- cgit v0.10.2 From b3535c6390f27d04273e4eee0bc687f171fbf5f4 Mon Sep 17 00:00:00 2001 From: walimis Date: Fri, 14 Nov 2008 00:21:02 +0800 Subject: ftrace: remove unnecessary if condition of __unregister_ftrace_function Because it has goto out before ftrace_list == &ftrace_list_end, that's to say, we never meet this condition. Signed-off-by: walimis Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index beb21a5..54cb9a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -179,8 +179,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) { /* If we only have one func left, then call that directly */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) + if (ftrace_list->next == &ftrace_list_end) ftrace_trace_function = ftrace_list->func; } -- cgit v0.10.2 From d1aaf8cf8afe70a8c2235a565885291fe290c57c Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 13 Nov 2008 08:33:00 -0800 Subject: tracing/fastboot: put error message on stderr Since this scripts output is usually redirected, put error messages on standard error and exit with error code if no data is found. Signed-off-by: Stephen Hemminger Acked-by: Arjan van de Ven Signed-off-by: Ingo Molnar diff --git a/scripts/bootgraph.pl b/scripts/bootgraph.pl index a8635a1..f0af9aa 100644 --- a/scripts/bootgraph.pl +++ b/scripts/bootgraph.pl @@ -78,11 +78,13 @@ while (<>) { } if ($count == 0) { - print "No data found in the dmesg. Make sure that 'printk.time=1' and\n"; - print "'initcall_debug' are passed on the kernel command line.\n\n"; - print "Usage: \n"; - print " dmesg | perl scripts/bootgraph.pl > output.svg\n\n"; - exit; + print STDERR < output.svg +END + exit 1; } print " \n"; -- cgit v0.10.2 From f3c7ac40a99f4044b843e6e2c4f46ab2d354c563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: remove condition from ftrace_record_ip Impact: let module functions be recorded when dyn ftrace not enabled When dynamic ftrace had a daemon and a hash to record the locations of mcount callers at run time, the recording needed to stop when ftrace was disabled. But now that the recording is done at compile time and the ftrace_record_ip is only called at boot up and when a module is loaded, we no longer need to check if ftrace_enabled is set. In fact, this breaks module load if it is not set because we skip over module functions. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 54cb9a7..3160254 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -334,7 +334,7 @@ ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *rec; - if (!ftrace_enabled || ftrace_disabled) + if (ftrace_disabled) return NULL; rec = ftrace_alloc_dyn_node(ip); -- cgit v0.10.2 From b17e8a37a13d0e87165054714434534bb7e69f2d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: disable ftrace on anomalies in trace start and stop Impact: robust feature to disable ftrace on start or stop tracing on error Currently only the initial conversion to nops will disable ftrace on an anomaly. But if an anomaly happens on start or stopping of the tracer, it will silently fail. This patch adds a check there too, to disable ftrace and warn if the conversion fails. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3160254..d5bd21f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -348,6 +348,47 @@ ftrace_record_ip(unsigned long ip) return rec; } +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); +} + +static void ftrace_bug(int failed, unsigned long ip, + unsigned char *expected, + unsigned char *replace) +{ + switch (failed) { + case -EFAULT: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case -EINVAL: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" expected: ", expected); + print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" replace: ", replace); + printk(KERN_CONT "\n"); + break; + case -EPERM: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on writing "); + print_ip_sym(ip); + break; + default: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on unknown error "); + print_ip_sym(ip); + } +} + #define FTRACE_ADDR ((long)(ftrace_caller)) static int @@ -465,22 +506,13 @@ static void ftrace_replace_code(int enable) if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); - } + } else + ftrace_bug(failed, rec->ip, old, new); } } } } -static void print_ip_ins(const char *fmt, unsigned char *p) -{ - int i; - - printk(KERN_CONT "%s", fmt); - - for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); -} - static int ftrace_code_disable(struct dyn_ftrace *rec) { @@ -495,32 +527,7 @@ ftrace_code_disable(struct dyn_ftrace *rec) ret = ftrace_modify_code(ip, call, nop); if (ret) { - switch (ret) { - case -EFAULT: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); - break; - case -EINVAL: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace failed to modify "); - print_ip_sym(ip); - print_ip_ins(" expected: ", call); - print_ip_ins(" actual: ", (unsigned char *)ip); - print_ip_ins(" replace: ", nop); - printk(KERN_CONT "\n"); - break; - case -EPERM: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on writing "); - print_ip_sym(ip); - break; - default: - FTRACE_WARN_ON_ONCE(1); - pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); - } - + ftrace_bug(ret, ip, call, nop); rec->flags |= FTRACE_FL_FAILED; return 0; } -- cgit v0.10.2 From 918c115410c6cc57033835b6a401e57697f9ea4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: do not process freed records Impact: keep from converting freed records When the tracer is started or stopped, it converts all code pointed to by the saved records into callers to ftrace or nops. When modules are unloaded, their records are freed, but they still exist within the record pages. This patch changes the code to skip over freed records. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d5bd21f..3940c71 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -488,8 +488,12 @@ static void ftrace_replace_code(int enable) for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; - /* don't modify code that has already faulted */ - if (rec->flags & FTRACE_FL_FAILED) + /* + * Skip over free records and records that have + * failed. + */ + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED) continue; /* ignore updates to this record's mcount site */ -- cgit v0.10.2 From d51ad7ac48f991c4a8834485727efa99a691cb87 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 15 Nov 2008 15:48:29 -0500 Subject: ftrace: replace raw_local_irq_save with local_irq_save Impact: fix lockdep disabling itself when function tracing is enabled The raw_local_irq_saves used in ftrace is causing problems with lockdep. (it thinks the irq flags are out of sync and disables itself with a warning) The raw ops here are not needed, and the normal local_irq_save is fine. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4a90462..dff4bee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1051,7 +1051,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) * Need to use raw, since this must be called before the * recursive protection is performed. */ - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -1062,7 +1062,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_RET_TRACER diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 24e6e07..5cb64ea 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -52,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) int cpu, ret = 0; /* Don't allow flipping of max traces now */ - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&ftrace_max_lock); cnt = ring_buffer_entries(tr->buffer); @@ -63,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) break; } __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); if (count) *count = cnt; -- cgit v0.10.2 From 31e889098a80ceb3e9e3c555d522b2686a6663c6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: pass module struct to arch dynamic ftrace functions Impact: allow archs more flexibility on dynamic ftrace implementations Dynamic ftrace has largly been developed on x86. Since x86 does not have the same limitations as other architectures, the ftrace interaction between the generic code and the architecture specific code was not flexible enough to handle some of the issues that other architectures have. Most notably, module trampolines. Due to the limited branch distance that archs make in calling kernel core code from modules, the module load code must create a trampoline to jump to what will make the larger jump into core kernel code. The problem arises when this happens to a call to mcount. Ftrace checks all code before modifying it and makes sure the current code is what it expects. Right now, there is not enough information to handle modifying module trampolines. This patch changes the API between generic dynamic ftrace code and the arch dependent code. There is now two functions for modifying code: ftrace_make_nop(mod, rec, addr) - convert the code at rec->ip into a nop, where the original text is calling addr. (mod is the module struct if called by module init) ftrace_make_caller(rec, addr) - convert the code rec->ip that should be a nop into a caller to addr. The record "rec" now has a new field called "arch" where the architecture can add any special attributes to each call site record. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9b6a1fa..2bb43b4 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,6 +17,14 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } + +#ifdef CONFIG_DYNAMIC_FTRACE + +struct dyn_arch_ftrace { + /* No extra data needed for x86 */ +}; + +#endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fe83273..762222a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -166,7 +166,7 @@ static int ftrace_calc_offset(long ip, long addr) return (int)(addr - ip); } -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -311,12 +311,12 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; -unsigned char *ftrace_nop_replace(void) +static unsigned char *ftrace_nop_replace(void) { return ftrace_nop; } -int +static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -349,6 +349,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return 0; } +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + return ftrace_modify_code(rec->ip, old, new); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + return ftrace_modify_code(rec->ip, old, new); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4fbc4a8..166a207 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -74,6 +74,9 @@ static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE +/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ +#include + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), @@ -88,6 +91,7 @@ struct dyn_ftrace { struct list_head list; unsigned long ip; /* address of mcount call-site */ unsigned long flags; + struct dyn_arch_ftrace arch; }; int ftrace_force_update(void); @@ -95,22 +99,40 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); -extern unsigned char *ftrace_nop_replace(void); -extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr); extern int ftrace_dyn_arch_init(void *data); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); -/* May be defined in arch */ -extern int ftrace_arch_read_dyn_info(char *buf, int size); +/** + * ftrace_make_nop - convert code into top + * @mod: module structure if called by module load initialization + * @rec: the mcount call site record + * @addr: the address that the call site should be calling + * + * This is a very sensitive operation and great care needs + * to be taken by the arch. The operation should carefully + * read the location, check to see if what is read is indeed + * what we expect it to be, and then on success of the compare, + * it should write to the location. + * + * The code segment at @rec->ip should be a caller to @addr + * + * Return must be: + * 0 on success + * -EFAULT on error reading the location + * -EINVAL on a failed compare of the contents + * -EPERM on error writing to the location + * Any other value will be considered a failure. + */ +extern int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr); /** - * ftrace_modify_code - modify code segment - * @ip: the address of the code segment - * @old_code: the contents of what is expected to be there - * @new_code: the code to patch in + * ftrace_make_call - convert a nop call site into a call to addr + * @rec: the mcount call site record + * @addr: the address that the call site should call * * This is a very sensitive operation and great care needs * to be taken by the arch. The operation should carefully @@ -118,6 +140,8 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); * what we expect it to be, and then on success of the compare, * it should write to the location. * + * The code segment at @rec->ip should be a nop + * * Return must be: * 0 on success * -EFAULT on error reading the location @@ -125,8 +149,11 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); * -EPERM on error writing to the location * Any other value will be considered a failure. */ -extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, - unsigned char *new_code); +extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); + + +/* May be defined in arch */ +extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); @@ -259,11 +286,13 @@ static inline void ftrace_dump(void) { } #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); -extern void ftrace_init_module(unsigned long *start, unsigned long *end); +extern void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } static inline void -ftrace_init_module(unsigned long *start, unsigned long *end) { } +ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { } #endif diff --git a/kernel/module.c b/kernel/module.c index 1f4cc00..6979127 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2201,7 +2201,7 @@ static noinline struct module *load_module(void __user *umod, /* sechdrs[0].sh_size is always zero */ mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", sizeof(*mseg), &num_mcount); - ftrace_init_module(mseg, mseg + num_mcount); + ftrace_init_module(mod, mseg, mseg + num_mcount); err = module_finalize(hdr, sechdrs, mod); if (err < 0) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3940c71..e9a5fbf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -358,9 +358,7 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static void ftrace_bug(int failed, unsigned long ip, - unsigned char *expected, - unsigned char *replace) +static void ftrace_bug(int failed, unsigned long ip) { switch (failed) { case -EFAULT: @@ -372,9 +370,7 @@ static void ftrace_bug(int failed, unsigned long ip, FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); - print_ip_ins(" expected: ", expected); print_ip_ins(" actual: ", (unsigned char *)ip); - print_ip_ins(" replace: ", replace); printk(KERN_CONT "\n"); break; case -EPERM: @@ -392,8 +388,7 @@ static void ftrace_bug(int failed, unsigned long ip, #define FTRACE_ADDR ((long)(ftrace_caller)) static int -__ftrace_replace_code(struct dyn_ftrace *rec, - unsigned char *old, unsigned char *new, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ip, fl; @@ -435,12 +430,10 @@ __ftrace_replace_code(struct dyn_ftrace *rec, * otherwise enable it! */ if (fl & FTRACE_FL_ENABLED) { - /* swap new and old */ - new = old; - old = ftrace_call_replace(ip, FTRACE_ADDR); + enable = 0; rec->flags &= ~FTRACE_FL_ENABLED; } else { - new = ftrace_call_replace(ip, FTRACE_ADDR); + enable = 1; rec->flags |= FTRACE_FL_ENABLED; } } else { @@ -453,10 +446,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); if (fl == FTRACE_FL_NOTRACE) return 0; - - new = ftrace_call_replace(ip, FTRACE_ADDR); - } else - old = ftrace_call_replace(ip, FTRACE_ADDR); + } if (enable) { if (rec->flags & FTRACE_FL_ENABLED) @@ -469,21 +459,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } - return ftrace_modify_code(ip, old, new); + if (enable) + return ftrace_make_call(rec, FTRACE_ADDR); + else + return ftrace_make_nop(NULL, rec, FTRACE_ADDR); } static void ftrace_replace_code(int enable) { int i, failed; - unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - if (enable) - old = ftrace_nop_replace(); - else - new = ftrace_nop_replace(); - for (pg = ftrace_pages_start; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; @@ -504,34 +491,30 @@ static void ftrace_replace_code(int enable) unfreeze_record(rec); } - failed = __ftrace_replace_code(rec, old, new, enable); + failed = __ftrace_replace_code(rec, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); } else - ftrace_bug(failed, rec->ip, old, new); + ftrace_bug(failed, rec->ip); } } } } static int -ftrace_code_disable(struct dyn_ftrace *rec) +ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { unsigned long ip; - unsigned char *nop, *call; int ret; ip = rec->ip; - nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, mcount_addr); - - ret = ftrace_modify_code(ip, call, nop); + ret = ftrace_make_nop(mod, rec, mcount_addr); if (ret) { - ftrace_bug(ret, ip, call, nop); + ftrace_bug(ret, ip); rec->flags |= FTRACE_FL_FAILED; return 0; } @@ -650,7 +633,7 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ftrace_update_code(void) +static int ftrace_update_code(struct module *mod) { struct dyn_ftrace *p, *t; cycle_t start, stop; @@ -667,7 +650,7 @@ static int ftrace_update_code(void) list_del_init(&p->list); /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(p)) { + if (ftrace_code_disable(mod, p)) { p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; } else @@ -1309,7 +1292,8 @@ static __init int ftrace_init_debugfs(void) fs_initcall(ftrace_init_debugfs); -static int ftrace_convert_nops(unsigned long *start, +static int ftrace_convert_nops(struct module *mod, + unsigned long *start, unsigned long *end) { unsigned long *p; @@ -1325,18 +1309,19 @@ static int ftrace_convert_nops(unsigned long *start, /* disable interrupts to prevent kstop machine */ local_irq_save(flags); - ftrace_update_code(); + ftrace_update_code(mod); local_irq_restore(flags); mutex_unlock(&ftrace_start_lock); return 0; } -void ftrace_init_module(unsigned long *start, unsigned long *end) +void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(start, end); + ftrace_convert_nops(mod, start, end); } extern unsigned long __start_mcount_loc[]; @@ -1366,7 +1351,8 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(__start_mcount_loc, + ret = ftrace_convert_nops(NULL, + __start_mcount_loc, __stop_mcount_loc); return; -- cgit v0.10.2 From 20e5227e9f55ae1969934821ccbf581563785bbe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: allow NULL pointers in mcount_loc Impact: make ftrace_convert_nops() more permissive Due to the way different architecture linkers combine the data sections of the mcount_loc (the section that lists all the locations that call mcount), there may be zeros added in that section. This is usually due to strange alignments that the linker performs, that pads in zeros. This patch makes the conversion code to nops skip any pointer in the mcount_loc section that is NULL. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e9a5fbf..cc42191 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1304,6 +1304,14 @@ static int ftrace_convert_nops(struct module *mod, p = start; while (p < end) { addr = ftrace_call_adjust(*p++); + /* + * Some architecture linkers will pad between + * the different mcount_loc sections of different + * object files to satisfy alignments. + * Skip any NULL pointers. + */ + if (!addr) + continue; ftrace_record_ip(addr); } -- cgit v0.10.2 From 982c350b9ec4b3564d67f3627a274ae61bbc7e95 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 15 Nov 2008 16:31:41 -0500 Subject: ftrace: fix dyn ftrace filter Impact: correct implementation of dyn ftrace filter The old decisions made by the filter algorithm was complex and incorrect. This lead to inconsistent enabling or disabling of functions when the filter was used. This patch simplifies that code and in doing so, corrects the usage of the filters. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cc42191..b9f2e22 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -394,72 +394,62 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ip = rec->ip; - if (ftrace_filtered && enable) { + /* + * If this record is not to be traced and + * it is not enabled then do nothing. + * + * If this record is not to be traced and + * it is enabled then disabled it. + * + */ + if (rec->flags & FTRACE_FL_NOTRACE) { + if (rec->flags & FTRACE_FL_ENABLED) + rec->flags &= ~FTRACE_FL_ENABLED; + else + return 0; + + } else if (ftrace_filtered && enable) { /* - * If filtering is on: - * - * If this record is set to be filtered and - * is enabled then do nothing. - * - * If this record is set to be filtered and - * it is not enabled, enable it. - * - * If this record is not set to be filtered - * and it is not enabled do nothing. - * - * If this record is set not to trace then - * do nothing. - * - * If this record is set not to trace and - * it is enabled then disable it. - * - * If this record is not set to be filtered and - * it is enabled, disable it. + * Filtering is on: */ - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | - FTRACE_FL_ENABLED); + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); - if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || - !fl || (fl == FTRACE_FL_NOTRACE)) + /* Record is filtered and enabled, do nothing */ + if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) return 0; - /* - * If it is enabled disable it, - * otherwise enable it! - */ - if (fl & FTRACE_FL_ENABLED) { - enable = 0; + /* Record is not filtered and is not enabled do nothing */ + if (!fl) + return 0; + + /* Record is not filtered but enabled, disable it */ + if (fl == FTRACE_FL_ENABLED) rec->flags &= ~FTRACE_FL_ENABLED; - } else { - enable = 1; + else + /* Otherwise record is filtered but not enabled, enable it */ rec->flags |= FTRACE_FL_ENABLED; - } } else { + /* Disable or not filtered */ if (enable) { - /* - * If this record is set not to trace and is - * not enabled, do nothing. - */ - fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); - if (fl == FTRACE_FL_NOTRACE) - return 0; - } - - if (enable) { + /* if record is enabled, do nothing */ if (rec->flags & FTRACE_FL_ENABLED) return 0; + rec->flags |= FTRACE_FL_ENABLED; + } else { + + /* if record is not enabled do nothing */ if (!(rec->flags & FTRACE_FL_ENABLED)) return 0; + rec->flags &= ~FTRACE_FL_ENABLED; } } - if (enable) + if (rec->flags & FTRACE_FL_ENABLED) return ftrace_make_call(rec, FTRACE_ADDR); else return ftrace_make_nop(NULL, rec, FTRACE_ADDR); @@ -554,8 +544,7 @@ static void ftrace_startup(void) mutex_lock(&ftrace_start_lock); ftrace_start_up++; - if (ftrace_start_up == 1) - command |= FTRACE_ENABLE_CALLS; + command |= FTRACE_ENABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; -- cgit v0.10.2 From ee02a2e5c88ca2e4d6921f08d037b46d5bf82641 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 15 Nov 2008 16:31:41 -0500 Subject: ftrace: make filtered functions effective on setting Impact: set filtered functions at time the filter is set It can be confusing when the set_filter_functions is set (or cleared) and the functions being recorded by the dynamic tracer does not match. This patch causes the code to be updated if the function tracer is enabled and the filter is changed. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b9f2e22..b42ec1d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1194,7 +1194,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftrace_start_lock); - if (iter->filtered && ftrace_start_up && ftrace_enabled) + if (ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); -- cgit v0.10.2 From e6e7a65aabdb696cf05a56cfd495c49a11fd4cde Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Nov 2008 05:53:19 +0100 Subject: tracing/ftrace: fix unexpected -EINVAL when longest tracer name is set Impact: fix confusing write() -EINVAL when changing the tracer The following commit d9e540762f5cdd89f24e518ad1fd31142d0b9726 remade alive the bug which made the set of a new tracer returning -EINVAL if this is the longest name of tracer. This patch corrects it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dff4bee..80898f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2655,6 +2655,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, char buf[max_tracer_type_len+1]; int i; size_t ret; + int err; + + ret = cnt; if (cnt > max_tracer_type_len) cnt = max_tracer_type_len; @@ -2668,12 +2671,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) buf[i] = 0; - ret = tracing_set_tracer(buf); - if (!ret) - ret = cnt; + err = tracing_set_tracer(buf); + if (err) + return err; - if (ret > 0) - filp->f_pos += ret; + filp->f_pos += ret; return ret; } -- cgit v0.10.2 From 1c80025a49855b12fa09bb6db71820e3367b1369 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Nov 2008 05:57:26 +0100 Subject: tracing/ftrace: change the type of the init() callback Impact: extend the ->init() method with the ability to fail This bring a way to know if the initialization of a tracer successed. A tracer must return 0 on success and a traditional error (ie: -ENOMEM) if it fails. If a tracer fails to init, it is free to print a detailed warn. The tracing api will not and switch to a new tracer will just return the error from the init callback. Note: this will be used for the return tracer. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 80898f4..396fda0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2638,8 +2638,11 @@ static int tracing_set_tracer(char *buf) current_trace->reset(tr); current_trace = t; - if (t->init) - t->init(tr); + if (t->init) { + ret = t->init(tr); + if (ret) + goto out; + } trace_branch_enable(tr); out: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 790ea8c..cdbd5cc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -264,7 +264,8 @@ enum print_line_t { */ struct tracer { const char *name; - void (*init)(struct trace_array *tr); + /* Your tracer should raise a warning if init fails */ + int (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); void (*start)(struct trace_array *tr); void (*stop)(struct trace_array *tr); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index cb333b7..a4fa2c5 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -47,7 +47,7 @@ static void reset_boot_trace(struct trace_array *tr) tracing_reset(tr, cpu); } -static void boot_trace_init(struct trace_array *tr) +static int boot_trace_init(struct trace_array *tr) { int cpu; boot_trace = tr; @@ -56,6 +56,7 @@ static void boot_trace_init(struct trace_array *tr) tracing_reset(tr, cpu); tracing_sched_switch_assign_trace(tr); + return 0; } static enum print_line_t diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8526555..44bd395 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -125,7 +125,7 @@ static void stop_branch_trace(struct trace_array *tr) disable_branch_tracing(); } -static void branch_trace_init(struct trace_array *tr) +static int branch_trace_init(struct trace_array *tr) { int cpu; @@ -133,6 +133,7 @@ static void branch_trace_init(struct trace_array *tr) tracing_reset(tr, cpu); start_branch_trace(tr); + return 0; } static void branch_trace_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8693b7a..e74f6d0 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -42,9 +42,10 @@ static void stop_function_trace(struct trace_array *tr) tracing_stop_cmdline_record(); } -static void function_trace_init(struct trace_array *tr) +static int function_trace_init(struct trace_array *tr) { start_function_trace(tr); + return 0; } static void function_trace_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c index 7680b21..61185f7 100644 --- a/kernel/trace/trace_functions_return.c +++ b/kernel/trace/trace_functions_return.c @@ -24,13 +24,14 @@ static void stop_return_trace(struct trace_array *tr) unregister_ftrace_return(); } -static void return_trace_init(struct trace_array *tr) +static int return_trace_init(struct trace_array *tr) { int cpu; for_each_online_cpu(cpu) tracing_reset(tr, cpu); start_return_trace(tr); + return 0; } static void return_trace_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d919d4e..7c2e326 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -416,11 +416,12 @@ static void irqsoff_tracer_close(struct trace_iterator *iter) } #ifdef CONFIG_IRQSOFF_TRACER -static void irqsoff_tracer_init(struct trace_array *tr) +static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer irqsoff_tracer __read_mostly = { @@ -442,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly = #endif #ifdef CONFIG_PREEMPT_TRACER -static void preemptoff_tracer_init(struct trace_array *tr) +static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer preemptoff_tracer __read_mostly = @@ -471,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly = #if defined(CONFIG_IRQSOFF_TRACER) && \ defined(CONFIG_PREEMPT_TRACER) -static void preemptirqsoff_tracer_init(struct trace_array *tr) +static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; __irqsoff_tracer_init(tr); + return 0; } static struct tracer preemptirqsoff_tracer __read_mostly = diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 51bcf37..433d650 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -30,13 +30,14 @@ static void mmio_reset_data(struct trace_array *tr) tracing_reset(tr, cpu); } -static void mmio_trace_init(struct trace_array *tr) +static int mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; mmio_reset_data(tr); enable_mmiotrace(); + return 0; } static void mmio_trace_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 2ef1d22..0e77415 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -24,7 +24,7 @@ static void stop_nop_trace(struct trace_array *tr) /* Nothing to do! */ } -static void nop_trace_init(struct trace_array *tr) +static int nop_trace_init(struct trace_array *tr) { int cpu; ctx_trace = tr; @@ -33,6 +33,7 @@ static void nop_trace_init(struct trace_array *tr) tracing_reset(tr, cpu); start_nop_trace(tr); + return 0; } static void nop_trace_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index be35bdf..8633905 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -206,10 +206,11 @@ static void stop_sched_trace(struct trace_array *tr) tracing_stop_sched_switch_record(); } -static void sched_switch_trace_init(struct trace_array *tr) +static int sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; start_sched_trace(tr); + return 0; } static void sched_switch_trace_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 983f2b1..0067b49 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -331,10 +331,11 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_wakeup(probe_wakeup); } -static void wakeup_tracer_init(struct trace_array *tr) +static int wakeup_tracer_init(struct trace_array *tr) { wakeup_trace = tr; start_wakeup_tracer(tr); + return 0; } static void wakeup_tracer_reset(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5cb64ea..88c8eb7 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -71,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) return ret; } +static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) +{ + printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n", + trace->name, init_ret); +} #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -111,7 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } /* Sleep for a 1/10 of a second */ msleep(100); @@ -181,7 +190,12 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 1; tracer_enabled = 1; - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ @@ -223,7 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; /* disable interrupts for a bit */ @@ -272,7 +291,12 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) } /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; /* disable preemption for a bit */ @@ -321,7 +345,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } /* reset the max latency */ tracing_max_latency = 0; @@ -449,7 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wait_for_completion(&isrt); /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* reset the max latency */ tracing_max_latency = 0; @@ -505,7 +538,12 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr int ret; /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ @@ -532,7 +570,12 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return 0; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ @@ -554,7 +597,12 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - trace->init(tr); + ret = trace->init(tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + /* Sleep for a 1/10 of a second */ msleep(100); /* stop the tracing. */ diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 05f7534..54960ed 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -261,11 +261,12 @@ static void stop_stack_trace(struct trace_array *tr) mutex_unlock(&sample_timer_lock); } -static void stack_trace_init(struct trace_array *tr) +static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; start_stack_trace(tr); + return 0; } static void stack_trace_reset(struct trace_array *tr) -- cgit v0.10.2 From 072b40a15616fe6bea68466e6bffcfcbf5c8f26f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Nov 2008 05:59:52 +0100 Subject: tracing/branch-tracer: fix a trace recursion on branch tracer Impact: fix crash when enabling the branch-tracer When the branch tracer inserts an event through probe_likely_condition(), it calls local_irq_save() and then results in a trace recursion. local_irq_save() -> trace_hardirqs_off() -> trace_hardirqs_off_caller() -> unlikely() The trace_branch.c file is protected by DISABLE_BRANCH_PROFILING but that doesn't prevent from external call to functions that use unlikely(). My box crashed each time I tried to set this tracer (sudden and hard reboot). Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8526555..2511e32 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -41,7 +41,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (unlikely(!tr)) return; - local_irq_save(flags); + raw_local_irq_save(flags); cpu = raw_smp_processor_id(); if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; @@ -73,7 +73,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) out: atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(flags); + raw_local_irq_restore(flags); } static inline -- cgit v0.10.2 From b01c746617da5e260803eb10ed64ca043e9a1241 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 15 Nov 2008 02:37:44 +0100 Subject: tracing/function-return-tracer: add a barrier to ensure return stack index is incremented in memory Impact: fix possible race condition in ftrace function return tracer This fixes a possible race condition if index incrementation is not immediately flushed in memory. Thanks for Andi Kleen and Steven Rostedt for pointing out this issue and give me this solution. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 762222a..d98b5a8 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -56,6 +56,7 @@ static int push_return_trace(unsigned long ret, unsigned long long time, return -EBUSY; index = ++ti->curr_ret_stack; + barrier(); ti->ret_stack[index].ret = ret; ti->ret_stack[index].func = func; ti->ret_stack[index].calltime = time; -- cgit v0.10.2 From e7d3737ea1b102030f44e96c97754101e41515f0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 16 Nov 2008 06:02:06 +0100 Subject: tracing/function-return-tracer: support for dynamic ftrace on function return tracer This patch adds the support for dynamic tracing on the function return tracer. The whole difference with normal dynamic function tracing is that we don't need to hook on a particular callback. The only pro that we want is to nop or set dynamically the calls to ftrace_caller (which is ftrace_return_caller here). Some security checks ensure that we are not trying to launch dynamic tracing for return tracing while normal function tracing is already running. An example of trace with getnstimeofday set as a filter: ktime_get_ts+0x22/0x50 -> getnstimeofday (2283 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1396 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1825 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1426 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1524 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1434 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1502 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1404 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1397 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1051 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1314 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1344 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1163 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1390 ns) ktime_get_ts+0x22/0x50 -> getnstimeofday (1374 ns) Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f976211..74defe2 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1190,7 +1190,7 @@ ENTRY(mcount) jnz trace #ifdef CONFIG_FUNCTION_RET_TRACER cmpl $ftrace_stub, ftrace_function_return - jnz trace_return + jnz ftrace_return_caller #endif .globl ftrace_stub ftrace_stub: @@ -1211,9 +1211,15 @@ trace: popl %ecx popl %eax jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_RET_TRACER -trace_return: +ENTRY(ftrace_return_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1223,7 +1229,8 @@ trace_return: popl %edx popl %ecx popl %eax - jmp ftrace_stub + ret +END(ftrace_return_caller) .globl return_to_handler return_to_handler: @@ -1237,10 +1244,7 @@ return_to_handler: popl %ecx popl %eax ret -#endif /* CONFIG_FUNCTION_RET_TRACER */ -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ +#endif .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d98b5a8..924153e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,134 +24,6 @@ #include - -#ifdef CONFIG_FUNCTION_RET_TRACER - -/* - * These functions are picked from those used on - * this page for dynamic ftrace. They have been - * simplified to ignore all traces in NMI context. - */ -static atomic_t in_nmi; - -void ftrace_nmi_enter(void) -{ - atomic_inc(&in_nmi); -} - -void ftrace_nmi_exit(void) -{ - atomic_dec(&in_nmi); -} - -/* Add a function return address to the trace stack on thread info.*/ -static int push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func) -{ - int index; - struct thread_info *ti = current_thread_info(); - - /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) - return -EBUSY; - - index = ++ti->curr_ret_stack; - barrier(); - ti->ret_stack[index].ret = ret; - ti->ret_stack[index].func = func; - ti->ret_stack[index].calltime = time; - - return 0; -} - -/* Retrieve a function return address to the trace stack on thread info.*/ -static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func) -{ - int index; - - struct thread_info *ti = current_thread_info(); - index = ti->curr_ret_stack; - *ret = ti->ret_stack[index].ret; - *func = ti->ret_stack[index].func; - *time = ti->ret_stack[index].calltime; - ti->curr_ret_stack--; -} - -/* - * Send the trace to the ring-buffer. - * @return the original return address. - */ -unsigned long ftrace_return_to_handler(void) -{ - struct ftrace_retfunc trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func); - trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_function_return(&trace); - - return trace.ret; -} - -/* - * Hook the return address and push it in the stack of return addrs - * in current thread info. - */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) -{ - unsigned long old; - unsigned long long calltime; - int faulted; - unsigned long return_hooker = (unsigned long) - &return_to_handler; - - /* Nmi's are currently unsupported */ - if (atomic_read(&in_nmi)) - return; - - /* - * Protect against fault, even if it shouldn't - * happen. This tool is too much intrusive to - * ignore such a protection. - */ - asm volatile( - "1: movl (%[parent_old]), %[old]\n" - "2: movl %[return_hooker], (%[parent_replaced])\n" - " movl $0, %[faulted]\n" - - ".section .fixup, \"ax\"\n" - "3: movl $1, %[faulted]\n" - ".previous\n" - - ".section __ex_table, \"a\"\n" - " .long 1b, 3b\n" - " .long 2b, 3b\n" - ".previous\n" - - : [parent_replaced] "=r" (parent), [old] "=r" (old), - [faulted] "=r" (faulted) - : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) - : "memory" - ); - - if (WARN_ON(faulted)) { - unregister_ftrace_return(); - return; - } - - if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_return(); - *parent = old; - return; - } - - calltime = cpu_clock(raw_smp_processor_id()); - - if (push_return_trace(old, calltime, self_addr) == -EBUSY) - *parent = old; -} - -#endif - #ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { @@ -450,3 +322,133 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } #endif + +#ifdef CONFIG_FUNCTION_RET_TRACER + +#ifndef CONFIG_DYNAMIC_FTRACE + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func) +{ + int index; + struct thread_info *ti = current_thread_info(); + + /* The return trace stack is full */ + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) + return -EBUSY; + + index = ++ti->curr_ret_stack; + barrier(); + ti->ret_stack[index].ret = ret; + ti->ret_stack[index].func = func; + ti->ret_stack[index].calltime = time; + + return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(unsigned long *ret, unsigned long long *time, + unsigned long *func) +{ + int index; + + struct thread_info *ti = current_thread_info(); + index = ti->curr_ret_stack; + *ret = ti->ret_stack[index].ret; + *func = ti->ret_stack[index].func; + *time = ti->ret_stack[index].calltime; + ti->curr_ret_stack--; +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_retfunc trace; + pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_function_return(&trace); + + return trace.ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (atomic_read(&in_nmi)) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: movl (%[parent_old]), %[old]\n" + "2: movl %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + ".section __ex_table, \"a\"\n" + " .long 1b, 3b\n" + " .long 2b, 3b\n" + ".previous\n" + + : [parent_replaced] "=r" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (WARN_ON(faulted)) { + unregister_ftrace_return(); + return; + } + + if (WARN_ON(!__kernel_text_address(old))) { + unregister_ftrace_return(); + *parent = old; + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, self_addr) == -EBUSY) + *parent = old; +} + +#endif /* CONFIG_FUNCTION_RET_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 166a207..f1af1aa 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -25,6 +25,17 @@ struct ftrace_ops { extern int function_trace_stop; +/* + * Type of the current tracing. + */ +enum ftrace_tracing_type_t { + FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */ + FTRACE_TYPE_RETURN, /* Hook the return of the function */ +}; + +/* Current tracing type, default is FTRACE_TYPE_ENTER */ +extern enum ftrace_tracing_type_t ftrace_tracing_type; + /** * ftrace_stop - stop function tracer. * @@ -104,6 +115,9 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +#ifdef CONFIG_FUNCTION_RET_TRACER +extern void ftrace_return_caller(void); +#endif /** * ftrace_make_nop - convert code into top @@ -310,7 +324,7 @@ struct ftrace_retfunc { /* Type of a callback handler of tracing return function */ typedef void (*trace_function_return_t)(struct ftrace_retfunc *); -extern void register_ftrace_return(trace_function_return_t func); +extern int register_ftrace_return(trace_function_return_t func); /* The current handler in use */ extern trace_function_return_t ftrace_function_return; extern void unregister_ftrace_return(void); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9c89526..b8378fa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -59,7 +59,6 @@ config FUNCTION_TRACER config FUNCTION_RET_TRACER bool "Kernel Function return Tracer" - depends on !DYNAMIC_FTRACE depends on HAVE_FUNCTION_RET_TRACER depends on FUNCTION_TRACER help diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b42ec1d..2f78a45 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -50,6 +50,9 @@ static int last_ftrace_enabled; /* Quick disabling of function tracer. */ int function_trace_stop; +/* By default, current tracing type is normal tracing. */ +enum ftrace_tracing_type_t ftrace_tracing_type = FTRACE_TYPE_ENTER; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -385,12 +388,21 @@ static void ftrace_bug(int failed, unsigned long ip) } } -#define FTRACE_ADDR ((long)(ftrace_caller)) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ip, fl; + unsigned long ftrace_addr; + +#ifdef CONFIG_FUNCTION_RET_TRACER + if (ftrace_tracing_type == FTRACE_TYPE_ENTER) + ftrace_addr = (unsigned long)ftrace_caller; + else + ftrace_addr = (unsigned long)ftrace_return_caller; +#else + ftrace_addr = (unsigned long)ftrace_caller; +#endif ip = rec->ip; @@ -450,9 +462,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) } if (rec->flags & FTRACE_FL_ENABLED) - return ftrace_make_call(rec, FTRACE_ADDR); + return ftrace_make_call(rec, ftrace_addr); else - return ftrace_make_nop(NULL, rec, FTRACE_ADDR); + return ftrace_make_nop(NULL, rec, ftrace_addr); } static void ftrace_replace_code(int enable) @@ -1405,10 +1417,17 @@ int register_ftrace_function(struct ftrace_ops *ops) return -1; mutex_lock(&ftrace_sysctl_lock); + + if (ftrace_tracing_type == FTRACE_TYPE_RETURN) { + ret = -EBUSY; + goto out; + } + ret = __register_ftrace_function(ops); ftrace_startup(); - mutex_unlock(&ftrace_sysctl_lock); +out: + mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1474,16 +1493,45 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, } #ifdef CONFIG_FUNCTION_RET_TRACER + +/* The callback that hooks the return of a function */ trace_function_return_t ftrace_function_return = (trace_function_return_t)ftrace_stub; -void register_ftrace_return(trace_function_return_t func) + +int register_ftrace_return(trace_function_return_t func) { + int ret = 0; + + mutex_lock(&ftrace_sysctl_lock); + + /* + * Don't launch return tracing if normal function + * tracing is already running. + */ + if (ftrace_trace_function != ftrace_stub) { + ret = -EBUSY; + goto out; + } + + ftrace_tracing_type = FTRACE_TYPE_RETURN; ftrace_function_return = func; + ftrace_startup(); + +out: + mutex_unlock(&ftrace_sysctl_lock); + return ret; } void unregister_ftrace_return(void) { + mutex_lock(&ftrace_sysctl_lock); + ftrace_function_return = (trace_function_return_t)ftrace_stub; + ftrace_shutdown(); + /* Restore normal tracing type */ + ftrace_tracing_type = FTRACE_TYPE_ENTER; + + mutex_unlock(&ftrace_sysctl_lock); } #endif diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c index 61185f7..a68564a 100644 --- a/kernel/trace/trace_functions_return.c +++ b/kernel/trace/trace_functions_return.c @@ -14,29 +14,18 @@ #include "trace.h" -static void start_return_trace(struct trace_array *tr) -{ - register_ftrace_return(&trace_function_return); -} - -static void stop_return_trace(struct trace_array *tr) -{ - unregister_ftrace_return(); -} - static int return_trace_init(struct trace_array *tr) { int cpu; for_each_online_cpu(cpu) tracing_reset(tr, cpu); - start_return_trace(tr); - return 0; + return register_ftrace_return(&trace_function_return); } static void return_trace_reset(struct trace_array *tr) { - stop_return_trace(tr); + unregister_ftrace_return(); } -- cgit v0.10.2 From 954e100d2275cb2f150f2b18d5cddcdf67b956ac Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:34 -0500 Subject: rcu: add rcu_read_*_sched_notrace() Impact: new API, useful for tracepoints and markers. Add _notrace version to rcu_read_*_sched(). Signed-off-by: Mathieu Desnoyers Reviewed-by: Paul E McKenney Signed-off-by: Ingo Molnar diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 86f1f5e..895dc9c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -142,6 +142,7 @@ struct rcu_head { * on the write-side to insure proper synchronization. */ #define rcu_read_lock_sched() preempt_disable() +#define rcu_read_lock_sched_notrace() preempt_disable_notrace() /* * rcu_read_unlock_sched - marks the end of a RCU-classic critical section @@ -149,6 +150,7 @@ struct rcu_head { * See rcu_read_lock_sched for more information. */ #define rcu_read_unlock_sched() preempt_enable() +#define rcu_read_unlock_sched_notrace() preempt_enable_notrace() -- cgit v0.10.2 From 2bdba316c989da028a59becf7516c6350ce3c173 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:35 -0500 Subject: markers: fix unregister Impact: fix marker registers/unregister race get_marker() can return a NULL entry because the mutex is released in the middle of those functions. Make sure we check to see if it has been concurrently removed. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/kernel/marker.c b/kernel/marker.c index 2898b64..de683a7 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -653,10 +653,11 @@ int marker_probe_register(const char *name, const char *format, goto end; } mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker(name); - WARN_ON(!entry); + if (!entry) + goto end; if (entry->rcu_pending) rcu_barrier_sched(); entry->oldptr = old; @@ -697,7 +698,7 @@ int marker_probe_unregister(const char *name, rcu_barrier_sched(); old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker(name); if (!entry) @@ -778,10 +779,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, rcu_barrier_sched(); old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(); /* may update entry */ + marker_update_probes(); mutex_lock(&markers_mutex); entry = get_marker_from_private_data(probe, probe_private); - WARN_ON(!entry); + if (!entry) + goto end; if (entry->rcu_pending) rcu_barrier_sched(); entry->oldptr = old; -- cgit v0.10.2 From e3f8c4b9117d70127a8cab480af83bbfd048a28b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Nov 2008 17:47:36 -0500 Subject: markers: add missing stdargs.h include, needed due to va_list usage Impact: build fix (for future changes) That seemed to cause built issue when marker.h is included early, even though stdargs.h is included in kernel.h. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/include/linux/marker.h b/include/linux/marker.h index 4cf4547..05ec0df 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -12,6 +12,7 @@ * See the file COPYING for more details. */ +#include #include struct module; -- cgit v0.10.2 From 021aeb057fc48af03fe5f37d3dda366c0d97aaf3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:37 -0500 Subject: markers: use rcu_*_sched_notrace and notrace Make marker critical code use notrace to make sure they can be used as an ftrace callback. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/kernel/marker.c b/kernel/marker.c index de683a7..22cd7ba 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -81,7 +81,7 @@ struct marker_entry { * though the function pointer change and the marker enabling are two distinct * operations that modifies the execution flow of preemptible code. */ -void __mark_empty_function(void *probe_private, void *call_private, +notrace void __mark_empty_function(void *probe_private, void *call_private, const char *fmt, va_list *args) { } @@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function); * need to put a full smp_rmb() in this branch. This is why we do not use * rcu_dereference() for the pointer read. */ -void marker_probe_cb(const struct marker *mdata, void *call_private, ...) +notrace void marker_probe_cb(const struct marker *mdata, + void *call_private, ...) { va_list args; char ptype; @@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) * sure the teardown of the callbacks can be done correctly when they * are in modules and they insure RCU read coherency. */ - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) va_end(args); } } - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); } EXPORT_SYMBOL_GPL(marker_probe_cb); @@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * * Should be connected to markers "MARK_NOARGS". */ -static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) +static notrace void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, ...) { va_list args; /* not initialized */ char ptype; - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -195,7 +197,7 @@ static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); } - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); } static void free_old_closure(struct rcu_head *head) -- cgit v0.10.2 From a419246ac7c2d9282dfd843103702895bb3f3fd7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:38 -0500 Subject: markers: use module notifier Impact: cleanup Use module notifiers instead of adding a hook in module.c. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/kernel/marker.c b/kernel/marker.c index 22cd7ba..348e70c 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -846,3 +846,32 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, return ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(marker_get_private_data); + +int marker_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + case MODULE_STATE_GOING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + } + return 0; +} + +struct notifier_block marker_module_nb = { + .notifier_call = marker_module_notify, + .priority = 0, +}; + +static int init_markers(void) +{ + return register_module_notifier(&marker_module_nb); +} +__initcall(init_markers); diff --git a/kernel/module.c b/kernel/module.c index 1f4cc00..72c6ca5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2184,10 +2184,6 @@ static noinline struct module *load_module(void __user *umod, struct mod_debug *debug; unsigned int num_debug; -#ifdef CONFIG_MARKERS - marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers); -#endif debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); dynamic_printk_setup(debug, num_debug); -- cgit v0.10.2 From c1df1bd2c4d4b20c83755a0f41956b57aec4842a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:39 -0500 Subject: markers: auto enable tracepoints (new API : trace_mark_tp()) Impact: new API Add a new API trace_mark_tp(), which declares a marker within a tracepoint probe. When the marker is activated, the tracepoint is automatically enabled. No branch test is used at the marker site, because it would be a duplicate of the branch already present in the tracepoint. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/include/linux/marker.h b/include/linux/marker.h index 05ec0df..57a30701 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -49,6 +49,8 @@ struct marker { void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; + const char *tp_name; /* Optional tracepoint name */ + void *tp_cb; /* Optional tracepoint callback */ } __attribute__((aligned(8))); #ifdef CONFIG_MARKERS @@ -73,7 +75,7 @@ struct marker { __attribute__((section("__markers"), aligned(8))) = \ { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ 0, 0, marker_probe_cb, \ - { __mark_empty_function, NULL}, NULL }; \ + { __mark_empty_function, NULL}, NULL, NULL, NULL }; \ __mark_check_format(format, ## args); \ if (unlikely(__mark_##name.state)) { \ (*__mark_##name.call) \ @@ -81,11 +83,38 @@ struct marker { } \ } while (0) +#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \ + do { \ + void __check_tp_type(void) \ + { \ + register_trace_##tp_name(tp_cb); \ + } \ + static const char __mstrtab_##name[] \ + __attribute__((section("__markers_strings"))) \ + = #name "\0" format; \ + static struct marker __mark_##name \ + __attribute__((section("__markers"), aligned(8))) = \ + { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ + 0, 0, marker_probe_cb, \ + { __mark_empty_function, NULL}, NULL, #tp_name, tp_cb };\ + __mark_check_format(format, ## args); \ + (*__mark_##name.call)(&__mark_##name, call_private, \ + ## args); \ + } while (0) + extern void marker_update_probe_range(struct marker *begin, struct marker *end); #else /* !CONFIG_MARKERS */ #define __trace_mark(generic, name, call_private, format, args...) \ __mark_check_format(format, ## args) +#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \ + do { \ + void __check_tp_type(void) \ + { \ + register_trace_##tp_name(tp_cb); \ + } \ + __mark_check_format(format, ## args); \ + } while (0) static inline void marker_update_probe_range(struct marker *begin, struct marker *end) { } @@ -118,6 +147,20 @@ static inline void marker_update_probe_range(struct marker *begin, __trace_mark(1, name, NULL, format, ## args) /** + * trace_mark_tp - Marker in a tracepoint callback + * @name: marker name, not quoted. + * @tp_name: tracepoint name, not quoted. + * @tp_cb: tracepoint callback. Should have an associated global symbol so it + * is not optimized away by the compiler (should not be static). + * @format: format string + * @args...: variable argument list + * + * Places a marker in a tracepoint callback. + */ +#define trace_mark_tp(name, tp_name, tp_cb, format, args...) \ + __trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args) + +/** * MARK_NOARGS - Format string for a marker with no argument. */ #define MARK_NOARGS " " diff --git a/init/Kconfig b/init/Kconfig index 86b00c5..f5bacb4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -808,6 +808,7 @@ config TRACEPOINTS config MARKERS bool "Activate markers" + depends on TRACEPOINTS help Place an empty function call at each marker site. Can be dynamically changed for a probe function. diff --git a/kernel/marker.c b/kernel/marker.c index 348e70c..c14ec26 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -479,7 +479,7 @@ static int marker_set_format(struct marker_entry *entry, const char *format) static int set_marker(struct marker_entry *entry, struct marker *elem, int active) { - int ret; + int ret = 0; WARN_ON(strcmp(entry->name, elem->name) != 0); if (entry->format) { @@ -531,9 +531,40 @@ static int set_marker(struct marker_entry *entry, struct marker *elem, */ smp_wmb(); elem->ptype = entry->ptype; + + if (elem->tp_name && (active ^ elem->state)) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + + if (active) { + /* + * try_module_get should always succeed because we hold + * lock_module() to get the tp_cb address. + */ + ret = try_module_get(__module_text_address( + (unsigned long)elem->tp_cb)); + BUG_ON(!ret); + ret = tracepoint_probe_register_noupdate( + elem->tp_name, + elem->tp_cb); + } else { + ret = tracepoint_probe_unregister_noupdate( + elem->tp_name, + elem->tp_cb); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address( + (unsigned long)elem->tp_cb)); + } + } elem->state = active; - return 0; + return ret; } /* @@ -544,7 +575,24 @@ static int set_marker(struct marker_entry *entry, struct marker *elem, */ static void disable_marker(struct marker *elem) { + int ret; + /* leave "call" as is. It is known statically. */ + if (elem->tp_name && elem->state) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + ret = tracepoint_probe_unregister_noupdate(elem->tp_name, + elem->tp_cb); + WARN_ON(ret); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address((unsigned long)elem->tp_cb)); + } elem->state = 0; elem->single.func = __mark_empty_function; /* Update the function before setting the ptype */ @@ -608,6 +656,7 @@ static void marker_update_probes(void) marker_update_probe_range(__start___markers, __stop___markers); /* Markers in modules. */ module_update_markers(); + tracepoint_probe_update_all(); } /** -- cgit v0.10.2 From a0bca6a59ebc052751eed6e3b182c153495672d8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:40 -0500 Subject: markers: create DEFINE_MARKER and GET_MARKER (new API) Impact: new API. Allow markers to be used only for declaration, without function call associated. Useful to create specialized probes. The problem we had is that two function calls were required when one wanted to put a marker in a tracepoint probe. Now the marker can be used simply for trace data type declaration, leaving the trace write work within the tracepoint probe without any additional function call. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/Documentation/markers.txt b/Documentation/markers.txt index 089f613..6d275e4 100644 --- a/Documentation/markers.txt +++ b/Documentation/markers.txt @@ -70,6 +70,20 @@ a printk warning which identifies the inconsistency: "Format mismatch for probe probe_name (format), marker (format)" +Another way to use markers is to simply define the marker without generating any +function call to actually call into the marker. This is useful in combination +with tracepoint probes in a scheme like this : + +void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk); + +DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name, + "arg1 %u pid %d"); + +notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk) +{ + struct marker *marker = &GET_MARKER(kernel_irq_entry); + /* write data to trace buffers ... */ +} * Probe / marker example diff --git a/include/linux/marker.h b/include/linux/marker.h index 57a30701..34c14bc 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -55,6 +55,22 @@ struct marker { #ifdef CONFIG_MARKERS +#define _DEFINE_MARKER(name, tp_name_str, tp_cb, format) \ + static const char __mstrtab_##name[] \ + __attribute__((section("__markers_strings"))) \ + = #name "\0" format; \ + static struct marker __mark_##name \ + __attribute__((section("__markers"), aligned(8))) = \ + { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ + 0, 0, marker_probe_cb, { __mark_empty_function, NULL},\ + NULL, tp_name_str, tp_cb } + +#define DEFINE_MARKER(name, format) \ + _DEFINE_MARKER(name, NULL, NULL, format) + +#define DEFINE_MARKER_TP(name, tp_name, tp_cb, format) \ + _DEFINE_MARKER(name, #tp_name, tp_cb, format) + /* * Note : the empty asm volatile with read constraint is used here instead of a * "used" attribute to fix a gcc 4.1.x bug. @@ -68,14 +84,7 @@ struct marker { */ #define __trace_mark(generic, name, call_private, format, args...) \ do { \ - static const char __mstrtab_##name[] \ - __attribute__((section("__markers_strings"))) \ - = #name "\0" format; \ - static struct marker __mark_##name \ - __attribute__((section("__markers"), aligned(8))) = \ - { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ - 0, 0, marker_probe_cb, \ - { __mark_empty_function, NULL}, NULL, NULL, NULL }; \ + DEFINE_MARKER(name, format); \ __mark_check_format(format, ## args); \ if (unlikely(__mark_##name.state)) { \ (*__mark_##name.call) \ @@ -89,14 +98,7 @@ struct marker { { \ register_trace_##tp_name(tp_cb); \ } \ - static const char __mstrtab_##name[] \ - __attribute__((section("__markers_strings"))) \ - = #name "\0" format; \ - static struct marker __mark_##name \ - __attribute__((section("__markers"), aligned(8))) = \ - { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ - 0, 0, marker_probe_cb, \ - { __mark_empty_function, NULL}, NULL, #tp_name, tp_cb };\ + DEFINE_MARKER_TP(name, tp_name, tp_cb, format); \ __mark_check_format(format, ## args); \ (*__mark_##name.call)(&__mark_##name, call_private, \ ## args); \ @@ -104,7 +106,11 @@ struct marker { extern void marker_update_probe_range(struct marker *begin, struct marker *end); + +#define GET_MARKER(name) (__mark_##name) + #else /* !CONFIG_MARKERS */ +#define DEFINE_MARKER(name, tp_name, tp_cb, format) #define __trace_mark(generic, name, call_private, format, args...) \ __mark_check_format(format, ## args) #define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \ @@ -118,6 +124,7 @@ extern void marker_update_probe_range(struct marker *begin, static inline void marker_update_probe_range(struct marker *begin, struct marker *end) { } +#define GET_MARKER(name) #endif /* CONFIG_MARKERS */ /** -- cgit v0.10.2 From 2504ea5edfebb14133b8571c20785cdc077e07d2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:41 -0500 Subject: tracepoints: samples, fix teardown Impact: fix a bug in sample tracepoints Need a tracepoint_synchronize_unregister() before the end of exit() to make sure every probe callers have exited the non preemptible section and thus are not executing the probe code anymore. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c index 55abfdd..e3a9648 100644 --- a/samples/tracepoints/tracepoint-probe-sample.c +++ b/samples/tracepoints/tracepoint-probe-sample.c @@ -46,6 +46,7 @@ void __exit tp_sample_trace_exit(void) { unregister_trace_subsys_eventb(probe_subsys_eventb); unregister_trace_subsys_event(probe_subsys_event); + tracepoint_synchronize_unregister(); } module_exit(tp_sample_trace_exit); diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c index 5e9fcf4..685a5ac 100644 --- a/samples/tracepoints/tracepoint-probe-sample2.c +++ b/samples/tracepoints/tracepoint-probe-sample2.c @@ -33,6 +33,7 @@ module_init(tp_sample_trace_init); void __exit tp_sample_trace_exit(void) { unregister_trace_subsys_event(probe_subsys_event); + tracepoint_synchronize_unregister(); } module_exit(tp_sample_trace_exit); -- cgit v0.10.2 From de0baf9ad661ac630a45a50ea1717cc4f4b33ace Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:42 -0500 Subject: tracepoints: fix disable Impact: fix race Set the probe array pointer to NULL when the tracepoint is disabled. The probe array point not being NULL could generate a race condition where the reader would dereference a freed pointer. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e96590f..47a7303 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -262,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry, static void disable_tracepoint(struct tracepoint *elem) { elem->state = 0; + rcu_assign_pointer(elem->funcs, NULL); } /** -- cgit v0.10.2 From da7b3eab167091693ad215ad7692f7d0d24d1356 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:43 -0500 Subject: tracepoints: use rcu_*_sched_notrace Make sure tracepoints can be called within ftrace callbacks. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 63064e9..69648c5 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -40,14 +40,14 @@ struct tracepoint { do { \ void **it_func; \ \ - rcu_read_lock_sched(); \ + rcu_read_lock_sched_notrace(); \ it_func = rcu_dereference((tp)->funcs); \ if (it_func) { \ do { \ ((void(*)(proto))(*it_func))(args); \ } while (*(++it_func)); \ } \ - rcu_read_unlock_sched(); \ + rcu_read_unlock_sched_notrace(); \ } while (0) /* -- cgit v0.10.2 From c420970ef476d7d68df119711700666224001f43 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:44 -0500 Subject: tracepoints: use unregister return value Impact: bugfix. Unregistering a tracepoint can fail. Return the error value. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 69648c5..c60a791 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -73,9 +73,9 @@ struct tracepoint { return tracepoint_probe_register(#name ":" #proto, \ (void *)probe); \ } \ - static inline void unregister_trace_##name(void (*probe)(proto))\ + static inline int unregister_trace_##name(void (*probe)(proto)) \ { \ - tracepoint_probe_unregister(#name ":" #proto, \ + return tracepoint_probe_unregister(#name ":" #proto, \ (void *)probe); \ } @@ -92,8 +92,10 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, { \ return -ENOSYS; \ } \ - static inline void unregister_trace_##name(void (*probe)(proto))\ - { } + static inline int unregister_trace_##name(void (*probe)(proto)) \ + { \ + return -ENOSYS; \ + } static inline void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) -- cgit v0.10.2 From 5f382671def7cb9c0f4b75d586dc5f60dca5e1c3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:45 -0500 Subject: tracepoints: do not put arguments in name Impact: cleanup That's overkill, takes space. We have a global tracepoint registery in header files anyway. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c60a791..7e9b42a 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -60,7 +60,7 @@ struct tracepoint { { \ static const char __tpstrtab_##name[] \ __attribute__((section("__tracepoints_strings"))) \ - = #name ":" #proto; \ + = #name; \ static struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"), aligned(8))) = \ { __tpstrtab_##name, 0, NULL }; \ @@ -70,13 +70,11 @@ struct tracepoint { } \ static inline int register_trace_##name(void (*probe)(proto)) \ { \ - return tracepoint_probe_register(#name ":" #proto, \ - (void *)probe); \ + return tracepoint_probe_register(#name, (void *)probe); \ } \ static inline int unregister_trace_##name(void (*probe)(proto)) \ { \ - return tracepoint_probe_unregister(#name ":" #proto, \ - (void *)probe); \ + return tracepoint_probe_unregister(#name, (void *)probe);\ } extern void tracepoint_update_probe_range(struct tracepoint *begin, -- cgit v0.10.2 From 32f85742778dfc2c74975cf0b9f5bdb13470cb32 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:46 -0500 Subject: tracepoints: use modules notifiers Impact: cleanup Use module notifiers for tracepoint updates rather than adding a hook in module.c. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/kernel/module.c b/kernel/module.c index 72c6ca5..fc1dff9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2187,11 +2187,6 @@ static noinline struct module *load_module(void __user *umod, debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); dynamic_printk_setup(debug, num_debug); - -#ifdef CONFIG_TRACEPOINTS - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); -#endif } /* sechdrs[0].sh_size is always zero */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 47a7303..94ac4e3 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -541,3 +541,32 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) iter->tracepoint = NULL; } EXPORT_SYMBOL_GPL(tracepoint_iter_reset); + +int tracepoint_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + break; + case MODULE_STATE_GOING: + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + break; + } + return 0; +} + +struct notifier_block tracepoint_module_nb = { + .notifier_call = tracepoint_module_notify, + .priority = 0, +}; + +static int init_tracepoints(void) +{ + return register_module_notifier(&tracepoint_module_nb); +} +__initcall(init_tracepoints); -- cgit v0.10.2 From 7e066fb870fcd1025ec3ba7bbde5d541094f4ce1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:47 -0500 Subject: tracepoints: add DECLARE_TRACE() and DEFINE_TRACE() Impact: API *CHANGE*. Must update all tracepoint users. Add DEFINE_TRACE() to tracepoints to let them declare the tracepoint structure in a single spot for all the kernel. It helps reducing memory consumption, especially when declaring a lot of tracepoints, e.g. for kmalloc tracing. *API CHANGE WARNING*: now, DECLARE_TRACE() must be used in headers for tracepoint declarations rather than DEFINE_TRACE(). This is the sane way to do it. The name previously used was misleading. Updates scheduler instrumentation to follow this API change. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 5d354e1..e8ad47b 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -42,7 +42,7 @@ In include/trace/subsys.h : #include -DEFINE_TRACE(subsys_eventname, +DECLARE_TRACE(subsys_eventname, TPPTOTO(int firstarg, struct task_struct *p), TPARGS(firstarg, p)); @@ -50,6 +50,8 @@ In subsys/file.c (where the tracing statement must be added) : #include +DEFINE_TRACE(subsys_eventname); + void somefct(void) { ... @@ -86,6 +88,9 @@ to limit collisions. Tracepoint names are global to the kernel: they are considered as being the same whether they are in the core kernel image or in modules. +If the tracepoint has to be used in kernel modules, an +EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be used to +export the defined tracepoints. * Probe / tracepoint example diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index a5e4ed9..3b46ae4 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -71,6 +71,7 @@ VMLINUX_SYMBOL(__start___markers) = .; \ *(__markers) \ VMLINUX_SYMBOL(__stop___markers) = .; \ + . = ALIGN(32); \ VMLINUX_SYMBOL(__start___tracepoints) = .; \ *(__tracepoints) \ VMLINUX_SYMBOL(__stop___tracepoints) = .; \ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 7e9b42a..7570054 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -24,8 +24,12 @@ struct tracepoint { const char *name; /* Tracepoint name */ int state; /* State. */ void **funcs; -} __attribute__((aligned(8))); - +} __attribute__((aligned(32))); /* + * Aligned on 32 bytes because it is + * globally visible and gcc happily + * align these on the structure size. + * Keep in sync with vmlinux.lds.h. + */ #define TPPROTO(args...) args #define TPARGS(args...) args @@ -55,15 +59,10 @@ struct tracepoint { * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. */ -#define DEFINE_TRACE(name, proto, args) \ +#define DECLARE_TRACE(name, proto, args) \ + extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - static const char __tpstrtab_##name[] \ - __attribute__((section("__tracepoints_strings"))) \ - = #name; \ - static struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"), aligned(8))) = \ - { __tpstrtab_##name, 0, NULL }; \ if (unlikely(__tracepoint_##name.state)) \ __DO_TRACE(&__tracepoint_##name, \ TPPROTO(proto), TPARGS(args)); \ @@ -77,11 +76,23 @@ struct tracepoint { return tracepoint_probe_unregister(#name, (void *)probe);\ } +#define DEFINE_TRACE(name) \ + static const char __tpstrtab_##name[] \ + __attribute__((section("__tracepoints_strings"))) = #name; \ + struct tracepoint __tracepoint_##name \ + __attribute__((section("__tracepoints"), aligned(32))) = \ + { __tpstrtab_##name, 0, NULL } + +#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ + EXPORT_SYMBOL_GPL(__tracepoint_##name) +#define EXPORT_TRACEPOINT_SYMBOL(name) \ + EXPORT_SYMBOL(__tracepoint_##name) + extern void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end); #else /* !CONFIG_TRACEPOINTS */ -#define DEFINE_TRACE(name, proto, args) \ +#define DECLARE_TRACE(name, proto, args) \ static inline void _do_trace_##name(struct tracepoint *tp, proto) \ { } \ static inline void trace_##name(proto) \ @@ -95,6 +106,10 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, return -ENOSYS; \ } +#define DEFINE_TRACE(name) +#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) +#define EXPORT_TRACEPOINT_SYMBOL(name) + static inline void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) { } diff --git a/include/trace/sched.h b/include/trace/sched.h index ad47369..9b2854a 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -4,52 +4,52 @@ #include #include -DEFINE_TRACE(sched_kthread_stop, +DECLARE_TRACE(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t)); -DEFINE_TRACE(sched_kthread_stop_ret, +DECLARE_TRACE(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret)); -DEFINE_TRACE(sched_wait_task, +DECLARE_TRACE(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p)); -DEFINE_TRACE(sched_wakeup, +DECLARE_TRACE(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p)); -DEFINE_TRACE(sched_wakeup_new, +DECLARE_TRACE(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p)); -DEFINE_TRACE(sched_switch, +DECLARE_TRACE(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next)); -DEFINE_TRACE(sched_migrate_task, +DECLARE_TRACE(sched_migrate_task, TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu), TPARGS(rq, p, dest_cpu)); -DEFINE_TRACE(sched_process_free, +DECLARE_TRACE(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p)); -DEFINE_TRACE(sched_process_exit, +DECLARE_TRACE(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p)); -DEFINE_TRACE(sched_process_wait, +DECLARE_TRACE(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid)); -DEFINE_TRACE(sched_process_fork, +DECLARE_TRACE(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child)); -DEFINE_TRACE(sched_signal_send, +DECLARE_TRACE(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p)); diff --git a/kernel/exit.c b/kernel/exit.c index ae2b92b..f995d24 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,10 @@ #include #include +DEFINE_TRACE(sched_process_free); +DEFINE_TRACE(sched_process_exit); +DEFINE_TRACE(sched_process_wait); + static void exit_mm(struct task_struct * tsk); static inline int task_detached(struct task_struct *p) diff --git a/kernel/fork.c b/kernel/fork.c index f608356..0837d0d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -79,6 +79,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +DEFINE_TRACE(sched_process_fork); + int nr_processes(void) { int cpu; diff --git a/kernel/kthread.c b/kernel/kthread.c index 8e7a7ce..4fbc456 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; +DEFINE_TRACE(sched_kthread_stop); +DEFINE_TRACE(sched_kthread_stop_ret); + struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ diff --git a/kernel/sched.c b/kernel/sched.c index 50a21f9..327f91c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -118,6 +118,12 @@ */ #define RUNTIME_INF ((u64)~0ULL) +DEFINE_TRACE(sched_wait_task); +DEFINE_TRACE(sched_wakeup); +DEFINE_TRACE(sched_wakeup_new); +DEFINE_TRACE(sched_switch); +DEFINE_TRACE(sched_migrate_task); + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) diff --git a/kernel/signal.c b/kernel/signal.c index 4530fc6..e9afe63 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -41,6 +41,8 @@ static struct kmem_cache *sigqueue_cachep; +DEFINE_TRACE(sched_signal_send); + static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h index 0216b55..01724e0 100644 --- a/samples/tracepoints/tp-samples-trace.h +++ b/samples/tracepoints/tp-samples-trace.h @@ -4,10 +4,10 @@ #include /* for struct inode and struct file */ #include -DEFINE_TRACE(subsys_event, +DECLARE_TRACE(subsys_event, TPPROTO(struct inode *inode, struct file *file), TPARGS(inode, file)); -DEFINE_TRACE(subsys_eventb, +DECLARE_TRACE(subsys_eventb, TPPROTO(void), TPARGS()); #endif diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c index 4ae4b7f..00d1697 100644 --- a/samples/tracepoints/tracepoint-sample.c +++ b/samples/tracepoints/tracepoint-sample.c @@ -13,6 +13,9 @@ #include #include "tp-samples-trace.h" +DEFINE_TRACE(subsys_event); +DEFINE_TRACE(subsys_eventb); + struct proc_dir_entry *pentry_example; static int my_open(struct inode *inode, struct file *file) -- cgit v0.10.2 From 8fd88d159031bd238dad1d7186a2030b9f9349de Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:48 -0500 Subject: tracepoints: documentation fix for teardown Impact: documentation update Need a tracepoint_synchronize_unregister() before the end of exit() to make sure every probe callers have exited the non preemptible section and thus are not executing the probe code anymore. Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index e8ad47b..93cd90e 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -70,10 +70,12 @@ Where : Connecting a function (probe) to a tracepoint is done by providing a probe (function to call) for the specific tracepoint through register_trace_subsys_eventname(). Removing a probe is done through -unregister_trace_subsys_eventname(); it will remove the probe sure there is no -caller left using the probe when it returns. Probe removal is preempt-safe -because preemption is disabled around the probe call. See the "Probe example" -section below for a sample probe module. +unregister_trace_subsys_eventname(); it will remove the probe. +marker_synchronize_unregister() must be called before the end of the module exit +function to make sure there is no caller left using the probe. This, and the +fact that preemption is disabled around the probe call, make sure that probe +removal and module unload are safe. See the "Probe example" section below for a +sample probe module. The tracepoint mechanism supports inserting multiple instances of the same tracepoint, but a single definition must be made of a given tracepoint name over -- cgit v0.10.2 From 0dcf8fe5fe5d7279f1c479fa82f1f1ca6f22e814 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 14 Nov 2008 17:47:49 -0500 Subject: tracepoints, docs: marker_synchronize_unregister->tracepoint_synchronize_unregister Impact: documentation update. Signed-off-by: Zhaolei Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 93cd90e..3a1c743 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -71,7 +71,7 @@ Connecting a function (probe) to a tracepoint is done by providing a probe (function to call) for the specific tracepoint through register_trace_subsys_eventname(). Removing a probe is done through unregister_trace_subsys_eventname(); it will remove the probe. -marker_synchronize_unregister() must be called before the end of the module exit +tracepoint_synchronize_unregister() must be called before the end of the module exit function to make sure there is no caller left using the probe. This, and the fact that preemption is disabled around the probe call, make sure that probe removal and module unload are safe. See the "Probe example" section below for a -- cgit v0.10.2 From 0a7ad64531713e33e39af95bdbfb172f4f328b1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 16 Nov 2008 08:54:36 +0100 Subject: tracepoints: format documentation Impact: documentation update Properly format Documentation/tracepoints.txt - it was full of overlong lines and other typographical problems. Signed-off-by: Ingo Molnar diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 3a1c743..2d42241 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -3,28 +3,30 @@ Mathieu Desnoyers -This document introduces Linux Kernel Tracepoints and their use. It provides -examples of how to insert tracepoints in the kernel and connect probe functions -to them and provides some examples of probe functions. +This document introduces Linux Kernel Tracepoints and their use. It +provides examples of how to insert tracepoints in the kernel and +connect probe functions to them and provides some examples of probe +functions. * Purpose of tracepoints -A tracepoint placed in code provides a hook to call a function (probe) that you -can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or -"off" (no probe is attached). When a tracepoint is "off" it has no effect, -except for adding a tiny time penalty (checking a condition for a branch) and -space penalty (adding a few bytes for the function call at the end of the -instrumented function and adds a data structure in a separate section). When a -tracepoint is "on", the function you provide is called each time the tracepoint -is executed, in the execution context of the caller. When the function provided -ends its execution, it returns to the caller (continuing from the tracepoint -site). +A tracepoint placed in code provides a hook to call a function (probe) +that you can provide at runtime. A tracepoint can be "on" (a probe is +connected to it) or "off" (no probe is attached). When a tracepoint is +"off" it has no effect, except for adding a tiny time penalty +(checking a condition for a branch) and space penalty (adding a few +bytes for the function call at the end of the instrumented function +and adds a data structure in a separate section). When a tracepoint +is "on", the function you provide is called each time the tracepoint +is executed, in the execution context of the caller. When the function +provided ends its execution, it returns to the caller (continuing from +the tracepoint site). You can put tracepoints at important locations in the code. They are lightweight hooks that can pass an arbitrary number of parameters, -which prototypes are described in a tracepoint declaration placed in a header -file. +which prototypes are described in a tracepoint declaration placed in a +header file. They can be used for tracing and performance accounting. @@ -63,36 +65,41 @@ Where : - subsys_eventname is an identifier unique to your event - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function - called by this tracepoint. -- TPARGS(firstarg, p) are the parameters names, same as found in the prototype. -Connecting a function (probe) to a tracepoint is done by providing a probe -(function to call) for the specific tracepoint through +- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the + function called by this tracepoint. + +- TPARGS(firstarg, p) are the parameters names, same as found in the + prototype. + +Connecting a function (probe) to a tracepoint is done by providing a +probe (function to call) for the specific tracepoint through register_trace_subsys_eventname(). Removing a probe is done through unregister_trace_subsys_eventname(); it will remove the probe. -tracepoint_synchronize_unregister() must be called before the end of the module exit -function to make sure there is no caller left using the probe. This, and the -fact that preemption is disabled around the probe call, make sure that probe -removal and module unload are safe. See the "Probe example" section below for a -sample probe module. - -The tracepoint mechanism supports inserting multiple instances of the same -tracepoint, but a single definition must be made of a given tracepoint name over -all the kernel to make sure no type conflict will occur. Name mangling of the -tracepoints is done using the prototypes to make sure typing is correct. -Verification of probe type correctness is done at the registration site by the -compiler. Tracepoints can be put in inline functions, inlined static functions, -and unrolled loops as well as regular functions. - -The naming scheme "subsys_event" is suggested here as a convention intended -to limit collisions. Tracepoint names are global to the kernel: they are -considered as being the same whether they are in the core kernel image or in -modules. + +tracepoint_synchronize_unregister() must be called before the end of +the module exit function to make sure there is no caller left using +the probe. This, and the fact that preemption is disabled around the +probe call, make sure that probe removal and module unload are safe. +See the "Probe example" section below for a sample probe module. + +The tracepoint mechanism supports inserting multiple instances of the +same tracepoint, but a single definition must be made of a given +tracepoint name over all the kernel to make sure no type conflict will +occur. Name mangling of the tracepoints is done using the prototypes +to make sure typing is correct. Verification of probe type correctness +is done at the registration site by the compiler. Tracepoints can be +put in inline functions, inlined static functions, and unrolled loops +as well as regular functions. + +The naming scheme "subsys_event" is suggested here as a convention +intended to limit collisions. Tracepoint names are global to the +kernel: they are considered as being the same whether they are in the +core kernel image or in modules. If the tracepoint has to be used in kernel modules, an -EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be used to -export the defined tracepoints. +EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be +used to export the defined tracepoints. * Probe / tracepoint example -- cgit v0.10.2 From 227a837567e339c74d9d4243d03a29bd943a018c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 16 Nov 2008 09:50:34 +0100 Subject: markers/tracpoints: fix non-modular build fix: kernel/marker.c: In function 'marker_module_notify': kernel/marker.c:905: error: 'MODULE_STATE_COMING' undeclared (first use in this function) [...] Signed-off-by: Ingo Molnar diff --git a/kernel/marker.c b/kernel/marker.c index c14ec26..ea54f26 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -896,6 +896,8 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, } EXPORT_SYMBOL_GPL(marker_get_private_data); +#ifdef CONFIG_MODULES + int marker_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -924,3 +926,5 @@ static int init_markers(void) return register_module_notifier(&marker_module_nb); } __initcall(init_markers); + +#endif /* CONFIG_MODULES */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 94ac4e3..7960274 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -542,6 +542,8 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) } EXPORT_SYMBOL_GPL(tracepoint_iter_reset); +#ifdef CONFIG_MODULES + int tracepoint_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -570,3 +572,5 @@ static int init_tracepoints(void) return register_module_notifier(&tracepoint_module_nb); } __initcall(init_tracepoints); + +#endif /* CONFIG_MODULES */ -- cgit v0.10.2 From 0c726da983de0704254250ef6495ca152e7abcca Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 16 Nov 2008 16:07:58 +0530 Subject: tracing: branch tracer, fix writing to trace/trace_options Impact: fix trace_options behavior writing to trace/trace_options use the index of the array to find the value of the flag. With branch tracer flag defined conditionally, this breaks writing to trace_options with branch tracer disabled. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4a90462..b04923b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -259,9 +259,7 @@ static const char *trace_options[] = { "sched-tree", "ftrace_printk", "ftrace_preempt", -#ifdef CONFIG_BRANCH_TRACER "branch", -#endif "annotate", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 790ea8c..b41d7b4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -470,9 +470,7 @@ enum trace_iterator_flags { TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, TRACE_ITER_PREEMPTONLY = 0x800, -#ifdef CONFIG_BRANCH_TRACER TRACE_ITER_BRANCH = 0x1000, -#endif TRACE_ITER_ANNOTATE = 0x2000, }; -- cgit v0.10.2 From adf9f19574334c9a29a2bc956009fcac7edf1a6b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 17 Nov 2008 19:23:42 +0100 Subject: tracing/ftrace: implement a set_flag callback for tracers Impact: give a way to send specific messages to tracers The current implementation of tracing uses some flags to control the output of general tracers. But we have no way to implement custom flags handling for a specific tracer. This patch proposes a new callback for the struct tracer which called set_flag and a structure that represents a 32 bits variable flag. A tracer can implement a struct tracer_flags on which it puts the initial value of the flag integer. Than it can place a range of flags with their name and their flag mask on the flag integer. The structure that implement a single flag is called struct tracer_opt. These custom flags will be available through the trace_options file like the general tracing flags. Changing their value is done like the other general flags. For example if you have a flag that calls "foo", you can activate it by writing "foo" or "nofoo" on trace_options. Note that the set_flag callback is optional and is only needed if you want the flags changing to be signaled to your tracer and let it to accept or refuse their assignment. V2: Some arrangements in coding style.... Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2596b5a..9531fdd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -43,6 +43,20 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +/* For tracers that don't implement custom flags */ +static struct tracer_opt dummy_tracer_opt[] = { + { } +}; + +static struct tracer_flags dummy_tracer_flags = { + .val = 0, + .opts = dummy_tracer_opt +}; + +static int dummy_set_flag(u32 old_flags, u32 bit, int set) +{ + return 0; +} /* * Kill all tracing for good (never come back). @@ -529,6 +543,14 @@ int register_tracer(struct tracer *type) } } + if (!type->set_flag) + type->set_flag = &dummy_set_flag; + if (!type->flags) + type->flags = &dummy_tracer_flags; + else + if (!type->flags->opts) + type->flags->opts = dummy_tracer_opt; + #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest) { struct tracer *saved_tracer = current_trace; @@ -2426,10 +2448,13 @@ static ssize_t tracing_trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + int i; char *buf; int r = 0; int len = 0; - int i; + u32 tracer_flags = current_trace->flags->val; + struct tracer_opt *trace_opts = current_trace->flags->opts; + /* calulate max size */ for (i = 0; trace_options[i]; i++) { @@ -2437,6 +2462,15 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and space */ } + /* + * Increase the size with names of options specific + * of the current tracer. + */ + for (i = 0; trace_opts[i].name; i++) { + len += strlen(trace_opts[i].name); + len += 3; /* "no" and space */ + } + /* +2 for \n and \0 */ buf = kmalloc(len + 2, GFP_KERNEL); if (!buf) @@ -2449,6 +2483,15 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, r += sprintf(buf + r, "no%s ", trace_options[i]); } + for (i = 0; trace_opts[i].name; i++) { + if (tracer_flags & trace_opts[i].bit) + r += sprintf(buf + r, "%s ", + trace_opts[i].name); + else + r += sprintf(buf + r, "no%s ", + trace_opts[i].name); + } + r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); @@ -2459,6 +2502,40 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, return r; } +/* Try to assign a tracer specific option */ +static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +{ + struct tracer_flags *trace_flags = trace->flags; + struct tracer_opt *opts = NULL; + int ret = 0, i = 0; + int len; + + for (i = 0; trace_flags->opts[i].name; i++) { + opts = &trace_flags->opts[i]; + len = strlen(opts->name); + + if (strncmp(cmp, opts->name, len) == 0) { + ret = trace->set_flag(trace_flags->val, + opts->bit, !neg); + break; + } + } + /* Not found */ + if (!trace_flags->opts[i].name) + return -EINVAL; + + /* Refused to handle */ + if (ret) + return ret; + + if (neg) + trace_flags->val &= ~opts->bit; + else + trace_flags->val |= opts->bit; + + return 0; +} + static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -2466,6 +2543,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, char buf[64]; char *cmp = buf; int neg = 0; + int ret; int i; if (cnt >= sizeof(buf)) @@ -2492,11 +2570,13 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, break; } } - /* - * If no option could be set, return an error: - */ - if (!trace_options[i]) - return -EINVAL; + + /* If no option could be set, test the specific tracer options */ + if (!trace_options[i]) { + ret = set_tracer_option(current_trace, cmp, neg); + if (ret) + return ret; + } filp->f_pos += cnt; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 37947f6..9d22618 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -259,6 +259,29 @@ enum print_line_t { TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ }; + +/* + * An option specific to a tracer. This is a boolean value. + * The bit is the bit index that sets its value on the + * flags value in struct tracer_flags. + */ +struct tracer_opt { + const char *name; /* Will appear on the trace_options file */ + u32 bit; /* Mask assigned in val field in tracer_flags */ +}; + +/* + * The set of specific options for a tracer. Your tracer + * have to set the initial value of the flags val. + */ +struct tracer_flags { + u32 val; + struct tracer_opt *opts; +}; + +/* Makes more easy to define a tracer opt */ +#define TRACER_OPT(s, b) .name = #s, .bit = b + /* * A specific tracer, represented by methods that operate on a trace array: */ @@ -280,8 +303,11 @@ struct tracer { struct trace_array *tr); #endif enum print_line_t (*print_line)(struct trace_iterator *iter); + /* If you handled the flag setting, return 0 */ + int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; int print_max; + struct tracer_flags *flags; }; struct trace_seq { -- cgit v0.10.2 From 0619faf657806b943e6acf51f60f1cd023a96c78 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 17 Nov 2008 19:26:30 +0100 Subject: tracing/ftrace: make nop tracer using tracer flags Impact: give an example on how to use specific tracer flags This patch propose to use the nop tracer to provide an example for using the tracer's custom flags implementation. V2: replace structures and defines just after the headers includes for cleanliness. V3: replace defines by enum values. Signed-off-by: Frederic Weisbecker Acked-by: Steven Noonan Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 0e77415..b9767ac 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -12,6 +12,27 @@ #include "trace.h" +/* Our two options */ +enum { + TRACE_NOP_OPT_ACCEPT = 0x1, + TRACE_NOP_OPT_REFUSE = 0x2 +}; + +/* Options for the tracer (see trace_options file) */ +static struct tracer_opt nop_opts[] = { + /* Option that will be accepted by set_flag callback */ + { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) }, + /* Option that will be refused by set_flag callback */ + { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) }, + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags nop_flags = { + /* You can check your flags value here when you want. */ + .val = 0, /* By default: all flags disabled */ + .opts = nop_opts +}; + static struct trace_array *ctx_trace; static void start_nop_trace(struct trace_array *tr) @@ -41,6 +62,35 @@ static void nop_trace_reset(struct trace_array *tr) stop_nop_trace(tr); } +/* It only serves as a signal handler and a callback to + * accept or refuse tthe setting of a flag. + * If you don't implement it, then the flag setting will be + * automatically accepted. + */ +static int nop_set_flag(u32 old_flags, u32 bit, int set) +{ + /* + * Note that you don't need to update nop_flags.val yourself. + * The tracing Api will do it automatically if you return 0 + */ + if (bit == TRACE_NOP_OPT_ACCEPT) { + printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept." + " Now cat trace_options to see the result\n", + set); + return 0; + } + + if (bit == TRACE_NOP_OPT_REFUSE) { + printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." + "Now cat trace_options to see the result\n", + set); + return -EINVAL; + } + + return 0; +} + + struct tracer nop_trace __read_mostly = { .name = "nop", @@ -49,5 +99,7 @@ struct tracer nop_trace __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif + .flags = &nop_flags, + .set_flag = nop_set_flag }; -- cgit v0.10.2 From 0231022cc32d5f2e7f3c06b75691dda0ad6aec33 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 17 Nov 2008 03:22:41 +0100 Subject: tracing/function-return-tracer: add the overrun field Impact: help to find the better depth of trace We decided to arbitrary define the depth of function return trace as "20". Perhaps this is not enough. To help finding an optimal depth, we measure now the overrun: the number of functions that have been missed for the current thread. By default this is not displayed, we have to do set a particular flag on the return tracer: echo overrun > /debug/tracing/trace_options And the overrun will be printed on the right. As the trace shows below, the current 20 depth is not enough. update_wall_time+0x37f/0x8c0 -> update_xtime_cache (345 ns) (Overruns: 2838) update_wall_time+0x384/0x8c0 -> clocksource_get_next (1141 ns) (Overruns: 2838) do_timer+0x23/0x100 -> update_wall_time (3882 ns) (Overruns: 2838) tick_do_update_jiffies64+0xbf/0x160 -> do_timer (5339 ns) (Overruns: 2838) tick_sched_timer+0x6a/0xf0 -> tick_do_update_jiffies64 (7209 ns) (Overruns: 2838) vgacon_set_cursor_size+0x98/0x120 -> native_io_delay (2613 ns) (Overruns: 274) vgacon_cursor+0x16e/0x1d0 -> vgacon_set_cursor_size (33151 ns) (Overruns: 274) set_cursor+0x5f/0x80 -> vgacon_cursor (36432 ns) (Overruns: 274) con_flush_chars+0x34/0x40 -> set_cursor (38790 ns) (Overruns: 274) release_console_sem+0x1ec/0x230 -> up (721 ns) (Overruns: 274) release_console_sem+0x225/0x230 -> wake_up_klogd (316 ns) (Overruns: 274) con_flush_chars+0x39/0x40 -> release_console_sem (2996 ns) (Overruns: 274) con_write+0x22/0x30 -> con_flush_chars (46067 ns) (Overruns: 274) n_tty_write+0x1cc/0x360 -> con_write (292670 ns) (Overruns: 274) smp_apic_timer_interrupt+0x2a/0x90 -> native_apic_mem_write (330 ns) (Overruns: 274) irq_enter+0x17/0x70 -> idle_cpu (413 ns) (Overruns: 274) smp_apic_timer_interrupt+0x2f/0x90 -> irq_enter (1525 ns) (Overruns: 274) ktime_get_ts+0x40/0x70 -> getnstimeofday (465 ns) (Overruns: 274) ktime_get_ts+0x60/0x70 -> set_normalized_timespec (436 ns) (Overruns: 274) ktime_get+0x16/0x30 -> ktime_get_ts (2501 ns) (Overruns: 274) hrtimer_interrupt+0x77/0x1a0 -> ktime_get (3439 ns) (Overruns: 274) Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a711583..e90e81e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -21,6 +21,7 @@ struct task_struct; struct exec_domain; #include #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -45,6 +46,11 @@ struct thread_info { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; + /* + * Number of functions that haven't been traced + * because of depth overrun. + */ + atomic_t trace_overrun; #endif }; @@ -61,6 +67,7 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ .curr_ret_stack = -1,\ + .trace_overrun = ATOMIC_INIT(0) \ } #else #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 924153e..356bb1e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -353,8 +353,10 @@ static int push_return_trace(unsigned long ret, unsigned long long time, struct thread_info *ti = current_thread_info(); /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { + atomic_inc(&ti->trace_overrun); return -EBUSY; + } index = ++ti->curr_ret_stack; barrier(); @@ -367,7 +369,7 @@ static int push_return_trace(unsigned long ret, unsigned long long time, /* Retrieve a function return address to the trace stack on thread info.*/ static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func) + unsigned long *func, unsigned long *overrun) { int index; @@ -376,6 +378,7 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, *ret = ti->ret_stack[index].ret; *func = ti->ret_stack[index].func; *time = ti->ret_stack[index].calltime; + *overrun = atomic_read(&ti->trace_overrun); ti->curr_ret_stack--; } @@ -386,7 +389,8 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, unsigned long ftrace_return_to_handler(void) { struct ftrace_retfunc trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + pop_return_trace(&trace.ret, &trace.calltime, &trace.func, + &trace.overrun); trace.rettime = cpu_clock(raw_smp_processor_id()); ftrace_function_return(&trace); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f1af1aa..f7ba4ea 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -318,6 +318,8 @@ struct ftrace_retfunc { unsigned long func; /* Current function */ unsigned long long calltime; unsigned long long rettime; + /* Number of functions that overran the depth limit for current task */ + unsigned long overrun; }; #ifdef CONFIG_FUNCTION_RET_TRACER diff --git a/include/linux/sched.h b/include/linux/sched.h index 61c8cc3..c8e0db4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2016,6 +2016,7 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct * used. */ task_thread_info(p)->curr_ret_stack = -1; + atomic_set(&task_thread_info(p)->trace_overrun, 0); #endif } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9531fdd..e97c29a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -853,6 +853,7 @@ static void __trace_function_return(struct trace_array *tr, entry->parent_ip = trace->ret; entry->rettime = trace->rettime; entry->calltime = trace->calltime; + entry->overrun = trace->overrun; ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); } #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9d22618..2cb12fd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -60,6 +60,7 @@ struct ftrace_ret_entry { unsigned long parent_ip; unsigned long long calltime; unsigned long long rettime; + unsigned long overrun; }; extern struct tracer boot_tracer; diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c index a68564a..e00d645 100644 --- a/kernel/trace/trace_functions_return.c +++ b/kernel/trace/trace_functions_return.c @@ -14,6 +14,19 @@ #include "trace.h" +#define TRACE_RETURN_PRINT_OVERRUN 0x1 +static struct tracer_opt trace_opts[] = { + /* Display overruns or not */ + { TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, /* Don't display overruns by default */ + .opts = trace_opts +}; + + static int return_trace_init(struct trace_array *tr) { int cpu; @@ -42,26 +55,39 @@ print_return_function(struct trace_iterator *iter) ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + ret = seq_print_ip_sym(s, field->ip, trace_flags & TRACE_ITER_SYM_MASK); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " (%llu ns)\n", + + ret = trace_seq_printf(s, " (%llu ns)", field->rettime - field->calltime); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; + + if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)", + field->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; } return TRACE_TYPE_UNHANDLED; } -static struct tracer return_trace __read_mostly = -{ +static struct tracer return_trace __read_mostly = { .name = "return", .init = return_trace_init, .reset = return_trace_reset, - .print_line = print_return_function + .print_line = print_return_function, + .flags = &tracer_flags, }; static __init int init_return_trace(void) -- cgit v0.10.2 From a22506347d038a66506c6f57e9b97104128e280d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 18 Nov 2008 18:06:35 +0100 Subject: ftrace: preemptoff selftest not working Impact: fix preemptoff and preemptirqsoff tracer self-tests I was wondering why the preemptoff and preemptirqsoff tracer selftests don't work on s390. After all its just that they get called from non-preemptible context: kernel_init() will execute all initcalls, however the first line in kernel_init() is lock_kernel(), which causes the preempt_count to be increased. Any later calls to add_preempt_count() (especially those from the selftests) will therefore not result in a call to trace_preempt_off() since the check below in add_preempt_count() will be false: if (preempt_count() == val) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); Hence the trace buffer will be empty. Fix this by releasing the BKL during the self-tests. Signed-off-by: Heiko Carstens Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 396fda0..1689212 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -532,6 +532,13 @@ int register_tracer(struct tracer *type) } #ifdef CONFIG_FTRACE_STARTUP_TEST + /* + * When this gets called we hold the BKL which means that preemption + * is disabled. Various trace selftests however need to disable + * and enable preemption for successful tests. So we drop the BKL here + * and grab it after the tests again. + */ + unlock_kernel(); if (type->selftest) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; @@ -562,6 +569,7 @@ int register_tracer(struct tracer *type) } printk(KERN_CONT "PASSED\n"); } + lock_kernel(); #endif type->next = trace_types; -- cgit v0.10.2 From 86fa2f60674540df0b34f5c547ed0c1cf3a8f212 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Nov 2008 10:00:15 +0100 Subject: ftrace: fix selftest locking Impact: fix self-test boot crash Self-test failure forgot to re-lock the BKL - crashing the next initcall: Testing tracer irqsoff: .. no entries found ..FAILED! initcall init_irqsoff_tracer+0x0/0x11 returned 0 after 3906 usecs calling init_mmio_trace+0x0/0xf @ 1 ------------[ cut here ]------------ Kernel BUG at c0c0a915 [verbose debug info unavailable] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC last sysfs file: Pid: 1, comm: swapper Not tainted (2.6.28-rc5-tip #53704) EIP: 0060:[] EFLAGS: 00010286 CPU: 1 EIP is at unlock_kernel+0x10/0x2b EAX: ffffffff EBX: 00000000 ECX: 00000000 EDX: f7030000 ESI: c12da19c EDI: 00000000 EBP: f7039f54 ESP: f7039f54 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 1, ti=f7038000 task=f7030000 task.ti=f7038000) Stack: f7039f6c c0164d30 c013fed8 a7d8d7b4 00000000 00000000 f7039f74 c12fb78a f7039fd0 c0101132 c12fb77d 00000000 6f727200 6f632072 2d206564 c1002031 0000000f f7039fa2 f7039fb0 3531b171 00000000 00000000 0000002f c12ca480 Call Trace: [] ? register_tracer+0x66/0x13f [] ? ktime_get+0x19/0x1b [] ? init_mmio_trace+0xd/0xf [] ? do_one_initcall+0x4a/0x111 [] ? init_mmio_trace+0x0/0xf [] ? init_irq_proc+0x46/0x59 [] ? kernel_init+0x104/0x152 [] ? kernel_init+0x0/0x152 [] ? kernel_thread_helper+0x7/0x10 Code: 58 14 43 75 0a b8 00 9b 2d c1 e8 51 43 7a ff 64 a1 00 a0 37 c1 89 58 14 5b 5d c3 55 64 8b 15 00 a0 37 c1 83 7a 14 00 89 e5 79 04 <0f> 0b eb fe 8b 42 14 48 85 c0 89 42 14 79 0a b8 00 9b 2d c1 e8 EIP: [] unlock_kernel+0x10/0x2b SS:ESP 0068:f7039f54 ---[ end trace a7919e7f17c0a725 ]--- Kernel panic - not syncing: Attempted to kill init! So clean up the flow a bit. Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1689212..24b6238 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -520,7 +520,15 @@ int register_tracer(struct tracer *type) return -1; } + /* + * When this gets called we hold the BKL which means that + * preemption is disabled. Various trace selftests however + * need to disable and enable preemption for successful tests. + * So we drop the BKL here and grab it after the tests again. + */ + unlock_kernel(); mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -532,13 +540,6 @@ int register_tracer(struct tracer *type) } #ifdef CONFIG_FTRACE_STARTUP_TEST - /* - * When this gets called we hold the BKL which means that preemption - * is disabled. Various trace selftests however need to disable - * and enable preemption for successful tests. So we drop the BKL here - * and grab it after the tests again. - */ - unlock_kernel(); if (type->selftest) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; @@ -550,9 +551,9 @@ int register_tracer(struct tracer *type) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - for_each_tracing_cpu(i) { + for_each_tracing_cpu(i) tracing_reset(tr, i); - } + current_trace = type; /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -564,12 +565,11 @@ int register_tracer(struct tracer *type) goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_tracing_cpu(i) { + for_each_tracing_cpu(i) tracing_reset(tr, i); - } + printk(KERN_CONT "PASSED\n"); } - lock_kernel(); #endif type->next = trace_types; @@ -580,6 +580,7 @@ int register_tracer(struct tracer *type) out: mutex_unlock(&trace_types_lock); + lock_kernel(); return ret; } -- cgit v0.10.2 From 60a515132086b2c28a8141d873297fdf7a180ca7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Nov 2008 22:20:10 -0800 Subject: profiling: clean up profile_nop() Impact: cleanup No point in inlining this. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/profile.c b/kernel/profile.c index 5b7d1ac..7f93a50 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = { }; #ifdef CONFIG_SMP -static inline void profile_nop(void *unused) +static void profile_nop(void *unused) { } -- cgit v0.10.2 From 6d07bb47354174a9b52d3b03f9e38b069a93d341 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: powerpc: ftrace, do not latency trace idle Impact: fix for irq off latency tracer When idle is called, interrupts are disabled, but the idle function will still wake up on an interrupt. The problem is that the interrupt disabled latency tracer will take this call to idle as a latency. This patch disables the latency tracing when going into idle. Signed-off-by: Steven Rostedt diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 31982d0..88d9c1d 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -69,10 +69,15 @@ void cpu_idle(void) smp_mb(); local_irq_disable(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); + /* check again after disabling irqs */ if (!need_resched() && !cpu_should_die()) ppc_md.power_save(); + start_critical_timings(); + local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); -- cgit v0.10.2 From 8fd6e5a8c81e2e9b912ea33c8425a10729db469b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: powerpc: ftrace, convert to new dynamic ftrace arch API Impact: update to PowerPC ftrace arch API This patch converts PowerPC to use the new dynamic ftrace arch API. Thanks to Paul Mackennas for pointing out the mistakes of my original test_24bit_addr function. Signed-off-by: Steven Rostedt diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index b298f7a..17efecc2 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -7,7 +7,19 @@ #ifndef __ASSEMBLY__ extern void _mcount(void); -#endif + +#ifdef CONFIG_DYNAMIC_FTRACE +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* reloction of mcount call site is the same as the address */ + return addr; +} + +struct dyn_arch_ftrace { + /* nothing yet */ +}; +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index f4b006e..24c023a 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -33,12 +33,12 @@ static unsigned int ftrace_calc_offset(long ip, long addr) return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) +static unsigned char *ftrace_nop_replace(void) { return (char *)&ftrace_nop; } -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static unsigned int op; @@ -68,7 +68,7 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) # define _ASM_PTR " .long " #endif -int +static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -113,6 +113,62 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } +static int test_24bit_addr(unsigned long ip, unsigned long addr) +{ + long diff; + + /* + * Can we get to addr from ip in 24 bits? + * (26 really, since we mulitply by 4 for 4 byte alignment) + */ + diff = addr - ip; + + /* + * Return true if diff is less than 1 << 25 + * and greater than -1 << 26. + */ + return (diff < (1 << 25)) && (diff > (-1 << 26)); +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *old, *new; + + /* + * If the calling address is more that 24 bits away, + * then we had to use a trampoline to make the call. + * Otherwise just update the call site. + */ + if (test_24bit_addr(rec->ip, addr)) { + /* within range */ + old = ftrace_call_replace(rec->ip, addr); + new = ftrace_nop_replace(); + return ftrace_modify_code(rec->ip, old, new); + } + + return 0; +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *old, *new; + + /* + * If the calling address is more that 24 bits away, + * then we had to use a trampoline to make the call. + * Otherwise just update the call site. + */ + if (test_24bit_addr(rec->ip, addr)) { + /* within range */ + old = ftrace_nop_replace(); + new = ftrace_call_replace(rec->ip, addr); + return ftrace_modify_code(rec->ip, old, new); + } + + return 0; +} + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); @@ -128,9 +184,10 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int __init ftrace_dyn_arch_init(void *data) { - /* This is running in kstop_machine */ + /* caller expects data to be zero */ + unsigned long *p = data; - ftrace_mcount_set(data); + *p = 0; return 0; } -- cgit v0.10.2 From e4486fe316895e87672a563c4f36393218f84ff1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:20 -0800 Subject: powerpc: ftrace, use probe_kernel API to modify code Impact: use cleaner probe_kernel API over assembly Using probe_kernel_read/write interface is a much cleaner approach than the current assembly version. Signed-off-by: Steven Rostedt diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 24c023a..1adfbb2 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -72,45 +73,33 @@ static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { - unsigned replaced; - unsigned old = *(unsigned *)old_code; - unsigned new = *(unsigned *)new_code; - int faulted = 0; + unsigned char replaced[MCOUNT_INSN_SIZE]; /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting - * as well as code changing. + * as well as code changing. We do this by using the + * probe_kernel_* functions. * * No real locking needed, this code is run through - * kstop_machine. + * kstop_machine, or before SMP starts. */ - asm volatile ( - "1: lwz %1, 0(%2)\n" - " cmpw %1, %5\n" - " bne 2f\n" - " stwu %3, 0(%2)\n" - "2:\n" - ".section .fixup, \"ax\"\n" - "3: li %0, 1\n" - " b 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - _ASM_ALIGN "\n" - _ASM_PTR "1b, 3b\n" - ".previous" - : "=r"(faulted), "=r"(replaced) - : "r"(ip), "r"(new), - "0"(faulted), "r"(old) - : "memory"); - - if (replaced != old && replaced != new) - faulted = 2; - - if (!faulted) - flush_icache_range(ip, ip + 8); - - return faulted; + + /* read the text we want to modify */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + /* replace the text with the new text */ + if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) + return -EPERM; + + flush_icache_range(ip, ip + 8); + + return 0; } static int test_24bit_addr(unsigned long ip, unsigned long addr) -- cgit v0.10.2 From f48cb8b48b0b10025ca9c451b9b32cac3fcd33ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 20:47:03 -0800 Subject: powerpc/ppc64: ftrace, handle module trampolines for dyn ftrace Impact: Allow 64 bit PowerPC to trace modules with dynamic ftrace This adds code to handle the PPC64 module trampolines, and allows for PPC64 to use dynamic ftrace. Thanks to Paul Mackerras for these updates: - fix the mod and rec->arch.mod NULL checks. - fix to is_bl_op compare. Thanks to Milton Miller for: - finding the nasty race with using two nops, and recommending instead that I use a branch 8 forward. Signed-off-by: Steven Rostedt diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 17efecc2..e5f2ae8 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -16,7 +16,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) } struct dyn_arch_ftrace { - /* nothing yet */ + struct module *mod; }; #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index e5f14b1..340bc69 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -34,6 +34,11 @@ struct mod_arch_specific { #ifdef __powerpc64__ unsigned int stubs_section; /* Index of stubs section in module */ unsigned int toc_section; /* What section is the TOC? */ +#ifdef CONFIG_DYNAMIC_FTRACE + unsigned long toc; + unsigned long tramp; +#endif + #else /* Indices of PLT sections within module. */ unsigned int core_plt_section; @@ -68,6 +73,12 @@ struct mod_arch_specific { # endif /* MODULE */ #endif +#ifdef CONFIG_DYNAMIC_FTRACE +# ifdef MODULE + asm(".section .ftrace.tramp,\"ax\",@nobits; .align 3; .previous"); +# endif /* MODULE */ +#endif + struct exception_table_entry; void sort_ex_table(struct exception_table_entry *start, diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 1adfbb2..1aec559 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -10,22 +10,29 @@ #include #include #include +#include #include #include #include #include #include +#include #include +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) do { } while (0) +#endif -static unsigned int ftrace_nop = 0x60000000; +static unsigned int ftrace_nop = PPC_NOP_INSTR; #ifdef CONFIG_PPC32 # define GET_ADDR(addr) addr #else /* PowerPC64's functions are data that points to the functions */ -# define GET_ADDR(addr) *(unsigned long *)addr +# define GET_ADDR(addr) (*(unsigned long *)addr) #endif @@ -102,6 +109,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return 0; } +/* + * Helper functions that are the same for both PPC64 and PPC32. + */ static int test_24bit_addr(unsigned long ip, unsigned long addr) { long diff; @@ -119,43 +129,292 @@ static int test_24bit_addr(unsigned long ip, unsigned long addr) return (diff < (1 << 25)) && (diff > (-1 << 26)); } +static int is_bl_op(unsigned int op) +{ + return (op & 0xfc000003) == 0x48000001; +} + +static int test_offset(unsigned long offset) +{ + return (offset + 0x2000000 > 0x3ffffff) || ((offset & 3) != 0); +} + +static unsigned long find_bl_target(unsigned long ip, unsigned int op) +{ + static int offset; + + offset = (op & 0x03fffffc); + /* make it signed */ + if (offset & 0x02000000) + offset |= 0xfe000000; + + return ip + (long)offset; +} + +static unsigned int branch_offset(unsigned long offset) +{ + /* return "bl ip+offset" */ + return 0x48000001 | (offset & 0x03fffffc); +} + +#ifdef CONFIG_PPC64 +static int +__ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char replaced[MCOUNT_INSN_SIZE * 2]; + unsigned int *op = (unsigned *)&replaced; + unsigned char jmp[8]; + unsigned long *ptr = (unsigned long *)&jmp; + unsigned long ip = rec->ip; + unsigned long tramp; + int offset; + + /* read where this goes */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure that that this is still a 24bit jump */ + if (!is_bl_op(*op)) { + printk(KERN_ERR "Not expected bl: opcode is %x\n", *op); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = find_bl_target(ip, *op); + + /* + * On PPC64 the trampoline looks like: + * 0x3d, 0x82, 0x00, 0x00, addis r12,r2, + * 0x39, 0x8c, 0x00, 0x00, addi r12,r12, + * Where the bytes 2,3,6 and 7 make up the 32bit offset + * to the TOC that holds the pointer. + * to jump to. + * 0xf8, 0x41, 0x00, 0x28, std r2,40(r1) + * 0xe9, 0x6c, 0x00, 0x20, ld r11,32(r12) + * The actually address is 32 bytes from the offset + * into the TOC. + * 0xe8, 0x4c, 0x00, 0x28, ld r2,40(r12) + */ + + DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc); + + /* Find where the trampoline jumps to */ + if (probe_kernel_read(jmp, (void *)tramp, 8)) { + printk(KERN_ERR "Failed to read %lx\n", tramp); + return -EFAULT; + } + + DEBUGP(" %08x %08x", + (unsigned)(*ptr >> 32), + (unsigned)*ptr); + + offset = (unsigned)jmp[2] << 24 | + (unsigned)jmp[3] << 16 | + (unsigned)jmp[6] << 8 | + (unsigned)jmp[7]; + + DEBUGP(" %x ", offset); + + /* get the address this jumps too */ + tramp = mod->arch.toc + offset + 32; + DEBUGP("toc: %lx", tramp); + + if (probe_kernel_read(jmp, (void *)tramp, 8)) { + printk(KERN_ERR "Failed to read %lx\n", tramp); + return -EFAULT; + } + + DEBUGP(" %08x %08x\n", + (unsigned)(*ptr >> 32), + (unsigned)*ptr); + + /* This should match what was called */ + if (*ptr != GET_ADDR(addr)) { + printk(KERN_ERR "addr does not match %lx\n", *ptr); + return -EINVAL; + } + + /* + * We want to nop the line, but the next line is + * 0xe8, 0x41, 0x00, 0x28 ld r2,40(r1) + * This needs to be turned to a nop too. + */ + if (probe_kernel_read(replaced, (void *)(ip+4), MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (*op != 0xe8410028) { + printk(KERN_ERR "Next line is not ld! (%08x)\n", *op); + return -EINVAL; + } + + /* + * Milton Miller pointed out that we can not blindly do nops. + * If a task was preempted when calling a trace function, + * the nops will remove the way to restore the TOC in r2 + * and the r2 TOC will get corrupted. + */ + + /* + * Replace: + * bl <==== will be replaced with "b 1f" + * ld r2,40(r1) + * 1: + */ + op[0] = 0x48000008; /* b +8 */ + + if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) + return -EPERM; + + return 0; +} + +#else /* !PPC64 */ +static int +__ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + /* Ignore modules for PPC32 (for now) */ + return 0; +} +#endif /* PPC64 */ + int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { unsigned char *old, *new; + unsigned long ip = rec->ip; /* * If the calling address is more that 24 bits away, * then we had to use a trampoline to make the call. * Otherwise just update the call site. */ - if (test_24bit_addr(rec->ip, addr)) { + if (test_24bit_addr(ip, addr)) { /* within range */ - old = ftrace_call_replace(rec->ip, addr); + old = ftrace_call_replace(ip, addr); new = ftrace_nop_replace(); - return ftrace_modify_code(rec->ip, old, new); + return ftrace_modify_code(ip, old, new); + } + +#ifdef CONFIG_PPC64 + /* + * Out of range jumps are called from modules. + * We should either already have a pointer to the module + * or it has been passed in. + */ + if (!rec->arch.mod) { + if (!mod) { + printk(KERN_ERR "No module loaded addr=%lx\n", + addr); + return -EFAULT; + } + rec->arch.mod = mod; + } else if (mod) { + if (mod != rec->arch.mod) { + printk(KERN_ERR + "Record mod %p not equal to passed in mod %p\n", + rec->arch.mod, mod); + return -EINVAL; + } + /* nothing to do if mod == rec->arch.mod */ + } else + mod = rec->arch.mod; +#endif /* CONFIG_PPC64 */ + + return __ftrace_make_nop(mod, rec, addr); + +} + +#ifdef CONFIG_PPC64 +static int +__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char replaced[MCOUNT_INSN_SIZE * 2]; + unsigned int *op = (unsigned *)&replaced; + unsigned long ip = rec->ip; + unsigned long offset; + + /* read where this goes */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE * 2)) + return -EFAULT; + + /* + * It should be pointing to two nops or + * b +8; ld r2,40(r1) + */ + if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) && + ((op[0] != PPC_NOP_INSTR) || (op[1] != PPC_NOP_INSTR))) { + printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]); + return -EINVAL; + } + + /* If we never set up a trampoline to ftrace_caller, then bail */ + if (!rec->arch.mod->arch.tramp) { + printk(KERN_ERR "No ftrace trampoline\n"); + return -EINVAL; + } + + /* now calculate a jump to the ftrace caller trampoline */ + offset = rec->arch.mod->arch.tramp - ip; + + if (test_offset(offset)) { + printk(KERN_ERR "REL24 %li out of range!\n", + (long int)offset); + return -EINVAL; } + /* Set to "bl addr" */ + op[0] = branch_offset(offset); + /* ld r2,40(r1) */ + op[1] = 0xe8410028; + + DEBUGP("write to %lx\n", rec->ip); + + if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE * 2)) + return -EPERM; + return 0; } +#else +static int +__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + /* PPC32 ignores modules for now */ + return 0; +} +#endif /* CONFIG_PPC64 */ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned char *old, *new; + unsigned long ip = rec->ip; /* * If the calling address is more that 24 bits away, * then we had to use a trampoline to make the call. * Otherwise just update the call site. */ - if (test_24bit_addr(rec->ip, addr)) { + if (test_24bit_addr(ip, addr)) { /* within range */ old = ftrace_nop_replace(); - new = ftrace_call_replace(rec->ip, addr); - return ftrace_modify_code(rec->ip, old, new); + new = ftrace_call_replace(ip, addr); + return ftrace_modify_code(ip, old, new); } - return 0; +#ifdef CONFIG_PPC64 + /* + * Out of range jumps are called from modules. + * Being that we are converting from nop, it had better + * already have a module defined. + */ + if (!rec->arch.mod) { + printk(KERN_ERR "No module loaded\n"); + return -EINVAL; + } +#endif + + return __ftrace_make_call(rec, addr); } int ftrace_update_ftrace_func(ftrace_func_t func) @@ -180,4 +439,3 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } - diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 1af2377..8992b03 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -163,6 +164,11 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, } } +#ifdef CONFIG_DYNAMIC_FTRACE + /* make the trampoline to the ftrace_caller */ + relocs++; +#endif + DEBUGP("Looks like a total of %lu stubs, max\n", relocs); return relocs * sizeof(struct ppc64_stub_entry); } @@ -441,5 +447,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, } } +#ifdef CONFIG_DYNAMIC_FTRACE + me->arch.toc = my_r2(sechdrs, me); + me->arch.tramp = stub_for_addr(sechdrs, + (unsigned long)ftrace_caller, + me); +#endif + return 0; } -- cgit v0.10.2 From 7cc45e64323c8a1042f56e6a8d1dc982f98d52a8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 15 Nov 2008 02:39:05 -0500 Subject: powerpc/ppc32: ftrace, dynamic ftrace to handle modules Impact: add ability to trace modules on 32 bit PowerPC This patch performs the necessary trampoline calls to handle modules with dynamic ftrace on 32 bit PowerPC. Signed-off-by: Steven Rostedt diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index 340bc69..0845488 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -39,11 +39,14 @@ struct mod_arch_specific { unsigned long tramp; #endif -#else +#else /* powerpc64 */ /* Indices of PLT sections within module. */ unsigned int core_plt_section; unsigned int init_plt_section; +#ifdef CONFIG_DYNAMIC_FTRACE + unsigned long tramp; #endif +#endif /* powerpc64 */ /* List of BUG addresses, source line numbers and filenames */ struct list_head bug_list; diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 1aec559..3271cd6 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -274,7 +274,63 @@ static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - /* Ignore modules for PPC32 (for now) */ + unsigned char replaced[MCOUNT_INSN_SIZE]; + unsigned int *op = (unsigned *)&replaced; + unsigned char jmp[8]; + unsigned int *ptr = (unsigned int *)&jmp; + unsigned long ip = rec->ip; + unsigned long tramp; + int offset; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure that that this is still a 24bit jump */ + if (!is_bl_op(*op)) { + printk(KERN_ERR "Not expected bl: opcode is %x\n", *op); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = find_bl_target(ip, *op); + + /* + * On PPC32 the trampoline looks like: + * lis r11,sym@ha + * addi r11,r11,sym@l + * mtctr r11 + * bctr + */ + + DEBUGP("ip:%lx jumps to %lx", ip, tramp); + + /* Find where the trampoline jumps to */ + if (probe_kernel_read(jmp, (void *)tramp, 8)) { + printk(KERN_ERR "Failed to read %lx\n", tramp); + return -EFAULT; + } + + DEBUGP(" %08x %08x ", ptr[0], ptr[1]); + + tramp = (ptr[1] & 0xffff) | + ((ptr[0] & 0xffff) << 16); + if (tramp & 0x8000) + tramp -= 0x10000; + + DEBUGP(" %x ", tramp); + + if (tramp != addr) { + printk(KERN_ERR + "Trampoline location %08lx does not match addr\n", + tramp); + return -EINVAL; + } + + op[0] = PPC_NOP_INSTR; + + if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) + return -EPERM; + return 0; } #endif /* PPC64 */ @@ -297,7 +353,6 @@ int ftrace_make_nop(struct module *mod, return ftrace_modify_code(ip, old, new); } -#ifdef CONFIG_PPC64 /* * Out of range jumps are called from modules. * We should either already have a pointer to the module @@ -320,7 +375,6 @@ int ftrace_make_nop(struct module *mod, /* nothing to do if mod == rec->arch.mod */ } else mod = rec->arch.mod; -#endif /* CONFIG_PPC64 */ return __ftrace_make_nop(mod, rec, addr); @@ -380,7 +434,44 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - /* PPC32 ignores modules for now */ + unsigned char replaced[MCOUNT_INSN_SIZE]; + unsigned int *op = (unsigned *)&replaced; + unsigned long ip = rec->ip; + unsigned long offset; + + /* read where this goes */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* It should be pointing to a nop */ + if (op[0] != PPC_NOP_INSTR) { + printk(KERN_ERR "Expected NOP but have %x\n", op[0]); + return -EINVAL; + } + + /* If we never set up a trampoline to ftrace_caller, then bail */ + if (!rec->arch.mod->arch.tramp) { + printk(KERN_ERR "No ftrace trampoline\n"); + return -EINVAL; + } + + /* now calculate a jump to the ftrace caller trampoline */ + offset = rec->arch.mod->arch.tramp - ip; + + if (test_offset(offset)) { + printk(KERN_ERR "REL24 %li out of range!\n", + (long int)offset); + return -EINVAL; + } + + /* Set to "bl addr" */ + op[0] = branch_offset(offset); + + DEBUGP("write to %lx\n", rec->ip); + + if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) + return -EPERM; + return 0; } #endif /* CONFIG_PPC64 */ @@ -402,7 +493,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ftrace_modify_code(ip, old, new); } -#ifdef CONFIG_PPC64 /* * Out of range jumps are called from modules. * Being that we are converting from nop, it had better @@ -412,7 +502,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) printk(KERN_ERR "No module loaded\n"); return -EINVAL; } -#endif return __ftrace_make_call(rec, addr); } diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 2df91a0..f832773 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,9 @@ static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num) r_addend = rela[i].r_addend; } +#ifdef CONFIG_DYNAMIC_FTRACE + _count_relocs++; /* add one for ftrace_caller */ +#endif return _count_relocs; } @@ -306,5 +310,11 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, return -ENOEXEC; } } +#ifdef CONFIG_DYNAMIC_FTRACE + module->arch.tramp = + do_plt_call(module->module_core, + (unsigned long)ftrace_caller, + sechdrs, module); +#endif return 0; } -- cgit v0.10.2 From f201ae2356c74bcae130b2177b3dca903ea98071 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 23 Nov 2008 06:22:56 +0100 Subject: tracing/function-return-tracer: store return stack into task_struct and allocate it dynamically Impact: use deeper function tracing depth safely Some tests showed that function return tracing needed a more deeper depth of function calls. But it could be unsafe to store these return addresses to the stack. So these arrays will now be allocated dynamically into task_struct of current only when the tracer is activated. Typical scheme when tracer is activated: - allocate a return stack for each task in global list. - fork: allocate the return stack for the newly created task - exit: free return stack of current - idle init: same as fork I chose a default depth of 50. I don't have overruns anymore. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 2bb43b4..754a3e0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -29,7 +29,6 @@ struct dyn_arch_ftrace { #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_RET_TRACER -#define FTRACE_RET_STACK_SIZE 20 #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e90e81e..0921b40 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,36 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - -#ifdef CONFIG_FUNCTION_RET_TRACER - /* Index of current stored adress in ret_stack */ - int curr_ret_stack; - /* Stack of return addresses for return function tracing */ - struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; - /* - * Number of functions that haven't been traced - * because of depth overrun. - */ - atomic_t trace_overrun; -#endif }; -#ifdef CONFIG_FUNCTION_RET_TRACER -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ - .curr_ret_stack = -1,\ - .trace_overrun = ATOMIC_INIT(0) \ -} -#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -82,7 +54,6 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ } -#endif #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 356bb1e..bb137f7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -350,19 +350,21 @@ static int push_return_trace(unsigned long ret, unsigned long long time, unsigned long func) { int index; - struct thread_info *ti = current_thread_info(); + + if (!current->ret_stack) + return -EBUSY; /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { - atomic_inc(&ti->trace_overrun); + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); return -EBUSY; } - index = ++ti->curr_ret_stack; + index = ++current->curr_ret_stack; barrier(); - ti->ret_stack[index].ret = ret; - ti->ret_stack[index].func = func; - ti->ret_stack[index].calltime = time; + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = time; return 0; } @@ -373,13 +375,12 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, { int index; - struct thread_info *ti = current_thread_info(); - index = ti->curr_ret_stack; - *ret = ti->ret_stack[index].ret; - *func = ti->ret_stack[index].func; - *time = ti->ret_stack[index].calltime; - *overrun = atomic_read(&ti->trace_overrun); - ti->curr_ret_stack--; + index = current->curr_ret_stack; + *ret = current->ret_stack[index].ret; + *func = current->ret_stack[index].func; + *time = current->ret_stack[index].calltime; + *overrun = atomic_read(¤t->trace_overrun); + current->curr_ret_stack--; } /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f7ba4ea..2ba259b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -323,6 +323,8 @@ struct ftrace_retfunc { }; #ifdef CONFIG_FUNCTION_RET_TRACER +#define FTRACE_RETFUNC_DEPTH 50 +#define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of a callback handler of tracing return function */ typedef void (*trace_function_return_t)(struct ftrace_retfunc *); @@ -330,6 +332,9 @@ extern int register_ftrace_return(trace_function_return_t func); /* The current handler in use */ extern trace_function_return_t ftrace_function_return; extern void unregister_ftrace_return(void); + +extern void ftrace_retfunc_init_task(struct task_struct *t); +extern void ftrace_retfunc_exit_task(struct task_struct *t); #endif #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c8e0db4..bee1e93 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1352,6 +1352,17 @@ struct task_struct { unsigned long default_timer_slack_ns; struct list_head *scm_work_list; +#ifdef CONFIG_FUNCTION_RET_TRACER + /* Index of current stored adress in ret_stack */ + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack *ret_stack; + /* + * Number of functions that haven't been traced + * because of depth overrun. + */ + atomic_t trace_overrun; +#endif }; /* @@ -2006,18 +2017,6 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; - -#ifdef CONFIG_FUNCTION_RET_TRACER - /* - * When fork() creates a child process, this function is called. - * But the child task may not inherit the return adresses traced - * by the return function tracer because it will directly execute - * in userspace and will not return to kernel functions its parent - * used. - */ - task_thread_info(p)->curr_ret_stack = -1; - atomic_set(&task_thread_info(p)->trace_overrun, 0); -#endif } static inline unsigned long *end_of_stack(struct task_struct *p) diff --git a/kernel/exit.c b/kernel/exit.c index 35c8ec2..b9d4463 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -1127,7 +1128,9 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - +#ifdef CONFIG_FUNCTION_RET_TRACER + ftrace_retfunc_exit_task(tsk); +#endif schedule(); BUG(); /* Avoid "noreturn function does return". */ diff --git a/kernel/fork.c b/kernel/fork.c index ac62f43..d1eb30e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1269,6 +1270,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); +#ifdef CONFIG_FUNCTION_RET_TRACER + ftrace_retfunc_init_task(p); +#endif proc_fork_connector(p); cgroup_post_fork(p); return p; diff --git a/kernel/sched.c b/kernel/sched.c index 4de5610..fb17205 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,6 +5901,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; +#ifdef CONFIG_FUNCTION_RET_TRACER + ftrace_retfunc_init_task(idle); +#endif } /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f212da4..90d99fb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1498,10 +1498,77 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_RET_TRACER +static atomic_t ftrace_retfunc_active; + /* The callback that hooks the return of a function */ trace_function_return_t ftrace_function_return = (trace_function_return_t)ftrace_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + unsigned long flags; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + t->ret_stack = ret_stack_list[start++]; + t->curr_ret_stack = -1; + atomic_set(&t->trace_overrun, 0); + } + } while_each_thread(g, t); + +unlock: + read_unlock_irqrestore(&tasklist_lock, flags); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +/* Allocate a return stack for each task */ +static int start_return_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret; + + ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + + kfree(ret_stack_list); + return ret; +} + int register_ftrace_return(trace_function_return_t func) { int ret = 0; @@ -1516,7 +1583,12 @@ int register_ftrace_return(trace_function_return_t func) ret = -EBUSY; goto out; } - + atomic_inc(&ftrace_retfunc_active); + ret = start_return_tracing(); + if (ret) { + atomic_dec(&ftrace_retfunc_active); + goto out; + } ftrace_tracing_type = FTRACE_TYPE_RETURN; ftrace_function_return = func; ftrace_startup(); @@ -1530,6 +1602,7 @@ void unregister_ftrace_return(void) { mutex_lock(&ftrace_sysctl_lock); + atomic_dec(&ftrace_retfunc_active); ftrace_function_return = (trace_function_return_t)ftrace_stub; ftrace_shutdown(); /* Restore normal tracing type */ @@ -1537,6 +1610,27 @@ void unregister_ftrace_return(void) mutex_unlock(&ftrace_sysctl_lock); } + +/* Allocate a return stack for newly created task */ +void ftrace_retfunc_init_task(struct task_struct *t) +{ + if (atomic_read(&ftrace_retfunc_active)) { + t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!t->ret_stack) + return; + t->curr_ret_stack = -1; + atomic_set(&t->trace_overrun, 0); + } else + t->ret_stack = NULL; +} + +void ftrace_retfunc_exit_task(struct task_struct *t) +{ + kfree(t->ret_stack); + t->ret_stack = NULL; +} #endif -- cgit v0.10.2 From 82f60f0bc854aada696f27d863c03bef91f1509d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 23 Nov 2008 09:18:56 +0100 Subject: tracing/function-return-tracer: clean up task start/exit callbacks Impact: cleanup Eliminate #ifdefs in core code by using empty inline functions. Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2ba259b..938ca19 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -335,6 +335,9 @@ extern void unregister_ftrace_return(void); extern void ftrace_retfunc_init_task(struct task_struct *t); extern void ftrace_retfunc_exit_task(struct task_struct *t); +#else +static inline void ftrace_retfunc_init_task(struct task_struct *t) { } +static inline void ftrace_retfunc_exit_task(struct task_struct *t) { } #endif #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/exit.c b/kernel/exit.c index b9d4463..ef04d03 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1128,9 +1128,7 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; -#ifdef CONFIG_FUNCTION_RET_TRACER ftrace_retfunc_exit_task(tsk); -#endif schedule(); BUG(); /* Avoid "noreturn function does return". */ diff --git a/kernel/fork.c b/kernel/fork.c index d1eb30e..fbf4a4c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1270,9 +1270,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); -#ifdef CONFIG_FUNCTION_RET_TRACER ftrace_retfunc_init_task(p); -#endif proc_fork_connector(p); cgroup_post_fork(p); return p; diff --git a/kernel/sched.c b/kernel/sched.c index fb17205..388d9db 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,9 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; -#ifdef CONFIG_FUNCTION_RET_TRACER ftrace_retfunc_init_task(idle); -#endif } /* -- cgit v0.10.2 From 02b67518e2b1c490787dac7f35e1204e74fe21ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sat, 22 Nov 2008 13:28:47 +0200 Subject: tracing: add support for userspace stacktraces in tracing/iter_ctrl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: add new (default-off) tracing visualization feature Usage example: mount -t debugfs nodev /sys/kernel/debug cd /sys/kernel/debug/tracing echo userstacktrace >iter_ctrl echo sched_switch >current_tracer echo 1 >tracing_enabled .... run application ... echo 0 >tracing_enabled Then read one of 'trace','latency_trace','trace_pipe'. To get the best output you can compile your userspace programs with frame pointers (at least glibc + the app you are tracing). Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 753f4de..79a80f7 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -324,7 +324,7 @@ output. To see what is available, simply cat the file: cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ - noblock nostacktrace nosched-tree + noblock nostacktrace nosched-tree nouserstacktrace To disable one of the options, echo in the option prepended with "no". @@ -378,6 +378,9 @@ Here are the available options: When a trace is recorded, so is the stack of functions. This allows for back traces of trace sites. + userstacktrace - This option changes the trace. + It records a stacktrace of the current userspace thread. + sched-tree - TBD (any users??) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index a03e7f6..b151530 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -6,6 +6,7 @@ #include #include #include +#include #include static void save_stack_warning(void *data, char *msg) @@ -83,3 +84,59 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +void save_stack_trace_user(struct stack_trace *trace) +{ + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm) { + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame frame; + frame.next_fp = NULL; + frame.return_address = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.return_address) + trace->entries[trace->nr_entries++] = + frame.return_address; + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + } + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index b106fd8..68de514 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -15,9 +15,17 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); extern void print_stack_trace(struct stack_trace *trace, int spaces); + +#ifdef CONFIG_X86 +extern void save_stack_trace_user(struct stack_trace *trace); +#else +# define save_stack_trace_user(trace) do { } while (0) +#endif + #else # define save_stack_trace(trace) do { } while (0) # define save_stack_trace_tsk(tsk, trace) do { } while (0) +# define save_stack_trace_user(trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4ee6f037..ced8b4f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -275,6 +275,7 @@ static const char *trace_options[] = { "ftrace_preempt", "branch", "annotate", + "userstacktrace", NULL }; @@ -918,6 +919,44 @@ void __trace_stack(struct trace_array *tr, ftrace_trace_stack(tr, data, flags, skip, preempt_count()); } +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, int pc) +{ + struct userstack_entry *entry; + struct stack_trace trace; + struct ring_buffer_event *event; + unsigned long irq_flags; + + if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_USER_STACK; + + memset(&entry->caller, 0, sizeof(entry->caller)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = 0; + trace.entries = entry->caller; + + save_stack_trace_user(&trace); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +} + +void __trace_userstack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags) +{ + ftrace_trace_userstack(tr, data, flags, preempt_count()); +} + static void ftrace_trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3, @@ -941,6 +980,7 @@ ftrace_trace_special(void *__tr, void *__data, entry->arg3 = arg3; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, irq_flags, 4, pc); + ftrace_trace_userstack(tr, data, irq_flags, pc); trace_wake_up(); } @@ -979,6 +1019,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_cpu = task_cpu(next); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, flags, 5, pc); + ftrace_trace_userstack(tr, data, flags, pc); } void @@ -1008,6 +1049,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ftrace_trace_stack(tr, data, flags, 6, pc); + ftrace_trace_userstack(tr, data, flags, pc); trace_wake_up(); } @@ -1387,6 +1429,31 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, + unsigned long sym_flags) +{ + int ret = 1; + unsigned i; + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = entry->caller[i]; + + if (ip == ULONG_MAX || !ret) + break; + if (i) + ret = trace_seq_puts(s, " <- "); + if (!ip) { + ret = trace_seq_puts(s, "??"); + continue; + } + if (ret /*&& (sym_flags & TRACE_ITER_SYM_ADDR)*/) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + } + + return ret; +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1702,6 +1769,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) field->line); break; } + case TRACE_USER_STACK: { + struct userstack_entry *field; + + trace_assign_type(field, entry); + + seq_print_userip_objs(field, s, sym_flags); + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; + } default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1853,6 +1930,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) field->line); break; } + case TRACE_USER_STACK: { + struct userstack_entry *field; + + trace_assign_type(field, entry); + + ret = seq_print_userip_objs(field, s, sym_flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_putc(s, '\n'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + break; + } } return TRACE_TYPE_HANDLED; } @@ -1912,6 +2002,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -2000,6 +2091,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; @@ -2054,6 +2146,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) break; } case TRACE_SPECIAL: + case TRACE_USER_STACK: case TRACE_STACK: { struct special_entry *field; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cb12fd..17bb4c8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -26,6 +26,7 @@ enum trace_type { TRACE_BOOT_CALL, TRACE_BOOT_RET, TRACE_FN_RET, + TRACE_USER_STACK, __TRACE_LAST_TYPE }; @@ -42,6 +43,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + int tgid; }; /* @@ -99,6 +101,11 @@ struct stack_entry { unsigned long caller[FTRACE_STACK_ENTRIES]; }; +struct userstack_entry { + struct trace_entry ent; + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + /* * ftrace_printk entry: */ @@ -240,6 +247,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ @@ -500,6 +508,7 @@ enum trace_iterator_flags { TRACE_ITER_PREEMPTONLY = 0x800, TRACE_ITER_BRANCH = 0x1000, TRACE_ITER_ANNOTATE = 0x2000, + TRACE_ITER_USERSTACKTRACE = 0x4000 }; /* -- cgit v0.10.2 From 74e2f334f4440cbcb63e9ebbcdcea430d41bdfa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sat, 22 Nov 2008 13:28:48 +0200 Subject: vfs, seqfile: make mangle_path() global MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: expose new VFS API make mangle_path() available, as per the suggestions of Christoph Hellwig and Al Viro: http://lkml.org/lkml/2008/11/4/338 Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar diff --git a/fs/seq_file.c b/fs/seq_file.c index eba2eab..f5b61cc 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...) } EXPORT_SYMBOL(seq_printf); -static char *mangle_path(char *s, char *p, char *esc) +/** + * mangle_path - mangle and copy path to buffer beginning + * @s - buffer start + * @p - beginning of path in above buffer + * @esc - set of characters that need escaping + * + * Copy the path from @p to @s, replacing each occurrence of character from + * @esc with usual octal escape. + * Returns pointer past last written character in @s, or NULL in case of + * failure. + */ +char *mangle_path(char *s, char *p, char *esc) { while (s <= p) { char c = *p++; @@ -376,6 +387,7 @@ static char *mangle_path(char *s, char *p, char *esc) } return NULL; } +EXPORT_SYMBOL_GPL(mangle_path); /* * return the absolute path of 'dentry' residing in mount 'mnt'. diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index dc50bcc..b3dfa72 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -34,6 +34,7 @@ struct seq_operations { #define SEQ_SKIP 1 +char *mangle_path(char *s, char *p, char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); loff_t seq_lseek(struct file *, loff_t, int); -- cgit v0.10.2 From b54d3de9f3b8956653b06f1a32e9f9321c6d9027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sat, 22 Nov 2008 13:28:48 +0200 Subject: tracing: identify which executable object the userspace address belongs to MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: modify+improve the userstacktrace tracing visualization feature Store thread group leader id, and use it to lookup the address in the process's map. We could have looked up the address on thread's map, but the thread might not exist by the time we are called. The process might not exist either, but if you are reading trace_pipe, that is unlikely. Example usage: mount -t debugfs nodev /sys/kernel/debug cd /sys/kernel/debug/tracing echo userstacktrace >iter_ctrl echo sym-userobj >iter_ctrl echo sched_switch >current_tracer echo 1 >tracing_enabled cat trace_pipe >/tmp/trace& .... run application ... echo 0 >tracing_enabled cat /tmp/trace You'll see stack entries like: /lib/libpthread-2.7.so[+0xd370] You can convert them to function/line using: addr2line -fie /lib/libpthread-2.7.so 0xd370 Or: addr2line -fie /usr/lib/debug/libpthread-2.7.so 0xd370 For non-PIC/PIE executables this won't work: a.out[+0x73b] You need to run the following: addr2line -fie a.out 0x40073b (where 0x400000 is the default load address of a.out) Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 79a80f7..35a78bc 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -324,7 +324,7 @@ output. To see what is available, simply cat the file: cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ - noblock nostacktrace nosched-tree nouserstacktrace + noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj To disable one of the options, echo in the option prepended with "no". @@ -381,6 +381,17 @@ Here are the available options: userstacktrace - This option changes the trace. It records a stacktrace of the current userspace thread. + sym-userobj - when user stacktrace are enabled, look up which object the + address belongs to, and print a relative address + This is especially useful when ASLR is on, otherwise you don't + get a chance to resolve the address to object/file/line after the app is no + longer running + + The lookup is performed when you read trace,trace_pipe,latency_trace. Example: + + a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 +x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] + sched-tree - TBD (any users??) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ced8b4f..62776b7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -276,6 +277,7 @@ static const char *trace_options[] = { "branch", "annotate", "userstacktrace", + "sym-userobj", NULL }; @@ -422,6 +424,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) return trace_seq_putmem(s, hex, j); } +static int +trace_seq_path(struct trace_seq *s, struct path *path) +{ + unsigned char *p; + + if (s->len >= (PAGE_SIZE - 1)) + return 0; + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + return 0; +} + static void trace_seq_reset(struct trace_seq *s) { @@ -802,6 +826,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1429,28 +1454,73 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (mm) { + const struct vm_area_struct *vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + static int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, - unsigned long sym_flags) + unsigned long sym_flags) { + struct mm_struct *mm = NULL; int ret = 1; unsigned i; + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(entry->ent.tgid); + rcu_read_unlock(); + + if (task) + mm = get_task_mm(task); + } + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { unsigned long ip = entry->caller[i]; if (ip == ULONG_MAX || !ret) break; - if (i) + if (i && ret) ret = trace_seq_puts(s, " <- "); if (!ip) { - ret = trace_seq_puts(s, "??"); + if (ret) + ret = trace_seq_puts(s, "??"); continue; } - if (ret /*&& (sym_flags & TRACE_ITER_SYM_ADDR)*/) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + if (!ret) + break; + if (ret) + ret = seq_print_user_ip(s, mm, ip, sym_flags); } + if (mm) + mmput(mm); return ret; } @@ -1775,8 +1845,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); seq_print_userip_objs(field, s, sym_flags); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); + trace_seq_putc(s, '\n'); break; } default: @@ -3581,6 +3650,9 @@ void ftrace_dump(void) atomic_inc(&global_trace.data[cpu]->disabled); } + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + printk(KERN_TRACE "Dumping ftrace buffer:\n"); iter.tr = &global_trace; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 17bb4c8..28c15c2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -508,7 +508,8 @@ enum trace_iterator_flags { TRACE_ITER_PREEMPTONLY = 0x800, TRACE_ITER_BRANCH = 0x1000, TRACE_ITER_ANNOTATE = 0x2000, - TRACE_ITER_USERSTACKTRACE = 0x4000 + TRACE_ITER_USERSTACKTRACE = 0x4000, + TRACE_ITER_SYM_USEROBJ = 0x8000 }; /* -- cgit v0.10.2 From cbe2f5a6e84eebb98ab42fc5e58c3cd5b7767349 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 23 Nov 2008 10:37:12 +0100 Subject: tracing: allow tracing of suspend/resume & hibernation code again Impact: widen function-tracing to suspend+resume (and hibernation) sequences Now that the ftrace kernel thread is gone, we can allow tracing during suspend/resume again. So revert these two commits: f42ac38c5 "ftrace: disable tracing for suspend to ram" 41108eb10 "ftrace: disable tracing for hibernation" This should be tested very carefully, as it could interact with altneratives instruction patching, etc. Signed-off-by: Ingo Molnar diff --git a/kernel/power/disk.c b/kernel/power/disk.c index c9d7408..f77d381 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "power.h" @@ -257,7 +256,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { - int error, ftrace_save; + int error; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); @@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: platform_end(platform_mode); @@ -370,11 +367,10 @@ static int resume_target_kernel(void) int hibernation_restore(int platform_mode) { - int error, ftrace_save; + int error; pm_prepare_console(); suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode) platform_restore_cleanup(platform_mode); device_resume(PMSG_RECOVER); Finish: - __ftrace_enabled_restore(ftrace_save); resume_console(); pm_restore_console(); return error; @@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { - int error, ftrace_save; + int error; if (!hibernation_ops) return -ENOSYS; @@ -417,7 +412,6 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); - ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -452,7 +446,6 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index b8f7ce9..613f169 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "power.h" @@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state) */ int suspend_devices_and_enter(suspend_state_t state) { - int error, ftrace_save; + int error; if (!suspend_ops) return -ENOSYS; @@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - ftrace_save = __ftrace_enabled_save(); suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { @@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); device_resume(PMSG_RESUME); suspend_test_finish("resume devices"); - __ftrace_enabled_restore(ftrace_save); resume_console(); Close: if (suspend_ops->end) -- cgit v0.10.2 From 1d926f2756392c6909f60e0c9fe2a09d5462e376 Mon Sep 17 00:00:00 2001 From: Will Newton Date: Fri, 21 Nov 2008 14:08:59 -0800 Subject: init/main.c: use ktime accessor function in initcall_debug code Impact: fix initcall debug output on non-scalar ktime platforms (32-bit embedded) The initcall_debug code access the tv64 member of ktime. This won't work correctly for large deltas on platforms that don't use the scalar ktime implementation. Signed-off-by: Will Newton Acked-by: Tim Bird Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/init/main.c b/init/main.c index e810196..79213c0 100644 --- a/init/main.c +++ b/init/main.c @@ -723,7 +723,7 @@ int do_one_initcall(initcall_t fn) disable_boot_trace(); rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - ret.duration = (unsigned long long) delta.tv64 >> 10; + ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10; trace_boot_ret(&ret, fn); printk("initcall %pF returned %d after %Ld usecs\n", fn, ret.result, ret.duration); -- cgit v0.10.2 From 0da85c09b44bfea07e63ed5324aabc7cfc8a889a Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 12 Nov 2008 20:11:47 +0900 Subject: sh: dynamic ftrace support. First cut at dynamic ftrace support. [ Steven Rostedt - only updated the recordmcount.pl file. There are updates for PowerPC that will conflict with this, and we need to base off of these changes. ] Signed-off-by: Matt Fleming Signed-off-by: Paul Mundt Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index eeac71c..9f75438 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -170,6 +170,17 @@ if ($arch eq "x86_64") { $objcopy .= " -O elf32-i386"; $cc .= " -m32"; +} elsif ($arch eq "sh") { + $section_regex = "Disassembly of section\\s+(\\S+):"; + $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; + $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; + $type = ".long"; + + # force flags for this arch + $ld .= " -m shlelf_linux"; + $objcopy .= " -O elf32-sh-linux"; + $cc .= " -m32"; + } else { die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; } -- cgit v0.10.2 From 42e007d0400155fbc12c5344c808889e6ae33d32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Nov 2008 07:16:16 -0800 Subject: ftrace: add support for powerpc to recordmcount.pl script Impact: Add PowerPC port to recordmcount.pl script This patch updates the recordmcount.pl script to process PowerPC. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 9f75438..7ec032e 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -130,6 +130,7 @@ my %weak; # List of weak functions my %convert; # List of local functions used that needs conversion my $type; +my $nm_regex; # Find the local functions (return function) my $section_regex; # Find the start of a section my $function_regex; # Find the name of a function # (return offset and func name) @@ -145,6 +146,7 @@ if ($arch eq "x86") { } if ($arch eq "x86_64") { + $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; $section_regex = "Disassembly of section\\s+(\\S+):"; $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; @@ -158,6 +160,7 @@ if ($arch eq "x86_64") { $cc .= " -m64"; } elsif ($arch eq "i386") { + $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; $section_regex = "Disassembly of section\\s+(\\S+):"; $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; @@ -171,6 +174,7 @@ if ($arch eq "x86_64") { $cc .= " -m32"; } elsif ($arch eq "sh") { + $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; $section_regex = "Disassembly of section\\s+(\\S+):"; $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; @@ -181,6 +185,17 @@ if ($arch eq "x86_64") { $objcopy .= " -O elf32-sh-linux"; $cc .= " -m32"; +} elsif ($arch eq "powerpc") { + $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)"; + $section_regex = "Disassembly of section\\s+(\\S+):"; + $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:"; + $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$"; + if ($bits == 64) { + $type = ".quad"; + } else { + $type = ".long"; + } + } else { die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; } @@ -250,7 +265,7 @@ if (!$found_version) { # open (IN, "$nm $inputfile|") || die "error running $nm"; while () { - if (/^[0-9a-fA-F]+\s+t\s+(\S+)/) { + if (/$nm_regex/) { $locals{$1} = 1; } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) { $weak{$2} = $1; @@ -302,7 +317,7 @@ sub update_funcs open(FILE, ">$mcount_s") || die "can't create $mcount_s\n"; $opened = 1; print FILE "\t.section $mcount_section,\"a\",\@progbits\n"; - print FILE "\t.align $alignment\n"; + print FILE "\t.align $alignment\n" if (defined($alignment)); } printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset; } -- cgit v0.10.2 From c204f7264c7de85aecd3638dc8fe07aba6d1fff5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Nov 2008 15:07:34 -0500 Subject: ftrace: create default variables for archs in recordmcount.pl Impact: cleanup of recordmcount.pl Now that more architectures are being ported to the MCOUNT_RECORD method, there is no reason to have each declare their own arch specific variable if most of them share the same value. This patch creates a set of default values for the arch specific variables based off of i386. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 7ec032e..c5c58ac 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -145,10 +145,17 @@ if ($arch eq "x86") { } } +# +# We base the defaults off of i386, the other archs may +# feel free to change them in the below if statements. +# +$nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; +$section_regex = "Disassembly of section\\s+(\\S+):"; +$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; +$mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; +$type = ".long"; + if ($arch eq "x86_64") { - $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; - $section_regex = "Disassembly of section\\s+(\\S+):"; - $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; $type = ".quad"; $alignment = 8; @@ -160,11 +167,6 @@ if ($arch eq "x86_64") { $cc .= " -m64"; } elsif ($arch eq "i386") { - $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; - $section_regex = "Disassembly of section\\s+(\\S+):"; - $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; - $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; - $type = ".long"; $alignment = 4; # force flags for this arch @@ -174,11 +176,6 @@ if ($arch eq "x86_64") { $cc .= " -m32"; } elsif ($arch eq "sh") { - $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; - $section_regex = "Disassembly of section\\s+(\\S+):"; - $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; - $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; - $type = ".long"; # force flags for this arch $ld .= " -m shlelf_linux"; @@ -187,13 +184,11 @@ if ($arch eq "x86_64") { } elsif ($arch eq "powerpc") { $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)"; - $section_regex = "Disassembly of section\\s+(\\S+):"; $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$"; + if ($bits == 64) { $type = ".quad"; - } else { - $type = ".long"; } } else { -- cgit v0.10.2 From 42f565e116e0408b5ddc21a33c4a4d41fd572420 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Nov 2008 23:57:47 -0500 Subject: trace: remove extra assign in branch check Impact: clean up of branch check The unlikely/likely profiler does an extra assign of the f.line. This is not needed since it is already calculated at compile time. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c7d804a..c25e525 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -87,7 +87,6 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); .file = __FILE__, \ .line = __LINE__, \ }; \ - ______f.line = __LINE__; \ ______r = likely_notrace(x); \ ftrace_likely_update(&______f, ______r, 1); \ ______r; \ @@ -102,7 +101,6 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); .file = __FILE__, \ .line = __LINE__, \ }; \ - ______f.line = __LINE__; \ ______r = unlikely_notrace(x); \ ftrace_likely_update(&______f, ______r, 0); \ ______r; \ -- cgit v0.10.2 From 45b797492a0758e64dff74e9db70e1f65e0603a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 00:40:40 -0500 Subject: trace: consolidate unlikely and likely profiler Impact: clean up to make one profiler of like and unlikely tracer The likely and unlikely profiler prints out the file and line numbers of the annotated branches that it is profiling. It shows the number of times it was correct or incorrect in its guess. Having two different files or sections for that matter to tell us if it was a likely or unlikely is pretty pointless. We really only care if it was correct or not. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 3b46ae4..8bccb49 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -46,12 +46,9 @@ #endif #ifdef CONFIG_TRACE_BRANCH_PROFILING -#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_likely_profile) = .; \ - *(_ftrace_likely) \ - VMLINUX_SYMBOL(__stop_likely_profile) = .; \ - VMLINUX_SYMBOL(__start_unlikely_profile) = .; \ - *(_ftrace_unlikely) \ - VMLINUX_SYMBOL(__stop_unlikely_profile) = .; +#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_annotated_branch_profile) = .; \ + *(_ftrace_annotated_branch) \ + VMLINUX_SYMBOL(__stop_annotated_branch_profile) = .; #else #define LIKELY_PROFILE() #endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c25e525..0628a20 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -77,32 +77,18 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #define likely_notrace(x) __builtin_expect(!!(x), 1) #define unlikely_notrace(x) __builtin_expect(!!(x), 0) -#define likely_check(x) ({ \ +#define __branch_check__(x, expect) ({ \ int ______r; \ static struct ftrace_branch_data \ __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_likely"))) \ + __attribute__((section("_ftrace_annotated_branch"))) \ ______f = { \ .func = __func__, \ .file = __FILE__, \ .line = __LINE__, \ }; \ ______r = likely_notrace(x); \ - ftrace_likely_update(&______f, ______r, 1); \ - ______r; \ - }) -#define unlikely_check(x) ({ \ - int ______r; \ - static struct ftrace_branch_data \ - __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_unlikely"))) \ - ______f = { \ - .func = __func__, \ - .file = __FILE__, \ - .line = __LINE__, \ - }; \ - ______r = unlikely_notrace(x); \ - ftrace_likely_update(&______f, ______r, 0); \ + ftrace_likely_update(&______f, ______r, expect); \ ______r; \ }) @@ -112,10 +98,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); * written by Daniel Walker. */ # ifndef likely -# define likely(x) (__builtin_constant_p(x) ? !!(x) : likely_check(x)) +# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1)) # endif # ifndef unlikely -# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : unlikely_check(x)) +# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) # endif #else # define likely(x) __builtin_expect(!!(x), 1) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b8378fa..7e35487 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -166,8 +166,7 @@ config TRACE_BRANCH_PROFILING This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: - /debugfs/tracing/profile_likely - /debugfs/tracing/profile_unlikely + /debugfs/tracing/profile_annotated_branch Note: this will add a significant overhead, only turn this on if you need to profile the system's use of these macros. diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 23f9b02..21dedc8 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -261,7 +261,7 @@ static struct seq_operations tracing_likely_seq_ops = { .show = t_show, }; -static int tracing_likely_open(struct inode *inode, struct file *file) +static int tracing_branch_open(struct inode *inode, struct file *file) { int ret; @@ -274,25 +274,18 @@ static int tracing_likely_open(struct inode *inode, struct file *file) return ret; } -static struct file_operations tracing_likely_fops = { - .open = tracing_likely_open, +static const struct file_operations tracing_branch_fops = { + .open = tracing_branch_open, .read = seq_read, .llseek = seq_lseek, }; -extern unsigned long __start_likely_profile[]; -extern unsigned long __stop_likely_profile[]; -extern unsigned long __start_unlikely_profile[]; -extern unsigned long __stop_unlikely_profile[]; +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; -static struct ftrace_pointer ftrace_likely_pos = { - .start = __start_likely_profile, - .stop = __stop_likely_profile, -}; - -static struct ftrace_pointer ftrace_unlikely_pos = { - .start = __start_unlikely_profile, - .stop = __stop_unlikely_profile, +static const struct ftrace_pointer ftrace_annotated_branch_pos = { + .start = __start_annotated_branch_profile, + .stop = __stop_annotated_branch_profile, }; static __init int ftrace_branch_init(void) @@ -302,18 +295,12 @@ static __init int ftrace_branch_init(void) d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("profile_likely", 0444, d_tracer, - &ftrace_likely_pos, - &tracing_likely_fops); - if (!entry) - pr_warning("Could not create debugfs 'profile_likely' entry\n"); - - entry = debugfs_create_file("profile_unlikely", 0444, d_tracer, - &ftrace_unlikely_pos, - &tracing_likely_fops); + entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, + &ftrace_annotated_branch_pos, + &tracing_branch_fops); if (!entry) - pr_warning("Could not create debugfs" - " 'profile_unlikely' entry\n"); + pr_warning("Could not create debugfs " + "'profile_annotatet_branch' entry\n"); return 0; } -- cgit v0.10.2 From bac28bfe42ba98ee67503f78984d1d5e1ebbbb78 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 01:51:53 -0500 Subject: trace: branch profiling should not print percent without data Impact: cleanup on output of branch profiler When a branch has not been taken, it does not make sense to show a percentage incorrect or hit. This patch changes the behaviour to print out a 'X' when the branch has not been executed yet. For example: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 2096 0 0 do_arch_prctl process_64.c 832 0 0 X do_arch_prctl process_64.c 804 2604 0 0 IS_ERR err.h 34 130228 5765 4 __switch_to process_64.c 673 0 0 X enable_TSC process_64.c 448 0 0 X disable_TSC process_64.c 431 Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 21dedc8..142acb3 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -225,7 +225,7 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_branch_data *p = v; const char *f; - unsigned long percent; + long percent; if (v == (void *)1) { seq_printf(m, " correct incorrect %% " @@ -247,9 +247,13 @@ static int t_show(struct seq_file *m, void *v) percent = p->incorrect * 100; percent /= p->correct + p->incorrect; } else - percent = p->incorrect ? 100 : 0; + percent = p->incorrect ? 100 : -1; - seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent); + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + if (percent < 0) + seq_printf(m, " X "); + else + seq_printf(m, "%3ld ", percent); seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); return 0; } -- cgit v0.10.2 From 2bcd521a684cc94befbe2ce7d5b613c841b0d304 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 01:30:54 -0500 Subject: trace: profile all if conditionals Impact: feature to profile if statements This patch adds a branch profiler for all if () statements. The results will be found in: /debugfs/tracing/profile_branch For example: miss hit % Function File Line ------- --------- - -------- ---- ---- 0 1 100 x86_64_start_reservations head64.c 127 0 1 100 copy_bootdata head64.c 69 1 0 0 x86_64_start_kernel head64.c 111 32 0 0 set_intr_gate desc.h 319 1 0 0 reserve_ebda_region head.c 51 1 0 0 reserve_ebda_region head.c 47 0 1 100 reserve_ebda_region head.c 42 0 0 X maxcpus main.c 165 Miss means the branch was not taken. Hit means the branch was taken. The percent is the percentage the branch was taken. This adds a significant amount of overhead and should only be used by those analyzing their system. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8bccb49..eba835a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -53,6 +53,14 @@ #define LIKELY_PROFILE() #endif +#ifdef CONFIG_PROFILE_ALL_BRANCHES +#define BRANCH_PROFILE() VMLINUX_SYMBOL(__start_branch_profile) = .; \ + *(_ftrace_branch) \ + VMLINUX_SYMBOL(__stop_branch_profile) = .; +#else +#define BRANCH_PROFILE() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -72,7 +80,8 @@ VMLINUX_SYMBOL(__start___tracepoints) = .; \ *(__tracepoints) \ VMLINUX_SYMBOL(__stop___tracepoints) = .; \ - LIKELY_PROFILE() + LIKELY_PROFILE() \ + BRANCH_PROFILE() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 0628a20..ea7c6be 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -63,8 +63,16 @@ struct ftrace_branch_data { const char *func; const char *file; unsigned line; - unsigned long correct; - unsigned long incorrect; + union { + struct { + unsigned long correct; + unsigned long incorrect; + }; + struct { + unsigned long miss; + unsigned long hit; + }; + }; }; /* @@ -103,6 +111,32 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); # ifndef unlikely # define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) # endif + +#ifdef CONFIG_PROFILE_ALL_BRANCHES +/* + * "Define 'is'", Bill Clinton + * "Define 'if'", Steven Rostedt + */ +#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) : \ + ({ \ + int ______r; \ + static struct ftrace_branch_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_branch"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______r = !!(cond); \ + if (______r) \ + ______f.hit++; \ + else \ + ______f.miss++; \ + ______r; \ + })) +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ + #else # define likely(x) __builtin_expect(!!(x), 1) # define unlikely(x) __builtin_expect(!!(x), 0) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7e35487..61e8cca 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -173,6 +173,22 @@ config TRACE_BRANCH_PROFILING Say N if unsure. +config PROFILE_ALL_BRANCHES + bool "Profile all if conditionals" + depends on TRACE_BRANCH_PROFILING + help + This tracer profiles all branch conditions. Every if () + taken in the kernel is recorded whether it hit or miss. + The results will be displayed in: + + /debugfs/tracing/profile_branch + + This configuration, when enabled, will impose a great overhead + on the system. This should only be enabled when the system + is to be analyzed + + Say N if unsure. + config TRACING_BRANCHES bool help diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 142acb3..85792ae 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -185,6 +185,7 @@ EXPORT_SYMBOL(ftrace_likely_update); struct ftrace_pointer { void *start; void *stop; + int hit; }; static void * @@ -223,13 +224,17 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { + struct ftrace_pointer *fp = m->private; struct ftrace_branch_data *p = v; const char *f; long percent; if (v == (void *)1) { - seq_printf(m, " correct incorrect %% " - " Function " + if (fp->hit) + seq_printf(m, " miss hit %% "); + else + seq_printf(m, " correct incorrect %% "); + seq_printf(m, " Function " " File Line\n" " ------- --------- - " " -------- " @@ -243,6 +248,9 @@ static int t_show(struct seq_file *m, void *v) f--; f++; + /* + * The miss is overlayed on correct, and hit on incorrect. + */ if (p->correct) { percent = p->incorrect * 100; percent /= p->correct + p->incorrect; @@ -284,6 +292,18 @@ static const struct file_operations tracing_branch_fops = { .llseek = seq_lseek, }; +#ifdef CONFIG_PROFILE_ALL_BRANCHES +extern unsigned long __start_branch_profile[]; +extern unsigned long __stop_branch_profile[]; + +static struct ftrace_pointer ftrace_branch_pos = { + .start = __start_branch_profile, + .stop = __stop_branch_profile, + .hit = 1, +}; + +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ + extern unsigned long __start_annotated_branch_profile[]; extern unsigned long __stop_annotated_branch_profile[]; @@ -306,6 +326,15 @@ static __init int ftrace_branch_init(void) pr_warning("Could not create debugfs " "'profile_annotatet_branch' entry\n"); +#ifdef CONFIG_PROFILE_ALL_BRANCHES + entry = debugfs_create_file("profile_branch", 0444, d_tracer, + &ftrace_branch_pos, + &tracing_branch_fops); + if (!entry) + pr_warning("Could not create debugfs" + " 'profile_branch' entry\n"); +#endif + return 0; } -- cgit v0.10.2 From 3a3d04aed05add2247ea2ba5da5f90dba4062f3f Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 20 Nov 2008 21:49:52 +0000 Subject: ftrace: specify $alignment for sh architecture Impact: extend scripts/recordmcount.pl with default alignment for SH Set $alignment=2 for the sh architecture so that a ".align 2" directive will be emitted for all __mcount_loc sections. Fix a whitspace error while I'm here (converted spaces to tabs). Signed-off-by: Matt Fleming Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index c5c58ac..03b6af1 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -135,7 +135,7 @@ my $section_regex; # Find the start of a section my $function_regex; # Find the name of a function # (return offset and func name) my $mcount_regex; # Find the call site to mcount (return offset) -my $alignment; # The .align value to use for $mcount_section +my $alignment; # The .align value to use for $mcount_section if ($arch eq "x86") { if ($bits == 64) { @@ -176,6 +176,7 @@ if ($arch eq "x86_64") { $cc .= " -m32"; } elsif ($arch eq "sh") { + $alignment = 2; # force flags for this arch $ld .= " -m shlelf_linux"; -- cgit v0.10.2 From e58918ab9d4cd375f6d842e6d88cf4d7a55cbfcc Mon Sep 17 00:00:00 2001 From: Jim Radford Date: Thu, 20 Nov 2008 19:48:39 -0800 Subject: ftrace: scripts/recordmcount.pl support for ARM Impact: extend scripts/recordmcount.pl to ARM Arm uses %progbits instead of @progbits and requires only 4 byte alignment. [ Thanks to Sam Ravnborg for mentioning that ARM uses %progbits ] Signed-off-by: Jim Radford Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 03b6af1..0197e2f 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -136,6 +136,7 @@ my $function_regex; # Find the name of a function # (return offset and func name) my $mcount_regex; # Find the call site to mcount (return offset) my $alignment; # The .align value to use for $mcount_section +my $section_type; # Section header plus possible alignment command if ($arch eq "x86") { if ($bits == 64) { @@ -153,6 +154,7 @@ $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)"; $section_regex = "Disassembly of section\\s+(\\S+):"; $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; +$section_type = '@progbits'; $type = ".long"; if ($arch eq "x86_64") { @@ -192,6 +194,10 @@ if ($arch eq "x86_64") { $type = ".quad"; } +} elsif ($arch eq "arm") { + $alignment = 2; + $section_type = '%progbits'; + } else { die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; } @@ -312,7 +318,7 @@ sub update_funcs if (!$opened) { open(FILE, ">$mcount_s") || die "can't create $mcount_s\n"; $opened = 1; - print FILE "\t.section $mcount_section,\"a\",\@progbits\n"; + print FILE "\t.section $mcount_section,\"a\",$section_type\n"; print FILE "\t.align $alignment\n" if (defined($alignment)); } printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset; -- cgit v0.10.2 From 033601a32b2012b6948e80e739cca40bff4de4a0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 12:41:55 -0500 Subject: ring-buffer: add tracing_off_permanent Impact: feature to permanently disable ring buffer This patch adds a API to the ring buffer code that will permanently disable the ring buffer from ever recording. This should only be called when some serious anomaly is detected, and the system may be in an unstable state. When that happens, shutting down the recording to the ring buffers may be appropriate. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e097c2e..3bb87a7 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -122,6 +122,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); void tracing_on(void); void tracing_off(void); +void tracing_off_permanent(void); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 85ced14..e206951 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -18,8 +18,46 @@ #include "trace.h" -/* Global flag to disable all recording to ring buffers */ -static int ring_buffers_off __read_mostly; +/* + * A fast way to enable or disable all ring buffers is to + * call tracing_on or tracing_off. Turning off the ring buffers + * prevents all ring buffers from being recorded to. + * Turning this switch on, makes it OK to write to the + * ring buffer, if the ring buffer is enabled itself. + * + * There's three layers that must be on in order to write + * to the ring buffer. + * + * 1) This global flag must be set. + * 2) The ring buffer must be enabled for recording. + * 3) The per cpu buffer must be enabled for recording. + * + * In case of an anomaly, this global flag has a bit set that + * will permantly disable all ring buffers. + */ + +/* + * Global flag to disable all recording to ring buffers + * This has two bits: ON, DISABLED + * + * ON DISABLED + * ---- ---------- + * 0 0 : ring buffers are off + * 1 0 : ring buffers are on + * X 1 : ring buffers are permanently disabled + */ + +enum { + RB_BUFFERS_ON_BIT = 0, + RB_BUFFERS_DISABLED_BIT = 1, +}; + +enum { + RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, + RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, +}; + +static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; /** * tracing_on - enable all tracing buffers @@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly; */ void tracing_on(void) { - ring_buffers_off = 0; + set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); } /** @@ -42,7 +80,18 @@ void tracing_on(void) */ void tracing_off(void) { - ring_buffers_off = 1; + clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); +} + +/** + * tracing_off_permanent - permanently disable ring buffers + * + * This function, once called, will disable all ring buffers + * permanenty. + */ +void tracing_off_permanent(void) +{ + set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } #include "trace.h" @@ -1185,7 +1234,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; int cpu, resched; - if (ring_buffers_off) + if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; if (atomic_read(&buffer->record_disabled)) @@ -1297,7 +1346,7 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu, resched; - if (ring_buffers_off) + if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; if (atomic_read(&buffer->record_disabled)) @@ -2178,12 +2227,14 @@ static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int *p = filp->private_data; + long *p = filp->private_data; char buf[64]; int r; - /* !ring_buffers_off == tracing_on */ - r = sprintf(buf, "%d\n", !*p); + if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) + r = sprintf(buf, "permanently disabled\n"); + else + r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2192,7 +2243,7 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - int *p = filp->private_data; + long *p = filp->private_data; char buf[64]; long val; int ret; @@ -2209,8 +2260,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - /* !ring_buffers_off == tracing_on */ - *p = !val; + if (val) + set_bit(RB_BUFFERS_ON_BIT, p); + else + clear_bit(RB_BUFFERS_ON_BIT, p); (*ppos)++; @@ -2232,7 +2285,7 @@ static __init int rb_init_debugfs(void) d_tracer = tracing_init_dentry(); entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffers_off, &rb_simple_fops); + &ring_buffer_flags, &rb_simple_fops); if (!entry) pr_warning("Could not create debugfs 'tracing_on' entry\n"); -- cgit v0.10.2 From 69bb54ec05f57da7f6fac2cec0820cbc970df20f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 12:59:38 -0500 Subject: ftrace: add ftrace_off_permanent Impact: add new API to disable all of ftrace on anomalies It case of a serious anomaly being detected (like something caught by lockdep) it is a good idea to disable all tracing immediately, without grabing any locks. This patch adds ftrace_off_permanent that disables the tracers, function tracing and ring buffers without a way to enable them again. This should only be used when something serious has been detected. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f7ba4ea..13e9cfc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -257,6 +257,7 @@ extern int ftrace_dump_on_oops; extern void tracing_start(void); extern void tracing_stop(void); +extern void ftrace_off_permanent(void); extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); @@ -290,6 +291,7 @@ ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } +static inline void ftrace_off_permanent(void) { } static inline int ftrace_printk(const char *fmt, ...) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4ee6f037..0dbfb23 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -661,6 +661,21 @@ static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); /** + * ftrace_off_permanent - disable all ftrace code permanently + * + * This should only be called when a serious anomally has + * been detected. This will turn off the function tracing, + * ring buffers, and other tracing utilites. It takes no + * locks and can be called from any context. + */ +void ftrace_off_permanent(void) +{ + tracing_disabled = 1; + ftrace_stop(); + tracing_off_permanent(); +} + +/** * tracing_start - quick start of the tracer * * If tracing is enabled but was stopped by tracing_stop, -- cgit v0.10.2 From 0429149fb5e01edc410648591c19095d2074ee00 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 21 Nov 2008 14:44:57 -0500 Subject: trace: fix compiler warning in branch profiler Impact: fix compiler warning The ftrace_pointers used in the branch profiler are constant values. They should never change. But the compiler complains when they are passed into the debugfs_create_file as a data pointer, because the function discards the qualifier. This patch typecasts the parameter to debugfs_create_file back to a void pointer. To remind the callbacks that they are pointing to a constant value, I also modified the callback local pointers to be const struct ftrace_pointer * as well. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 85792ae..877ee88 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -191,7 +191,7 @@ struct ftrace_pointer { static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_pointer *f = m->private; + const struct ftrace_pointer *f = m->private; struct ftrace_branch_data *p = v; (*pos)++; @@ -224,7 +224,7 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { - struct ftrace_pointer *fp = m->private; + const struct ftrace_pointer *fp = m->private; struct ftrace_branch_data *p = v; const char *f; long percent; @@ -296,7 +296,7 @@ static const struct file_operations tracing_branch_fops = { extern unsigned long __start_branch_profile[]; extern unsigned long __stop_branch_profile[]; -static struct ftrace_pointer ftrace_branch_pos = { +static const struct ftrace_pointer ftrace_branch_pos = { .start = __start_branch_profile, .stop = __stop_branch_profile, .hit = 1, @@ -320,7 +320,7 @@ static __init int ftrace_branch_init(void) d_tracer = tracing_init_dentry(); entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, - &ftrace_annotated_branch_pos, + (void *)&ftrace_annotated_branch_pos, &tracing_branch_fops); if (!entry) pr_warning("Could not create debugfs " @@ -328,7 +328,7 @@ static __init int ftrace_branch_init(void) #ifdef CONFIG_PROFILE_ALL_BRANCHES entry = debugfs_create_file("profile_branch", 0444, d_tracer, - &ftrace_branch_pos, + (void *)&ftrace_branch_pos, &tracing_branch_fops); if (!entry) pr_warning("Could not create debugfs" -- cgit v0.10.2 From 8d7c6a96164651dbbab449ef0b5c20ae1f76a3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sun, 23 Nov 2008 12:39:06 +0200 Subject: tracing/stack-tracer: fix style issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index b151530..10786af 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); struct stack_frame { const void __user *next_fp; - unsigned long return_address; + unsigned long ret_addr; }; static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) @@ -108,33 +108,40 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) return ret; } +static inline void __save_stack_trace_user(struct stack_trace *trace) +{ + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.ret_addr) { + trace->entries[trace->nr_entries++] = + frame.ret_addr; + } + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + } +} + void save_stack_trace_user(struct stack_trace *trace) { /* * Trace user stack if we are not a kernel thread */ if (current->mm) { - const struct pt_regs *regs = task_pt_regs(current); - const void __user *fp = (const void __user *)regs->bp; - - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = regs->ip; - - while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - if (frame.return_address) - trace->entries[trace->nr_entries++] = - frame.return_address; - if (fp == frame.next_fp) - break; - fp = frame.next_fp; - } + __save_stack_trace_user(trace); } if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 68de514..fd42d68 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -25,7 +25,7 @@ extern void save_stack_trace_user(struct stack_trace *trace); #else # define save_stack_trace(trace) do { } while (0) # define save_stack_trace_tsk(tsk, trace) do { } while (0) -# define save_stack_trace_user(trace) do { } while (0) +# define save_stack_trace_user(trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62776b7..dedf35f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -948,9 +948,9 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, int pc) { + struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; - struct ring_buffer_event *event; unsigned long irq_flags; if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) @@ -1471,8 +1471,7 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, if (file) { ret = trace_seq_path(s, &file->f_path); if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", - ip - vmstart); + ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) ret = trace_seq_printf(s, " <" IP_FMT ">", ip); @@ -1485,7 +1484,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, { struct mm_struct *mm = NULL; int ret = 1; - unsigned i; + unsigned int i; if (trace_flags & TRACE_ITER_SYM_USEROBJ) { struct task_struct *task; -- cgit v0.10.2 From cffa10aecb6891f090a4d53a075bc40c082c45fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sun, 23 Nov 2008 12:39:07 +0200 Subject: tracing/stack-tracer: fix locking and refcounts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix refcounting/object-access bug Hold mmap_sem while looking up/accessing vma. Hold the RCU lock while using the task we looked up. Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dedf35f..4c3bd82 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1462,11 +1462,15 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, int ret = 1; if (mm) { - const struct vm_area_struct *vma = find_vma(mm, ip); + const struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); if (vma) { file = vma->vm_file; vmstart = vma->vm_start; } + up_read(&mm->mmap_sem); } if (file) { ret = trace_seq_path(s, &file->f_path); @@ -1494,10 +1498,9 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, */ rcu_read_lock(); task = find_task_by_vpid(entry->ent.tgid); - rcu_read_unlock(); - if (task) mm = get_task_mm(task); + rcu_read_unlock(); } for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { -- cgit v0.10.2 From 8d26487fd4ddda7a0237da418fb8669fb06ae557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sun, 23 Nov 2008 12:39:08 +0200 Subject: tracing/stack-tracer: introduce CONFIG_USER_STACKTRACE_SUPPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup User stack tracing is just implemented for x86, but it is not x86 specific. Introduce a generic config flag, that is currently enabled only for x86. When other arches implement it, they will have to SELECT USER_STACKTRACE_SUPPORT. Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7a146ba..e49a4fd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -36,6 +36,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS + select USER_STACKTRACE_SUPPORT config ARCH_DEFCONFIG string diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index fd42d68..1a8cecc 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -16,7 +16,7 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, extern void print_stack_trace(struct stack_trace *trace, int spaces); -#ifdef CONFIG_X86 +#ifdef CONFIG_USER_STACKTRACE_SUPPORT extern void save_stack_trace_user(struct stack_trace *trace); #else # define save_stack_trace_user(trace) do { } while (0) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b8378fa..87fc34a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -3,6 +3,9 @@ # select HAVE_FUNCTION_TRACER: # +config USER_STACKTRACE_SUPPORT + bool + config NOP_TRACER bool -- cgit v0.10.2 From e38da59269be8c0196d16dff1be5bb26076afc6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sun, 23 Nov 2008 13:08:10 +0200 Subject: tracing/stack-tracer: avoid races accessing file Impact: fix race vma->vm_file reference is only stable while holding the mmap_sem, so move usage of it to within the critical section. Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4c3bd82..48d1536 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1470,13 +1470,13 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, file = vma->vm_file; vmstart = vma->vm_start; } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); + } up_read(&mm->mmap_sem); } - if (file) { - ret = trace_seq_path(s, &file->f_path); - if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); - } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) ret = trace_seq_printf(s, " <" IP_FMT ">", ip); return ret; -- cgit v0.10.2 From eae849ca034c7f1015f0a6f17421ebc737f0a069 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 23 Nov 2008 17:33:12 +0100 Subject: tracing/function-return-tracer: don't trace kfree while it frees the return stack Impact: fix a crash While I killed the cat process, I got sometimes the following (but rare) crash: [ 65.689027] Pid: 2969, comm: cat Not tainted (2.6.28-rc6-tip #83) AMILO Li 2727 [ 65.689027] EIP: 0060:[<00000000>] EFLAGS: 00010082 CPU: 1 [ 65.689027] EIP is at 0x0 [ 65.689027] EAX: 00000000 EBX: f66cd780 ECX: c019a64a EDX: f66cd780 [ 65.689027] ESI: 00000286 EDI: f66cd780 EBP: f630be2c ESP: f630be24 [ 65.689027] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 [ 65.689027] Process cat (pid: 2969, ti=f630a000 task=f66cd780 task.ti=f630a000) [ 65.689027] Stack: [ 65.689027] 00000012 f630bd54 f630be7c c012c853 00000000 c0133cc9 f66cda54 f630be5c [ 65.689027] f630be68 f66cda54 f66cd88c f66cd878 f7070000 00000001 f630be90 c0135dbc [ 65.689027] f614a614 f630be68 f630be68 f65ba200 00000002 f630bf10 f630be90 c012cad6 [ 65.689027] Call Trace: [ 65.689027] [] ? do_exit+0x603/0x850 [ 65.689027] [] ? next_signal+0x9/0x40 [ 65.689027] [] ? dequeue_signal+0x8c/0x180 [ 65.689027] [] ? do_group_exit+0x36/0x90 [ 65.689027] [] ? get_signal_to_deliver+0x20c/0x390 [ 65.689027] [] ? do_notify_resume+0x99/0x8b0 [ 65.689027] [] ? tty_ldisc_deref+0x5a/0x80 [ 65.689027] [] ? trace_hardirqs_on+0xb/0x10 [ 65.689027] [] ? tty_ldisc_deref+0x5a/0x80 [ 65.689027] [] ? n_tty_write+0x0/0x340 [ 65.689027] [] ? redirected_tty_write+0x82/0x90 [ 65.689027] [] ? vfs_write+0x99/0xd0 [ 65.689027] [] ? redirected_tty_write+0x0/0x90 [ 65.689027] [] ? sys_write+0x42/0x70 [ 65.689027] [] ? work_notifysig+0x13/0x19 [ 65.689027] Code: Bad EIP value. [ 65.689027] EIP: [<00000000>] 0x0 SS:ESP 0068:f630be24 This is because on do_exit(), kfree is called to free the return addresses stack but kfree is traced and stored its return address in this stack. This patch fixes it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 90d99fb..53042f1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1628,8 +1628,13 @@ void ftrace_retfunc_init_task(struct task_struct *t) void ftrace_retfunc_exit_task(struct task_struct *t) { - kfree(t->ret_stack); + struct ftrace_ret_stack *ret_stack = t->ret_stack; + t->ret_stack = NULL; + /* NULL must become visible to IRQs before we free it: */ + barrier(); + + kfree(ret_stack); } #endif -- cgit v0.10.2 From 65afa5e603d507014580ead016ec887b49e1afa6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 23 Nov 2008 18:43:39 +0100 Subject: tracing/function-return-tracer: free the return stack on free_task() Impact: avoid losing some traces when a task is freed do_exit() is not the last function called when a task finishes. There are still some functions which are to be called such as ree_task(). So we delay the freeing of the return stack to the last moment. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/exit.c b/kernel/exit.c index ef04d03..e5ae36e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -1128,7 +1127,6 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - ftrace_retfunc_exit_task(tsk); schedule(); BUG(); /* Avoid "noreturn function does return". */ diff --git a/kernel/fork.c b/kernel/fork.c index fbf4a4c..d6e1a32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -140,6 +140,7 @@ void free_task(struct task_struct *tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); + ftrace_retfunc_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); -- cgit v0.10.2 From 958086d1784459be3fe85e4cad79d42b17d33381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Sun, 23 Nov 2008 23:24:53 +0200 Subject: vfs, seqfile: fix comment style on mangle_path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: use standard docbook tags Reported-by: Randy Dunlap Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar diff --git a/fs/seq_file.c b/fs/seq_file.c index f5b61cc..f03220d7 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -358,10 +358,10 @@ int seq_printf(struct seq_file *m, const char *f, ...) EXPORT_SYMBOL(seq_printf); /** - * mangle_path - mangle and copy path to buffer beginning - * @s - buffer start - * @p - beginning of path in above buffer - * @esc - set of characters that need escaping + * mangle_path - mangle and copy path to buffer beginning + * @s: buffer start + * @p: beginning of path in above buffer + * @esc: set of characters that need escaping * * Copy the path from @p to @s, replacing each occurrence of character from * @esc with usual octal escape. -- cgit v0.10.2 From 14bfc987e395797dfe03e915e8b4c7fc9e5078e4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 25 Nov 2008 08:58:11 +0100 Subject: tracing, tty: fix warnings caused by branch tracing and tty_kref_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stephen Rothwell reported tht this warning started triggering in linux-next: In file included from init/main.c:27: include/linux/tty.h: In function ‘tty_kref_get’: include/linux/tty.h:330: warning: ‘______f’ is static but declared in inline function ‘tty_kref_get’ which is not static Which gcc emits for 'extern inline' functions that nevertheless define static variables. Change it to 'static inline', which is the norm in the kernel anyway. Reported-by: Stephen Rothwell Signed-off-by: Ingo Molnar diff --git a/include/linux/tty.h b/include/linux/tty.h index 3b8121d..eaec37c 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -325,7 +325,7 @@ extern struct class *tty_class; * go away */ -extern inline struct tty_struct *tty_kref_get(struct tty_struct *tty) +static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); -- cgit v0.10.2 From ca0002a179bfa532d009a9272d619732872c49bd Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:01:25 +0100 Subject: x86, bts: base in-kernel ds interface on handles Impact: generalize the DS code to shared buffers Change the in-kernel ds.h interface to identify the tracer via a handle returned on ds_request_~(). Tracers used to be identified via their task_struct. The changes are required to allow DS to be shared between different tasks, which is needed for perfmon2 and for ftrace. For ptrace, the handle is stored in the traced task's task_struct. This should probably go into a (arch-specific) ptrace context some time. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a950084..0af997d 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -26,11 +26,18 @@ #include #include +#include #ifdef CONFIG_X86_DS struct task_struct; +struct ds_tracer; +struct bts_tracer; +struct pebs_tracer; + +typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); +typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); /* * Request BTS or PEBS @@ -38,21 +45,29 @@ struct task_struct; * Due to alignement constraints, the actual buffer may be slightly * smaller than the requested or provided buffer. * - * Returns 0 on success; -Eerrno otherwise + * Returns a pointer to a tracer structure on success, or + * ERR_PTR(errcode) on failure. + * + * The interrupt threshold is independent from the overflow callback + * to allow users to use their own overflow interrupt handling mechanism. * * task: the task to request recording for; * NULL for per-cpu recording on the current cpu * base: the base pointer for the (non-pageable) buffer; * NULL if buffer allocation requested - * size: the size of the requested or provided buffer + * size: the size of the requested or provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; * NULL if cyclic buffer requested + * th: the interrupt threshold in records from the end of the buffer; + * -1 if no interrupt threshold is requested. */ -typedef void (*ds_ovfl_callback_t)(struct task_struct *); -extern int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); -extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); +extern struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th); +extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th); /* * Release BTS or PEBS resources @@ -61,37 +76,34 @@ extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, * * Returns 0 on success; -Eerrno otherwise * - * task: the task to release resources for; - * NULL to release resources for the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_release_bts(struct task_struct *task); -extern int ds_release_pebs(struct task_struct *task); +extern int ds_release_bts(struct bts_tracer *tracer); +extern int ds_release_pebs(struct pebs_tracer *tracer); /* - * Return the (array) index of the write pointer. + * Get the (array) index of the write pointer. * (assuming an array of BTS/PEBS records) * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result */ -extern int ds_get_bts_index(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_index(struct task_struct *task, size_t *pos); +extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos); /* - * Return the (array) index one record beyond the end of the array. + * Get the (array) index one record beyond the end of the array. * (assuming an array of BTS/PEBS records) * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result */ -extern int ds_get_bts_end(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); +extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos); /* * Provide a pointer to the BTS/PEBS record at parameter index. @@ -102,14 +114,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); * * Returns the size of a single record on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() * index: the index of the requested record * record (out): pointer to the requested record */ -extern int ds_access_bts(struct task_struct *task, +extern int ds_access_bts(struct bts_tracer *tracer, size_t index, const void **record); -extern int ds_access_pebs(struct task_struct *task, +extern int ds_access_pebs(struct pebs_tracer *tracer, size_t index, const void **record); /* @@ -129,38 +140,24 @@ extern int ds_access_pebs(struct task_struct *task, * * Returns the number of bytes written or -Eerrno. * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() * buffer: the buffer to write * size: the size of the buffer */ -extern int ds_write_bts(struct task_struct *task, +extern int ds_write_bts(struct bts_tracer *tracer, const void *buffer, size_t size); -extern int ds_write_pebs(struct task_struct *task, +extern int ds_write_pebs(struct pebs_tracer *tracer, const void *buffer, size_t size); /* - * Same as ds_write_bts/pebs, but omit ownership checks. - * - * This is needed to have some other task than the owner of the - * BTS/PEBS buffer or the parameter task itself write into the - * respective buffer. - */ -extern int ds_unchecked_write_bts(struct task_struct *task, - const void *buffer, size_t size); -extern int ds_unchecked_write_pebs(struct task_struct *task, - const void *buffer, size_t size); - -/* * Reset the write pointer of the BTS/PEBS buffer. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_reset_bts(struct task_struct *task); -extern int ds_reset_pebs(struct task_struct *task); +extern int ds_reset_bts(struct bts_tracer *tracer); +extern int ds_reset_pebs(struct pebs_tracer *tracer); /* * Clear the BTS/PEBS buffer and reset the write pointer. @@ -168,33 +165,30 @@ extern int ds_reset_pebs(struct task_struct *task); * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_clear_bts(struct task_struct *task); -extern int ds_clear_pebs(struct task_struct *task); +extern int ds_clear_bts(struct bts_tracer *tracer); +extern int ds_clear_pebs(struct pebs_tracer *tracer); /* * Provide the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs() * value (out): the counter reset value */ -extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); +extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value); /* * Set the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs() * value: the new counter reset value */ -extern int ds_set_pebs_reset(struct task_struct *task, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); /* * Initialization @@ -207,17 +201,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); /* * The DS context - part of struct thread_struct. */ +#define MAX_SIZEOF_DS (12 * 8) + struct ds_context { /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char *ds; + unsigned char ds[MAX_SIZEOF_DS]; /* the owner of the BTS and PEBS configuration, respectively */ - struct task_struct *owner[2]; - /* buffer overflow notification function for BTS and PEBS */ - ds_ovfl_callback_t callback[2]; - /* the original buffer address */ - void *buffer[2]; - /* the number of allocated pages for on-request allocated buffers */ - unsigned int pages[2]; + struct ds_tracer *owner[2]; /* use count */ unsigned long count; /* a pointer to the context location inside the thread_struct diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index d6938d9..96768e9 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -28,6 +28,7 @@ #include #include #include +#include /* @@ -44,6 +45,35 @@ struct ds_configuration { }; static struct ds_configuration ds_cfg; +/* + * A BTS or PEBS tracer. + * + * This holds the configuration of the tracer and serves as a handle + * to identify tracers. + */ +struct ds_tracer { + /* the DS context (partially) owned by this tracer */ + struct ds_context *context; + /* the buffer provided on ds_request() and its size in bytes */ + void *buffer; + size_t size; + /* the number of allocated pages for on-request allocated buffers */ + unsigned int pages; +}; + +struct bts_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* buffer overflow notification function */ + bts_ovfl_callback_t ovfl; +}; + +struct pebs_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* buffer overflow notification function */ + pebs_ovfl_callback_t ovfl; +}; /* * Debug Store (DS) save area configuration (see Intel64 and IA32 @@ -107,35 +137,15 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, (*(unsigned long *)base) = value; } +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + /* * Locking is done only for allocating BTS or PEBS resources and for * guarding context and buffer memory allocation. - * - * Most functions require the current task to own the ds context part - * they are going to access. All the locking is done when validating - * access to the context. */ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); -/* - * Validate that the current task is allowed to access the BTS/PEBS - * buffer of the parameter task. - * - * Returns 0, if access is granted; -Eerrno, otherwise. - */ -static inline int ds_validate_access(struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return -EPERM; - - if (context->owner[qual] == current) - return 0; - - return -EPERM; -} - /* * We either support (system-wide) per-cpu or per-thread allocation. @@ -183,51 +193,13 @@ static inline int check_tracer(struct task_struct *task) * * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. - * - * We distinguish between an allocating and a non-allocating get of a - * context: - * - the allocating get is used for requesting BTS/PEBS resources. It - * requires the caller to hold the global ds_lock. - * - the non-allocating get is used for all other cases. A - * non-existing context indicates an error. It acquires and releases - * the ds_lock itself for obtaining the context. - * - * A context and its DS configuration are allocated and deallocated - * together. A context always has a DS configuration of the - * appropriate size. */ static DEFINE_PER_CPU(struct ds_context *, system_context); #define this_system_context per_cpu(system_context, smp_processor_id()) -/* - * Returns the pointer to the parameter task's context or to the - * system-wide context, if task is NULL. - * - * Increases the use count of the returned context, if not NULL. - */ static inline struct ds_context *ds_get_context(struct task_struct *task) { - struct ds_context *context; - unsigned long irq; - - spin_lock_irqsave(&ds_lock, irq); - - context = (task ? task->thread.ds_ctx : this_system_context); - if (context) - context->count++; - - spin_unlock_irqrestore(&ds_lock, irq); - - return context; -} - -/* - * Same as ds_get_context, but allocates the context and it's DS - * structure, if necessary; returns NULL; if out of memory. - */ -static inline struct ds_context *ds_alloc_context(struct task_struct *task) -{ struct ds_context **p_context = (task ? &task->thread.ds_ctx : &this_system_context); struct ds_context *context = *p_context; @@ -238,16 +210,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task) if (!context) return NULL; - context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); - if (!context->ds) { - kfree(context); - return NULL; - } - spin_lock_irqsave(&ds_lock, irq); if (*p_context) { - kfree(context->ds); kfree(context); context = *p_context; @@ -272,10 +237,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task) return context; } -/* - * Decreases the use count of the parameter context, if not NULL. - * Deallocates the context, if the use count reaches zero. - */ static inline void ds_put_context(struct ds_context *context) { unsigned long irq; @@ -296,13 +257,6 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - put_tracer(context->task); - - /* free any leftover buffers from tracers that did not - * deallocate them properly. */ - kfree(context->buffer[ds_bts]); - kfree(context->buffer[ds_pebs]); - kfree(context->ds); kfree(context); out: spin_unlock_irqrestore(&ds_lock, irq); @@ -312,21 +266,29 @@ static inline void ds_put_context(struct ds_context *context) /* * Handle a buffer overflow * - * task: the task whose buffers are overflowing; - * NULL for a buffer overflow on the current cpu * context: the ds context * qual: the buffer type */ -static void ds_overflow(struct task_struct *task, struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return; - - if (context->callback[qual]) - (*context->callback[qual])(task); - - /* todo: do some more overflow handling */ +static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) +{ + switch (qual) { + case ds_bts: { + struct bts_tracer *tracer = + container_of(context->owner[qual], + struct bts_tracer, ds); + if (tracer->ovfl) + tracer->ovfl(tracer); + } + break; + case ds_pebs: { + struct pebs_tracer *tracer = + container_of(context->owner[qual], + struct pebs_tracer, ds); + if (tracer->ovfl) + tracer->ovfl(tracer); + } + break; + } } @@ -343,23 +305,25 @@ static void ds_overflow(struct task_struct *task, struct ds_context *context, static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) { unsigned long rlim, vm, pgsz; - void *buffer; + void *buffer = NULL; pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + down_write(¤t->mm->mmap_sem); + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; vm = current->mm->total_vm + pgsz; if (rlim < vm) - return NULL; + goto out; rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; vm = current->mm->locked_vm + pgsz; if (rlim < vm) - return NULL; + goto out; buffer = kzalloc(size, GFP_KERNEL); if (!buffer) - return NULL; + goto out; current->mm->total_vm += pgsz; current->mm->locked_vm += pgsz; @@ -367,290 +331,337 @@ static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) if (pages) *pages = pgsz; + out: + up_write(¤t->mm->mmap_sem); return buffer; } -static int ds_request(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl, enum ds_qualifier qual) +static void ds_install_ds_config(struct ds_context *context, + enum ds_qualifier qual, + void *base, size_t size, size_t ith) { - struct ds_context *context; unsigned long buffer, adj; - const unsigned long alignment = (1 << 3); + + /* adjust the buffer address and size to meet alignment + * constraints: + * - buffer is double-word aligned + * - size is multiple of record size + * + * We checked the size at the very beginning; we have enough + * space to do the adjustment. + */ + buffer = (unsigned long)base; + + adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; + buffer += adj; + size -= adj; + + size /= ds_cfg.sizeof_rec[qual]; + size *= ds_cfg.sizeof_rec[qual]; + + ds_set(context->ds, qual, ds_buffer_base, buffer); + ds_set(context->ds, qual, ds_index, buffer); + ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + + /* The value for 'no threshold' is -1, which will set the + * threshold outside of the buffer, just like we want it. + */ + ds_set(context->ds, qual, + ds_interrupt_threshold, buffer + size - ith); +} + +static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, + struct task_struct *task, + void *base, size_t size, size_t th) +{ + struct ds_context *context; unsigned long irq; - int error = 0; + int error; + error = -EOPNOTSUPP; if (!ds_cfg.sizeof_ds) - return -EOPNOTSUPP; + goto out; /* we require some space to do alignment adjustments below */ - if (size < (alignment + ds_cfg.sizeof_rec[qual])) - return -EINVAL; + error = -EINVAL; + if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + goto out; - /* buffer overflow notification is not yet implemented */ - if (ovfl) - return -EOPNOTSUPP; + if (th != (size_t)-1) { + th *= ds_cfg.sizeof_rec[qual]; + + error = -EINVAL; + if (size <= th) + goto out; + } + + error = -ENOMEM; + if (!base) { + base = ds_allocate_buffer(size, &tracer->pages); + if (!base) + goto out; + } + tracer->buffer = base; + tracer->size = size; - context = ds_alloc_context(task); + error = -ENOMEM; + context = ds_get_context(task); if (!context) - return -ENOMEM; + goto out; + tracer->context = context; + spin_lock_irqsave(&ds_lock, irq); error = -EPERM; if (!check_tracer(task)) goto out_unlock; - get_tracer(task); - error = -EALREADY; - if (context->owner[qual] == current) - goto out_put_tracer; error = -EPERM; - if (context->owner[qual] != NULL) + if (context->owner[qual]) goto out_put_tracer; - context->owner[qual] = current; + context->owner[qual] = tracer; spin_unlock_irqrestore(&ds_lock, irq); - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &context->pages[qual]); - if (!base) - goto out_release; - - context->buffer[qual] = base; - } - error = 0; + ds_install_ds_config(context, qual, base, size, th); - context->callback[qual] = ovfl; - - /* adjust the buffer address and size to meet alignment - * constraints: - * - buffer is double-word aligned - * - size is multiple of record size - * - * We checked the size at the very beginning; we have enough - * space to do the adjustment. - */ - buffer = (unsigned long)base; - - adj = ALIGN(buffer, alignment) - buffer; - buffer += adj; - size -= adj; - - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; - - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); - - if (ovfl) { - /* todo: select a suitable interrupt threshold */ - } else - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size + 1); - - /* we keep the context until ds_release */ - return error; - - out_release: - context->owner[qual] = NULL; - ds_put_context(context); - put_tracer(task); - return error; + return 0; out_put_tracer: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); put_tracer(task); - return error; - out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(context); + tracer->context = NULL; + out: return error; } -int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) +struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th) { - return ds_request(task, base, size, ovfl, ds_bts); -} + struct bts_tracer *tracer; + int error; -int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_pebs); + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; + + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; + + error = ds_request(&tracer->ds, ds_bts, task, base, size, th); + if (error < 0) + goto out_tracer; + + return tracer; + + out_tracer: + (void)ds_release_bts(tracer); + out: + return ERR_PTR(error); } -static int ds_release(struct task_struct *task, enum ds_qualifier qual) +struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th) { - struct ds_context *context; + struct pebs_tracer *tracer; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) goto out; - kfree(context->buffer[qual]); - context->buffer[qual] = NULL; + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; - current->mm->total_vm -= context->pages[qual]; - current->mm->locked_vm -= context->pages[qual]; - context->pages[qual] = 0; - context->owner[qual] = NULL; + error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); + if (error < 0) + goto out_tracer; - /* - * we put the context twice: - * once for the ds_get_context - * once for the corresponding ds_request - */ - ds_put_context(context); + return tracer; + + out_tracer: + (void)ds_release_pebs(tracer); out: - ds_put_context(context); - return error; + return ERR_PTR(error); +} + +static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) +{ + if (tracer->context) { + BUG_ON(tracer->context->owner[qual] != tracer); + tracer->context->owner[qual] = NULL; + + put_tracer(tracer->context->task); + ds_put_context(tracer->context); + } + + if (tracer->pages) { + kfree(tracer->buffer); + + down_write(¤t->mm->mmap_sem); + + current->mm->total_vm -= tracer->pages; + current->mm->locked_vm -= tracer->pages; + + up_write(¤t->mm->mmap_sem); + } } -int ds_release_bts(struct task_struct *task) +int ds_release_bts(struct bts_tracer *tracer) { - return ds_release(task, ds_bts); + if (!tracer) + return -EINVAL; + + ds_release(&tracer->ds, ds_bts); + kfree(tracer); + + return 0; } -int ds_release_pebs(struct task_struct *task) +int ds_release_pebs(struct pebs_tracer *tracer) { - return ds_release(task, ds_pebs); + if (!tracer) + return -EINVAL; + + ds_release(&tracer->ds, ds_pebs); + kfree(tracer); + + return 0; } -static int ds_get_index(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) +static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) { - struct ds_context *context; unsigned long base, index; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); - error = ((index - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; + return (index - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_index(struct task_struct *task, size_t *pos) +int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos) { - return ds_get_index(task, pos, ds_bts); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_index(tracer->ds.context, ds_bts); + + return 0; } -int ds_get_pebs_index(struct task_struct *task, size_t *pos) +int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) { - return ds_get_index(task, pos, ds_pebs); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_index(tracer->ds.context, ds_pebs); + + return 0; } -static int ds_get_end(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) +static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) { - struct ds_context *context; - unsigned long base, end; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + unsigned long base, max; base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + max = ds_get(context->ds, qual, ds_absolute_maximum); - error = ((end - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; + return (max - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_end(struct task_struct *task, size_t *pos) +int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos) { - return ds_get_end(task, pos, ds_bts); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_end(tracer->ds.context, ds_bts); + + return 0; } -int ds_get_pebs_end(struct task_struct *task, size_t *pos) +int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos) { - return ds_get_end(task, pos, ds_pebs); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_end(tracer->ds.context, ds_pebs); + + return 0; } -static int ds_access(struct task_struct *task, size_t index, - const void **record, enum ds_qualifier qual) +static int ds_access(struct ds_context *context, enum ds_qualifier qual, + size_t index, const void **record) { - struct ds_context *context; unsigned long base, idx; - int error; if (!record) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - base = ds_get(context->ds, qual, ds_buffer_base); idx = base + (index * ds_cfg.sizeof_rec[qual]); - error = -EINVAL; if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - goto out; + return -EINVAL; *record = (const void *)idx; - error = ds_cfg.sizeof_rec[qual]; - out: - ds_put_context(context); - return error; + + return ds_cfg.sizeof_rec[qual]; } -int ds_access_bts(struct task_struct *task, size_t index, const void **record) +int ds_access_bts(struct bts_tracer *tracer, size_t index, + const void **record) { - return ds_access(task, index, record, ds_bts); + if (!tracer) + return -EINVAL; + + return ds_access(tracer->ds.context, ds_bts, index, record); } -int ds_access_pebs(struct task_struct *task, size_t index, const void **record) +int ds_access_pebs(struct pebs_tracer *tracer, size_t index, + const void **record) { - return ds_access(task, index, record, ds_pebs); + if (!tracer) + return -EINVAL; + + return ds_access(tracer->ds.context, ds_pebs, index, record); } -static int ds_write(struct task_struct *task, const void *record, size_t size, - enum ds_qualifier qual, int force) +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) { - struct ds_context *context; - int error; + int bytes_written = 0; if (!record) return -EINVAL; - error = -EPERM; - context = ds_get_context(task); - if (!context) - goto out; - - if (!force) { - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - } - - error = 0; while (size) { unsigned long base, index, end, write_end, int_th; unsigned long write_size, adj_write_size; @@ -678,14 +689,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size, write_end = end; if (write_end <= index) - goto out; + break; write_size = min((unsigned long) size, write_end - index); memcpy((void *)index, record, write_size); record = (const char *)record + write_size; - size -= write_size; - error += write_size; + size -= write_size; + bytes_written += write_size; adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; @@ -700,47 +711,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size, ds_set(context->ds, qual, ds_index, index); if (index >= int_th) - ds_overflow(task, context, qual); + ds_overflow(context, qual); } - out: - ds_put_context(context); - return error; + return bytes_written; } -int ds_write_bts(struct task_struct *task, const void *record, size_t size) +int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) { - return ds_write(task, record, size, ds_bts, /* force = */ 0); -} + if (!tracer) + return -EINVAL; -int ds_write_pebs(struct task_struct *task, const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 0); + return ds_write(tracer->ds.context, ds_bts, record, size); } -int ds_unchecked_write_bts(struct task_struct *task, - const void *record, size_t size) +int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) { - return ds_write(task, record, size, ds_bts, /* force = */ 1); -} + if (!tracer) + return -EINVAL; -int ds_unchecked_write_pebs(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 1); + return ds_write(tracer->ds.context, ds_pebs, record, size); } -static int ds_reset_or_clear(struct task_struct *task, - enum ds_qualifier qual, int clear) +static void ds_reset_or_clear(struct ds_context *context, + enum ds_qualifier qual, int clear) { - struct ds_context *context; unsigned long base, end; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; base = ds_get(context->ds, qual, ds_buffer_base); end = ds_get(context->ds, qual, ds_absolute_maximum); @@ -749,70 +745,69 @@ static int ds_reset_or_clear(struct task_struct *task, memset((void *)base, 0, end - base); ds_set(context->ds, qual, ds_index, base); - - error = 0; - out: - ds_put_context(context); - return error; } -int ds_reset_bts(struct task_struct *task) +int ds_reset_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + + return 0; } -int ds_reset_pebs(struct task_struct *task) +int ds_reset_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + + return 0; } -int ds_clear_bts(struct task_struct *task) +int ds_clear_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); + + return 0; } -int ds_clear_pebs(struct task_struct *task) +int ds_clear_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + + return 0; } -int ds_get_pebs_reset(struct task_struct *task, u64 *value) +int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) { - struct ds_context *context; - int error; + if (!tracer) + return -EINVAL; if (!value) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; - - *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - error = 0; - out: - ds_put_context(context); - return error; + return 0; } -int ds_set_pebs_reset(struct task_struct *task, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) { - struct ds_context *context; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + if (!tracer) + return -EINVAL; - *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; - error = 0; - out: - ds_put_context(context); - return error; + return 0; } static const struct ds_configuration ds_cfg_var = { @@ -840,6 +835,10 @@ static inline void ds_configure(const struct ds_configuration *cfg) { ds_cfg = *cfg; + + printk(KERN_INFO "DS available\n"); + + BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -883,6 +882,8 @@ void ds_free(struct ds_context *context) * is dying. There should not be any user of that context left * to disturb us, anymore. */ unsigned long leftovers = context->count; - while (leftovers--) + while (leftovers--) { + put_tracer(context->task); ds_put_context(context); + } } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 06180df..76adf5b 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, size_t bts_index, bts_end; int error; - error = ds_get_bts_end(child, &bts_end); + error = ds_get_bts_end(child->bts, &bts_end); if (error < 0) return error; if (bts_end <= index) return -EINVAL; - error = ds_get_bts_index(child, &bts_index); + error = ds_get_bts_index(child->bts, &bts_index); if (error < 0) return error; @@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (bts_end <= bts_index) bts_index -= bts_end; - error = ds_access_bts(child, bts_index, &bts_record); + error = ds_access_bts(child->bts, bts_index, &bts_record); if (error < 0) return error; @@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child, size_t end, i; int error; - error = ds_get_bts_index(child, &end); + error = ds_get_bts_index(child->bts, &end); if (error < 0) return error; if (size < (end * sizeof(struct bts_struct))) return -EIO; - error = ds_access_bts(child, 0, (const void **)&raw); + error = ds_access_bts(child->bts, 0, (const void **)&raw); if (error < 0) return error; @@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child, return -EFAULT; } - error = ds_clear_bts(child); + error = ds_clear_bts(child->bts); if (error < 0) return error; return end; } -static void ptrace_bts_ovfl(struct task_struct *child) -{ - send_sig(child->thread.bts_ovfl_signal, child, 0); -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -760,23 +755,29 @@ static int ptrace_bts_config(struct task_struct *child, goto errout; if (cfg.flags & PTRACE_BTS_O_ALLOC) { - ds_ovfl_callback_t ovfl = NULL; + bts_ovfl_callback_t ovfl = NULL; unsigned int sig = 0; - /* we ignore the error in case we were not tracing child */ - (void)ds_release_bts(child); - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) goto errout; + error = -EOPNOTSUPP; + goto errout; + sig = cfg.signal; - ovfl = ptrace_bts_ovfl; } - error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); - if (error < 0) + if (child->bts) + (void)ds_release_bts(child->bts); + + child->bts = ds_request_bts(child, /* base = */ NULL, cfg.size, + ovfl, /* th = */ (size_t)-1); + if (IS_ERR(child->bts)) { + error = PTR_ERR(child->bts); + child->bts = NULL; goto errout; + } child->thread.bts_ovfl_signal = sig; } @@ -823,15 +824,15 @@ static int ptrace_bts_status(struct task_struct *child, if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child, &end); + error = ds_get_bts_end(child->bts, &end); if (error < 0) return error; - error = ds_access_bts(child, /* index = */ 0, &base); + error = ds_access_bts(child->bts, /* index = */ 0, &base); if (error < 0) return error; - error = ds_access_bts(child, /* index = */ end, &max); + error = ds_access_bts(child->bts, /* index = */ end, &max); if (error < 0) return error; @@ -884,10 +885,7 @@ static int ptrace_bts_write_record(struct task_struct *child, return -EINVAL; } - /* The writing task will be the switched-to task on a context - * switch. It needs to write into the switched-from task's BTS - * buffer. */ - return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); + return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); } void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -972,13 +970,15 @@ void ptrace_disable(struct task_struct *child) clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif #ifdef CONFIG_X86_PTRACE_BTS - (void)ds_release_bts(child); + if (child->bts) { + (void)ds_release_bts(child->bts); - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + } #endif /* CONFIG_X86_PTRACE_BTS */ } @@ -1110,9 +1110,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) (child, data, (struct ptrace_bts_config __user *)addr); break; - case PTRACE_BTS_SIZE: - ret = ds_get_bts_index(child, /* pos = */ NULL); + case PTRACE_BTS_SIZE: { + size_t size; + + ret = ds_get_bts_index(child->bts, &size); + if (ret == 0) { + BUG_ON(size != (int) size); + ret = (int) size; + } break; + } case PTRACE_BTS_GET: ret = ptrace_bts_read_record @@ -1120,7 +1127,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child); + ret = ds_clear_bts(child->bts); break; case PTRACE_BTS_DRAIN: diff --git a/include/linux/sched.h b/include/linux/sched.h index bee1e93..a9780ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,6 +96,7 @@ struct exec_domain; struct futex_pi_state; struct robust_list_head; struct bio; +struct bts_tracer; /* * List of flags we want to share for kernel threads, @@ -1161,6 +1162,14 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; +#ifdef CONFIG_X86_PTRACE_BTS + /* + * This is the tracer handle for the ptrace BTS extension. + * This field actually belongs to the ptracer task. + */ + struct bts_tracer *bts; +#endif /* CONFIG_X86_PTRACE_BTS */ + /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; -- cgit v0.10.2 From 6abb11aecd888d1da6276399380b7355f127c006 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:05:27 +0100 Subject: x86, bts, ptrace: move BTS buffer allocation from ds.c into ptrace.c Impact: restructure DS memory allocation to be done by the usage site of DS Require pre-allocated buffers in ds.h. Move the BTS buffer allocation for ptrace into ptrace.c. The pointer to the allocated buffer is stored in the traced task's task_struct together with the handle returned by ds_request_bts(). Removes memory accounting code. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 0af997d..99b6c39 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -7,13 +7,12 @@ * * It manages: * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done) * - buffer access * * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks * * * Copyright (C) 2007-2008 Intel Corporation. @@ -54,8 +53,7 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); * task: the task to request recording for; * NULL for per-cpu recording on the current cpu * base: the base pointer for the (non-pageable) buffer; - * NULL if buffer allocation requested - * size: the size of the requested or provided buffer in bytes + * size: the size of the provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; * NULL if cyclic buffer requested * th: the interrupt threshold in records from the end of the buffer; @@ -72,8 +70,6 @@ extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, /* * Release BTS or PEBS resources * - * Frees buffers allocated on ds_request. - * * Returns 0 on success; -Eerrno otherwise * * tracer: the tracer handle returned from ds_request_~() diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 96768e9..19a8c2c 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -7,13 +7,12 @@ * * It manages: * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done) * - buffer access * * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks * * * Copyright (C) 2007-2008 Intel Corporation. @@ -57,8 +56,6 @@ struct ds_tracer { /* the buffer provided on ds_request() and its size in bytes */ void *buffer; size_t size; - /* the number of allocated pages for on-request allocated buffers */ - unsigned int pages; }; struct bts_tracer { @@ -141,8 +138,7 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, /* - * Locking is done only for allocating BTS or PEBS resources and for - * guarding context and buffer memory allocation. + * Locking is done only for allocating BTS or PEBS resources. */ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); @@ -292,50 +288,6 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) } -/* - * Allocate a non-pageable buffer of the parameter size. - * Checks the memory and the locked memory rlimit. - * - * Returns the buffer, if successful; - * NULL, if out of memory or rlimit exceeded. - * - * size: the requested buffer size in bytes - * pages (out): if not NULL, contains the number of pages reserved - */ -static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) -{ - unsigned long rlim, vm, pgsz; - void *buffer = NULL; - - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - down_write(¤t->mm->mmap_sem); - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - goto out; - - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) - goto out; - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto out; - - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; - - if (pages) - *pages = pgsz; - - out: - up_write(¤t->mm->mmap_sem); - return buffer; -} - static void ds_install_ds_config(struct ds_context *context, enum ds_qualifier qual, void *base, size_t size, size_t ith) @@ -382,6 +334,10 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, if (!ds_cfg.sizeof_ds) goto out; + error = -EINVAL; + if (!base) + goto out; + /* we require some space to do alignment adjustments below */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) @@ -395,13 +351,6 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, goto out; } - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &tracer->pages); - if (!base) - goto out; - } - tracer->buffer = base; tracer->size = size; @@ -466,7 +415,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return tracer; out_tracer: - (void)ds_release_bts(tracer); + kfree(tracer); out: return ERR_PTR(error); } @@ -496,31 +445,18 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return tracer; out_tracer: - (void)ds_release_pebs(tracer); + kfree(tracer); out: return ERR_PTR(error); } static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) { - if (tracer->context) { - BUG_ON(tracer->context->owner[qual] != tracer); - tracer->context->owner[qual] = NULL; - - put_tracer(tracer->context->task); - ds_put_context(tracer->context); - } + BUG_ON(tracer->context->owner[qual] != tracer); + tracer->context->owner[qual] = NULL; - if (tracer->pages) { - kfree(tracer->buffer); - - down_write(¤t->mm->mmap_sem); - - current->mm->total_vm -= tracer->pages; - current->mm->locked_vm -= tracer->pages; - - up_write(¤t->mm->mmap_sem); - } + put_tracer(tracer->context->task); + ds_put_context(tracer->context); } int ds_release_bts(struct bts_tracer *tracer) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 76adf5b..2c8ec1b 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -758,6 +758,10 @@ static int ptrace_bts_config(struct task_struct *child, bts_ovfl_callback_t ovfl = NULL; unsigned int sig = 0; + error = -EINVAL; + if (cfg.size < (10 * bts_cfg.sizeof_bts)) + goto errout; + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) goto errout; @@ -768,14 +772,26 @@ static int ptrace_bts_config(struct task_struct *child, sig = cfg.signal; } - if (child->bts) + if (child->bts) { (void)ds_release_bts(child->bts); + kfree(child->bts_buffer); + + child->bts = NULL; + child->bts_buffer = NULL; + } + + error = -ENOMEM; + child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); + if (!child->bts_buffer) + goto errout; - child->bts = ds_request_bts(child, /* base = */ NULL, cfg.size, + child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, ovfl, /* th = */ (size_t)-1); if (IS_ERR(child->bts)) { error = PTR_ERR(child->bts); + kfree(child->bts_buffer); child->bts = NULL; + child->bts_buffer = NULL; goto errout; } @@ -972,6 +988,8 @@ void ptrace_disable(struct task_struct *child) #ifdef CONFIG_X86_PTRACE_BTS if (child->bts) { (void)ds_release_bts(child->bts); + kfree(child->bts_buffer); + child->bts_buffer = NULL; child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; if (!child->thread.debugctlmsr) diff --git a/include/linux/sched.h b/include/linux/sched.h index a9780ea..d02a0ca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1168,6 +1168,10 @@ struct task_struct { * This field actually belongs to the ptracer task. */ struct bts_tracer *bts; + /* + * The buffer to hold the BTS data. + */ + void *bts_buffer; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ -- cgit v0.10.2 From 8bba1bf5e2434c83f2fe8b1422604ace9bbe4cb8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:12:31 +0100 Subject: x86, ftrace: call trace->open() before stopping tracing; add trace->print_header() Add a callback to allow an ftrace plug-in to write its own header. Move the call to trace->open() up a few lines. The changes are required by the BTS ftrace plug-in. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a45b59e..8df8fdd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2298,7 +2298,9 @@ static int s_show(struct seq_file *m, void *v) seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); } - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + if (iter->trace && iter->trace->print_header) + iter->trace->print_header(m); + else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return 0; @@ -2350,6 +2352,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) iter->trace = current_trace; iter->pos = -1; + /* Notify the tracer early; before we stop tracing. */ + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->tr->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; @@ -2375,9 +2381,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) /* stop the trace while dumping */ tracing_stop(); - if (iter->trace && iter->trace->open) - iter->trace->open(iter); - mutex_unlock(&trace_types_lock); out: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 28c15c2..717f9f0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -311,6 +311,7 @@ struct tracer { int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif + void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); -- cgit v0.10.2 From 1e9b51c28312f7334394aa30be56ff52c2b65b7e Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:24:15 +0100 Subject: x86, bts, ftrace: a BTS ftrace plug-in prototype Impact: add new ftrace plugin A prototype for a BTS ftrace plug-in. The tracer collects branch trace in a cyclic buffer for each cpu. The tracer is not configurable and the trace for each snapshot is appended when doing cat /debug/tracing/trace. This is a proof of concept that will be extended with future patches to become a (hopefully) useful tool. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b815664..85a7857 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -515,6 +515,7 @@ config CPU_SUP_UMC_32 config X86_DS def_bool X86_PTRACE_BTS depends on X86_DEBUGCTLMSR + select HAVE_HW_BRANCH_TRACER config X86_PTRACE_BTS bool "Branch Trace Store" diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9cbf776..620fead 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -28,6 +28,9 @@ config HAVE_DYNAMIC_FTRACE config HAVE_FTRACE_MCOUNT_RECORD bool +config HAVE_HW_BRANCH_TRACER + bool + config TRACER_MAX_TRACE bool @@ -233,6 +236,14 @@ config STACK_TRACER Say N if unsure. +config BTS_TRACER + depends on HAVE_HW_BRANCH_TRACER + bool "Trace branches" + select TRACING + help + This tracer records all branches on the system in a circular + buffer giving access to the last N branches for each cpu. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1a8c925..cef4bcb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,5 +31,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o +obj-$(CONFIG_BTS_TRACER) += trace_bts.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 717f9f0..3abd645 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -27,6 +27,7 @@ enum trace_type { TRACE_BOOT_RET, TRACE_FN_RET, TRACE_USER_STACK, + TRACE_BTS, __TRACE_LAST_TYPE }; @@ -153,6 +154,12 @@ struct trace_branch { char correct; }; +struct bts_entry { + struct trace_entry ent; + unsigned long from; + unsigned long to; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -258,6 +265,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ + IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ __ftrace_bad_type(); \ } while (0) @@ -392,6 +400,10 @@ void trace_function(struct trace_array *tr, void trace_function_return(struct ftrace_retfunc *trace); +void trace_bts(struct trace_array *tr, + unsigned long from, + unsigned long to); + void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); void tracing_sched_switch_assign_trace(struct trace_array *tr); diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c new file mode 100644 index 0000000..23b76e4 --- /dev/null +++ b/kernel/trace/trace_bts.c @@ -0,0 +1,276 @@ +/* + * BTS tracer + * + * Copyright (C) 2008 Markus Metzger + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "trace.h" + + +#define SIZEOF_BTS (1 << 13) + +static DEFINE_PER_CPU(struct bts_tracer *, tracer); +static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); + +#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_buffer per_cpu(buffer, smp_processor_id()) + + +/* + * Information to interpret a BTS record. + * This will go into an in-kernel BTS interface. + */ +static unsigned char sizeof_field; +static unsigned long debugctl_mask; + +#define sizeof_bts (3 * sizeof_field) + +static void bts_trace_cpuinit(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { + case 0x0 ... 0xC: + break; + case 0xD: + case 0xE: /* Pentium M */ + sizeof_field = sizeof(long); + debugctl_mask = (1<<6)|(1<<7); + break; + default: + sizeof_field = 8; + debugctl_mask = (1<<6)|(1<<7); + break; + } + break; + case 0xF: + switch (c->x86_model) { + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + sizeof_field = sizeof(long); + debugctl_mask = (1<<2)|(1<<3); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} + +static inline void bts_enable(void) +{ + unsigned long debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask); +} + +static inline void bts_disable(void) +{ + unsigned long debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask); +} + +static void bts_trace_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + +static void bts_trace_start_cpu(void *arg) +{ + this_tracer = + ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, + /* ovfl = */ NULL, /* th = */ (size_t)-1); + if (IS_ERR(this_tracer)) { + this_tracer = NULL; + return; + } + + bts_enable(); +} + +static void bts_trace_start(struct trace_array *tr) +{ + int cpu; + + bts_trace_reset(tr); + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); +} + +static void bts_trace_stop_cpu(void *arg) +{ + if (this_tracer) { + bts_disable(); + + ds_release_bts(this_tracer); + this_tracer = NULL; + } +} + +static void bts_trace_stop(struct trace_array *tr) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); +} + +static int bts_trace_init(struct trace_array *tr) +{ + bts_trace_cpuinit(&boot_cpu_data); + bts_trace_reset(tr); + bts_trace_start(tr); + + return 0; +} + +static void bts_trace_print_header(struct seq_file *m) +{ +#ifdef __i386__ + seq_puts(m, "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, "# | | | |\n"); +#else + seq_puts(m, + "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, + "# | | | |\n"); +#endif +} + +static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *seq = &iter->seq; + struct bts_entry *it; + + trace_assign_type(it, entry); + + if (entry->type == TRACE_BTS) { + int ret; +#ifdef CONFIG_KALLSYMS + char function[KSYM_SYMBOL_LEN]; + sprint_symbol(function, it->from); +#else + char *function = ""; +#endif + + ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n", + entry->cpu, it->from, it->to, function); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE;; + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to) +{ + struct ring_buffer_event *event; + struct bts_entry *entry; + unsigned long irq; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, from); + entry->ent.type = TRACE_BTS; + entry->ent.cpu = smp_processor_id(); + entry->from = from; + entry->to = to; + ring_buffer_unlock_commit(tr->buffer, event, irq); +} + +static void trace_bts_at(struct trace_array *tr, size_t index) +{ + const void *raw = NULL; + unsigned long from, to; + int err; + + err = ds_access_bts(this_tracer, index, &raw); + if (err < 0) + return; + + from = *(const unsigned long *)raw; + to = *(const unsigned long *)((const char *)raw + sizeof_field); + + trace_bts(tr, from, to); +} + +static void trace_bts_cpu(void *arg) +{ + struct trace_array *tr = (struct trace_array *) arg; + size_t index = 0, end = 0, i; + int err; + + if (!this_tracer) + return; + + bts_disable(); + + err = ds_get_bts_index(this_tracer, &index); + if (err < 0) + goto out; + + err = ds_get_bts_end(this_tracer, &end); + if (err < 0) + goto out; + + for (i = index; i < end; i++) + trace_bts_at(tr, i); + + for (i = 0; i < index; i++) + trace_bts_at(tr, i); + +out: + bts_enable(); +} + +static void trace_bts_prepare(struct trace_iterator *iter) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); +} + +struct tracer bts_tracer __read_mostly = +{ + .name = "bts", + .init = bts_trace_init, + .reset = bts_trace_stop, + .print_header = bts_trace_print_header, + .print_line = bts_trace_print_line, + .start = bts_trace_start, + .stop = bts_trace_stop, + .open = trace_bts_prepare +}; + +__init static int init_bts_trace(void) +{ + return register_tracer(&bts_tracer); +} +device_initcall(init_bts_trace); -- cgit v0.10.2 From fb52607afcd0629776f1dc9e657647ceae81dd50 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Nov 2008 21:07:04 +0100 Subject: tracing/function-return-tracer: change the name into function-graph-tracer Impact: cleanup This patch changes the name of the "return function tracer" into function-graph-tracer which is a more suitable name for a tracing which makes one able to retrieve the ordered call stack during the code flow. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e49a4fd..0842b11 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,7 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_RET_TRACER if X86_32 + select HAVE_FUNCTION_GRAPH_TRACER if X86_32 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 754a3e0..7e61b4c 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,7 +28,7 @@ struct dyn_arch_ftrace { #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef __ASSEMBLY__ @@ -51,6 +51,6 @@ struct ftrace_ret_stack { extern void return_to_handler(void); #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index af2bc36..64939a0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,7 +14,7 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif -ifdef CONFIG_FUNCTION_RET_TRACER +ifdef CONFIG_FUNCTION_GRAPH_TRACER # Don't trace __switch_to() but let it for function tracer CFLAGS_REMOVE_process_32.o = -pg endif @@ -70,7 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 74defe2..2b1f0f0 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,9 +1188,9 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace -#ifdef CONFIG_FUNCTION_RET_TRACER - cmpl $ftrace_stub, ftrace_function_return - jnz ftrace_return_caller +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_function + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: @@ -1215,8 +1215,8 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER -ENTRY(ftrace_return_caller) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) cmpl $0, function_trace_stop jne ftrace_stub @@ -1230,7 +1230,7 @@ ENTRY(ftrace_return_caller) popl %ecx popl %eax ret -END(ftrace_return_caller) +END(ftrace_graph_caller) .globl return_to_handler return_to_handler: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bb137f7..3595a4c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -323,7 +323,7 @@ int __init ftrace_dyn_arch_init(void *data) } #endif -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef CONFIG_DYNAMIC_FTRACE @@ -389,11 +389,11 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, */ unsigned long ftrace_return_to_handler(void) { - struct ftrace_retfunc trace; + struct ftrace_graph_ret trace; pop_return_trace(&trace.ret, &trace.calltime, &trace.func, &trace.overrun); trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_function_return(&trace); + ftrace_graph_function(&trace); return trace.ret; } @@ -440,12 +440,12 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ); if (WARN_ON(faulted)) { - unregister_ftrace_return(); + unregister_ftrace_graph(); return; } if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_return(); + unregister_ftrace_graph(); *parent = old; return; } @@ -456,4 +456,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) *parent = old; } -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7854d87..b4ac734 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -115,8 +115,8 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); -#ifdef CONFIG_FUNCTION_RET_TRACER -extern void ftrace_return_caller(void); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern void ftrace_graph_caller(void); #endif /** @@ -315,7 +315,7 @@ ftrace_init_module(struct module *mod, /* * Structure that defines a return function trace. */ -struct ftrace_retfunc { +struct ftrace_graph_ret { unsigned long ret; /* Return address */ unsigned long func; /* Current function */ unsigned long long calltime; @@ -324,22 +324,22 @@ struct ftrace_retfunc { unsigned long overrun; }; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of a callback handler of tracing return function */ -typedef void (*trace_function_return_t)(struct ftrace_retfunc *); +typedef void (*trace_function_graph_t)(struct ftrace_graph_ret *); -extern int register_ftrace_return(trace_function_return_t func); +extern int register_ftrace_graph(trace_function_graph_t func); /* The current handler in use */ -extern trace_function_return_t ftrace_function_return; -extern void unregister_ftrace_return(void); +extern trace_function_graph_t ftrace_graph_function; +extern void unregister_ftrace_graph(void); -extern void ftrace_retfunc_init_task(struct task_struct *t); -extern void ftrace_retfunc_exit_task(struct task_struct *t); +extern void ftrace_graph_init_task(struct task_struct *t); +extern void ftrace_graph_exit_task(struct task_struct *t); #else -static inline void ftrace_retfunc_init_task(struct task_struct *t) { } -static inline void ftrace_retfunc_exit_task(struct task_struct *t) { } +static inline void ftrace_graph_init_task(struct task_struct *t) { } +static inline void ftrace_graph_exit_task(struct task_struct *t) { } #endif #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 0b4df55..366a054 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,7 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index d02a0ca..7ad48f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1365,7 +1365,7 @@ struct task_struct { unsigned long default_timer_slack_ns; struct list_head *scm_work_list; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored adress in ret_stack */ int curr_ret_stack; /* Stack of return addresses for return function tracing */ diff --git a/kernel/Makefile b/kernel/Makefile index 03a45e7..703cf3b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,7 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -pg endif -ifdef CONFIG_FUNCTION_RET_TRACER +ifdef CONFIG_FUNCTION_GRAPH_TRACER CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() CFLAGS_REMOVE_module.o = -pg # For __module_text_address() endif diff --git a/kernel/fork.c b/kernel/fork.c index d6e1a32..5f82a99 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -140,7 +140,7 @@ void free_task(struct task_struct *tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); - ftrace_retfunc_exit_task(tsk); + ftrace_graph_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1271,7 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - ftrace_retfunc_init_task(p); + ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); return p; diff --git a/kernel/sched.c b/kernel/sched.c index 388d9db..52490bf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,7 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_retfunc_init_task(idle); + ftrace_graph_init_task(idle); } /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 620fead..eb9b901 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -12,7 +12,7 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool -config HAVE_FUNCTION_RET_TRACER +config HAVE_FUNCTION_GRAPH_TRACER bool config HAVE_FUNCTION_TRACE_MCOUNT_TEST @@ -63,15 +63,18 @@ config FUNCTION_TRACER (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. -config FUNCTION_RET_TRACER - bool "Kernel Function return Tracer" - depends on HAVE_FUNCTION_RET_TRACER +config FUNCTION_GRAPH_TRACER + bool "Kernel Function Graph Tracer" + depends on HAVE_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER help - Enable the kernel to trace a function at its return. - It's first purpose is to trace the duration of functions. - This is done by setting the current return address on the thread - info structure of the current task. + Enable the kernel to trace a function at both its return + and its entry. + It's first purpose is to trace the duration of functions and + draw a call graph for each thread with some informations like + the return value. + This is done by setting the current return address on the current + task structure into a stack of calls. config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cef4bcb..08c5fe6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o -obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_BTS_TRACER) += trace_bts.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53042f1..9e19976 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -395,11 +395,11 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_tracing_type == FTRACE_TYPE_ENTER) ftrace_addr = (unsigned long)ftrace_caller; else - ftrace_addr = (unsigned long)ftrace_return_caller; + ftrace_addr = (unsigned long)ftrace_graph_caller; #else ftrace_addr = (unsigned long)ftrace_caller; #endif @@ -1496,13 +1496,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER static atomic_t ftrace_retfunc_active; /* The callback that hooks the return of a function */ -trace_function_return_t ftrace_function_return = - (trace_function_return_t)ftrace_stub; +trace_function_graph_t ftrace_graph_function = + (trace_function_graph_t)ftrace_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ @@ -1549,7 +1549,7 @@ free: } /* Allocate a return stack for each task */ -static int start_return_tracing(void) +static int start_graph_tracing(void) { struct ftrace_ret_stack **ret_stack_list; int ret; @@ -1569,7 +1569,7 @@ static int start_return_tracing(void) return ret; } -int register_ftrace_return(trace_function_return_t func) +int register_ftrace_graph(trace_function_graph_t func) { int ret = 0; @@ -1584,13 +1584,13 @@ int register_ftrace_return(trace_function_return_t func) goto out; } atomic_inc(&ftrace_retfunc_active); - ret = start_return_tracing(); + ret = start_graph_tracing(); if (ret) { atomic_dec(&ftrace_retfunc_active); goto out; } ftrace_tracing_type = FTRACE_TYPE_RETURN; - ftrace_function_return = func; + ftrace_graph_function = func; ftrace_startup(); out: @@ -1598,12 +1598,12 @@ out: return ret; } -void unregister_ftrace_return(void) +void unregister_ftrace_graph(void) { mutex_lock(&ftrace_sysctl_lock); atomic_dec(&ftrace_retfunc_active); - ftrace_function_return = (trace_function_return_t)ftrace_stub; + ftrace_graph_function = (trace_function_graph_t)ftrace_stub; ftrace_shutdown(); /* Restore normal tracing type */ ftrace_tracing_type = FTRACE_TYPE_ENTER; @@ -1612,7 +1612,7 @@ void unregister_ftrace_return(void) } /* Allocate a return stack for newly created task */ -void ftrace_retfunc_init_task(struct task_struct *t) +void ftrace_graph_init_task(struct task_struct *t) { if (atomic_read(&ftrace_retfunc_active)) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH @@ -1626,7 +1626,7 @@ void ftrace_retfunc_init_task(struct task_struct *t) t->ret_stack = NULL; } -void ftrace_retfunc_exit_task(struct task_struct *t) +void ftrace_graph_exit_task(struct task_struct *t) { struct ftrace_ret_stack *ret_stack = t->ret_stack; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8df8fdd..f21ab2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -878,15 +878,15 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } -#ifdef CONFIG_FUNCTION_RET_TRACER -static void __trace_function_return(struct trace_array *tr, +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void __trace_function_graph(struct trace_array *tr, struct trace_array_cpu *data, - struct ftrace_retfunc *trace, + struct ftrace_graph_ret *trace, unsigned long flags, int pc) { struct ring_buffer_event *event; - struct ftrace_ret_entry *entry; + struct ftrace_graph_entry *entry; unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) @@ -1177,8 +1177,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_RET_TRACER -void trace_function_return(struct ftrace_retfunc *trace) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void trace_function_graph(struct ftrace_graph_ret *trace) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1193,12 +1193,12 @@ void trace_function_return(struct ftrace_retfunc *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_function_return(tr, data, trace, flags, pc); + __trace_function_graph(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); raw_local_irq_restore(flags); } -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ static struct ftrace_ops trace_ops __read_mostly = { @@ -2001,7 +2001,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) break; } case TRACE_FN_RET: { - return print_return_function(iter); + return print_graph_function(iter); break; } case TRACE_BRANCH: { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3abd645..72b5ef8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -57,7 +57,7 @@ struct ftrace_entry { }; /* Function return entry */ -struct ftrace_ret_entry { +struct ftrace_graph_entry { struct trace_entry ent; unsigned long ip; unsigned long parent_ip; @@ -264,7 +264,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ - IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ + IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\ IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ __ftrace_bad_type(); \ } while (0) @@ -398,7 +398,7 @@ void trace_function(struct trace_array *tr, unsigned long parent_ip, unsigned long flags, int pc); void -trace_function_return(struct ftrace_retfunc *trace); +trace_function_graph(struct ftrace_graph_ret *trace); void trace_bts(struct trace_array *tr, unsigned long from, @@ -489,11 +489,11 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ -#ifdef CONFIG_FUNCTION_RET_TRACER -extern enum print_line_t print_return_function(struct trace_iterator *iter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern enum print_line_t print_graph_function(struct trace_iterator *iter); #else static inline enum print_line_t -print_return_function(struct trace_iterator *iter) +print_graph_function(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c new file mode 100644 index 0000000..f5bad46 --- /dev/null +++ b/kernel/trace/trace_functions_graph.c @@ -0,0 +1,98 @@ +/* + * + * Function graph tracer. + * Copyright (c) 2008 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * + */ +#include +#include +#include +#include + +#include "trace.h" + + +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +static struct tracer_opt trace_opts[] = { + /* Display overruns or not */ + { TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, /* Don't display overruns by default */ + .opts = trace_opts +}; + + +static int graph_trace_init(struct trace_array *tr) +{ + int cpu; + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + return register_ftrace_graph(&trace_function_graph); +} + +static void graph_trace_reset(struct trace_array *tr) +{ + unregister_ftrace_graph(); +} + + +enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct ftrace_graph_entry *field; + int ret; + + if (entry->type == TRACE_FN_RET) { + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = seq_print_ip_sym(s, field->ip, + trace_flags & TRACE_ITER_SYM_MASK); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " (%llu ns)", + field->rettime - field->calltime); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)", + field->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer graph_trace __read_mostly = { + .name = "function-graph", + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .flags = &tracer_flags, +}; + +static __init int init_graph_trace(void) +{ + return register_tracer(&graph_trace); +} + +device_initcall(init_graph_trace); -- cgit v0.10.2 From 287b6e68ca7209caec40b2f44f837c580a413bae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 26 Nov 2008 00:57:25 +0100 Subject: tracing/function-return-tracer: set a more human readable output Impact: feature This patch sets a C-like output for the function graph tracing. For this aim, we now call two handler for each function: one on the entry and one other on return. This way we can draw a well-ordered call stack. The pid of the previous trace is loosely stored to be compared against the one of the current trace to see if there were a context switch. Without this little feature, the call tree would seem broken at some locations. We could use the sched_tracer to capture these sched_events but this way of processing is much more simpler. 2 spaces have been chosen for indentation to fit the screen while deep calls. The time of execution in nanosecs is printed just after closed braces, it seems more easy this way to find the corresponding function. If the time was printed as a first column, it would be not so easy to find the corresponding function if it is called on a deep depth. I plan to output the return value but on 32 bits CPU, the return value can be 32 or 64, and its difficult to guess on which case we are. I don't know what would be the better solution on X86-32: only print eax (low-part) or even edx (high-part). Actually it's thee same problem when a function return a 8 bits value, the high part of eax could contain junk values... Here is an example of trace: sys_read() { fget_light() { } 526 vfs_read() { rw_verify_area() { security_file_permission() { cap_file_permission() { } 519 } 1564 } 2640 do_sync_read() { pipe_read() { __might_sleep() { } 511 pipe_wait() { prepare_to_wait() { } 760 deactivate_task() { dequeue_task() { dequeue_task_fair() { dequeue_entity() { update_curr() { update_min_vruntime() { } 504 } 1587 clear_buddies() { } 512 add_cfs_task_weight() { } 519 update_min_vruntime() { } 511 } 5602 dequeue_entity() { update_curr() { update_min_vruntime() { } 496 } 1631 clear_buddies() { } 496 update_min_vruntime() { } 527 } 4580 hrtick_update() { hrtick_start_fair() { } 488 } 1489 } 13700 } 14949 } 16016 msecs_to_jiffies() { } 496 put_prev_task_fair() { } 504 pick_next_task_fair() { } 489 pick_next_task_rt() { } 496 pick_next_task_fair() { } 489 pick_next_task_idle() { } 489 ------------8<---------- thread 4 ------------8<---------- finish_task_switch() { } 1203 do_softirq() { __do_softirq() { __local_bh_disable() { } 669 rcu_process_callbacks() { __rcu_process_callbacks() { cpu_quiet() { rcu_start_batch() { } 503 } 1647 } 3128 __rcu_process_callbacks() { } 542 } 5362 _local_bh_enable() { } 587 } 8880 } 9986 kthread_should_stop() { } 669 deactivate_task() { dequeue_task() { dequeue_task_fair() { dequeue_entity() { update_curr() { calc_delta_mine() { } 511 update_min_vruntime() { } 511 } 2813 Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3595a4c..26b2d92 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -347,7 +347,7 @@ void ftrace_nmi_exit(void) /* Add a function return address to the trace stack on thread info.*/ static int push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func) + unsigned long func, int *depth) { int index; @@ -365,21 +365,22 @@ static int push_return_trace(unsigned long ret, unsigned long long time, current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = time; + *depth = index; return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ -static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func, unsigned long *overrun) +static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; index = current->curr_ret_stack; *ret = current->ret_stack[index].ret; - *func = current->ret_stack[index].func; - *time = current->ret_stack[index].calltime; - *overrun = atomic_read(¤t->trace_overrun); + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = index; current->curr_ret_stack--; } @@ -390,12 +391,13 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, unsigned long ftrace_return_to_handler(void) { struct ftrace_graph_ret trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func, - &trace.overrun); + unsigned long ret; + + pop_return_trace(&trace, &ret); trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_graph_function(&trace); + ftrace_graph_return(&trace); - return trace.ret; + return ret; } /* @@ -407,6 +409,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) unsigned long old; unsigned long long calltime; int faulted; + struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) &return_to_handler; @@ -452,8 +455,15 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) calltime = cpu_clock(raw_smp_processor_id()); - if (push_return_trace(old, calltime, self_addr) == -EBUSY) + if (push_return_trace(old, calltime, + self_addr, &trace.depth) == -EBUSY) { *parent = old; + return; + } + + trace.func = self_addr; + ftrace_graph_entry(&trace); + } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b4ac734..fc2d549 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -313,26 +313,39 @@ ftrace_init_module(struct module *mod, /* + * Structure that defines an entry function trace. + */ +struct ftrace_graph_ent { + unsigned long func; /* Current function */ + int depth; +}; + +/* * Structure that defines a return function trace. */ struct ftrace_graph_ret { - unsigned long ret; /* Return address */ unsigned long func; /* Current function */ unsigned long long calltime; unsigned long long rettime; /* Number of functions that overran the depth limit for current task */ unsigned long overrun; + int depth; }; #ifdef CONFIG_FUNCTION_GRAPH_TRACER #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 -/* Type of a callback handler of tracing return function */ -typedef void (*trace_function_graph_t)(struct ftrace_graph_ret *); +/* Type of the callback handlers for tracing function graph*/ +typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ +typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ + +extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc); + +/* The current handlers in use */ +extern trace_func_graph_ret_t ftrace_graph_return; +extern trace_func_graph_ent_t ftrace_graph_entry; -extern int register_ftrace_graph(trace_function_graph_t func); -/* The current handler in use */ -extern trace_function_graph_t ftrace_graph_function; extern void unregister_ftrace_graph(void); extern void ftrace_graph_init_task(struct task_struct *t); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e19976..7e2d3b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1498,12 +1498,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static atomic_t ftrace_retfunc_active; - -/* The callback that hooks the return of a function */ -trace_function_graph_t ftrace_graph_function = - (trace_function_graph_t)ftrace_stub; +static atomic_t ftrace_graph_active; +/* The callbacks that hook a function */ +trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = + (trace_func_graph_ent_t)ftrace_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -1569,7 +1570,8 @@ static int start_graph_tracing(void) return ret; } -int register_ftrace_graph(trace_function_graph_t func) +int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) { int ret = 0; @@ -1583,14 +1585,15 @@ int register_ftrace_graph(trace_function_graph_t func) ret = -EBUSY; goto out; } - atomic_inc(&ftrace_retfunc_active); + atomic_inc(&ftrace_graph_active); ret = start_graph_tracing(); if (ret) { - atomic_dec(&ftrace_retfunc_active); + atomic_dec(&ftrace_graph_active); goto out; } ftrace_tracing_type = FTRACE_TYPE_RETURN; - ftrace_graph_function = func; + ftrace_graph_return = retfunc; + ftrace_graph_entry = entryfunc; ftrace_startup(); out: @@ -1602,8 +1605,9 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_sysctl_lock); - atomic_dec(&ftrace_retfunc_active); - ftrace_graph_function = (trace_function_graph_t)ftrace_stub; + atomic_dec(&ftrace_graph_active); + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub; ftrace_shutdown(); /* Restore normal tracing type */ ftrace_tracing_type = FTRACE_TYPE_ENTER; @@ -1614,7 +1618,7 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { - if (atomic_read(&ftrace_retfunc_active)) { + if (atomic_read(&ftrace_graph_active)) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); @@ -1638,5 +1642,3 @@ void ftrace_graph_exit_task(struct task_struct *t) } #endif - - diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f21ab2c..9d5f7c9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -879,14 +879,38 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void __trace_function_graph(struct trace_array *tr, +static void __trace_graph_entry(struct trace_array *tr, + struct trace_array_cpu *data, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ring_buffer_event *event; + struct ftrace_graph_ent_entry *entry; + unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_GRAPH_ENT; + entry->graph_ent = *trace; + ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); +} + +static void __trace_graph_return(struct trace_array *tr, struct trace_array_cpu *data, struct ftrace_graph_ret *trace, unsigned long flags, int pc) { struct ring_buffer_event *event; - struct ftrace_graph_entry *entry; + struct ftrace_graph_ret_entry *entry; unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) @@ -898,12 +922,8 @@ static void __trace_function_graph(struct trace_array *tr, return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_FN_RET; - entry->ip = trace->func; - entry->parent_ip = trace->ret; - entry->rettime = trace->rettime; - entry->calltime = trace->calltime; - entry->overrun = trace->overrun; + entry->ent.type = TRACE_GRAPH_RET; + entry->ret = *trace; ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); } #endif @@ -1178,7 +1198,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -void trace_function_graph(struct ftrace_graph_ret *trace) +void trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1193,7 +1213,28 @@ void trace_function_graph(struct ftrace_graph_ret *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_function_graph(tr, data, trace, flags, pc); + __trace_graph_entry(tr, data, trace, flags, pc); + } + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); raw_local_irq_restore(flags); @@ -2000,9 +2041,11 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_seq_print_cont(s, iter); break; } - case TRACE_FN_RET: { + case TRACE_GRAPH_RET: { + return print_graph_function(iter); + } + case TRACE_GRAPH_ENT: { return print_graph_function(iter); - break; } case TRACE_BRANCH: { struct trace_branch *field; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 72b5ef8..ffe1bb1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -25,7 +25,8 @@ enum trace_type { TRACE_BRANCH, TRACE_BOOT_CALL, TRACE_BOOT_RET, - TRACE_FN_RET, + TRACE_GRAPH_RET, + TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_BTS, @@ -56,14 +57,16 @@ struct ftrace_entry { unsigned long parent_ip; }; +/* Function call entry */ +struct ftrace_graph_ent_entry { + struct trace_entry ent; + struct ftrace_graph_ent graph_ent; +}; + /* Function return entry */ -struct ftrace_graph_entry { - struct trace_entry ent; - unsigned long ip; - unsigned long parent_ip; - unsigned long long calltime; - unsigned long long rettime; - unsigned long overrun; +struct ftrace_graph_ret_entry { + struct trace_entry ent; + struct ftrace_graph_ret ret; }; extern struct tracer boot_tracer; @@ -264,7 +267,10 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ - IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\ + IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ + TRACE_GRAPH_ENT); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ + TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ __ftrace_bad_type(); \ } while (0) @@ -397,9 +403,9 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); -void -trace_function_graph(struct ftrace_graph_ret *trace); +void trace_graph_return(struct ftrace_graph_ret *trace); +void trace_graph_entry(struct ftrace_graph_ent *trace); void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f5bad46..b6f0cc2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -13,6 +13,7 @@ #include "trace.h" +#define TRACE_GRAPH_INDENT 2 #define TRACE_GRAPH_PRINT_OVERRUN 0x1 static struct tracer_opt trace_opts[] = { @@ -26,6 +27,8 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; +/* pid on the last trace processed */ +static pid_t last_pid = -1; static int graph_trace_init(struct trace_array *tr) { @@ -33,7 +36,8 @@ static int graph_trace_init(struct trace_array *tr) for_each_online_cpu(cpu) tracing_reset(tr, cpu); - return register_ftrace_graph(&trace_function_graph); + return register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); } static void graph_trace_reset(struct trace_array *tr) @@ -41,45 +45,97 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } +/* If the pid changed since the last trace, output this event */ +static int verif_pid(struct trace_seq *s, pid_t pid) +{ + if (last_pid != -1 && last_pid == pid) + return 1; -enum print_line_t -print_graph_function(struct trace_iterator *iter) + last_pid = pid; + return trace_seq_printf(s, "\n------------8<---------- thread %d" + " ------------8<----------\n\n", + pid); +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, + struct trace_entry *ent) { - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct ftrace_graph_entry *field; + int i; int ret; - if (entry->type == TRACE_FN_RET) { - trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (!verif_pid(s, ent->pid)) + return TRACE_TYPE_PARTIAL_LINE; - ret = seq_print_ip_sym(s, field->ip, - trace_flags & TRACE_ITER_SYM_MASK); + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + } + + ret = seq_print_ip_sym(s, call->func, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "() {\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, + struct trace_entry *ent) +{ + int i; + int ret; + + if (!verif_pid(s, ent->pid)) + return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " (%llu ns)", - field->rettime - field->calltime); + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "} "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { - ret = trace_seq_printf(s, " (Overruns: %lu)", - field->overrun); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = trace_seq_printf(s, "%llu\n", trace->rettime - trace->calltime); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "\n"); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)\n", + trace->overrun); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + +enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; - return TRACE_TYPE_HANDLED; + switch (entry->type) { + case TRACE_GRAPH_ENT: { + struct ftrace_graph_ent_entry *field; + trace_assign_type(field, entry); + return print_graph_entry(&field->graph_ent, s, entry); + } + case TRACE_GRAPH_RET: { + struct ftrace_graph_ret_entry *field; + trace_assign_type(field, entry); + return print_graph_return(&field->ret, s, entry); + } + default: + return TRACE_TYPE_UNHANDLED; } - return TRACE_TYPE_UNHANDLED; } static struct tracer graph_trace __read_mostly = { diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c deleted file mode 100644 index e00d645..0000000 --- a/kernel/trace/trace_functions_return.c +++ /dev/null @@ -1,98 +0,0 @@ -/* - * - * Function return tracer. - * Copyright (c) 2008 Frederic Weisbecker - * Mostly borrowed from function tracer which - * is Copyright (c) Steven Rostedt - * - */ -#include -#include -#include -#include - -#include "trace.h" - - -#define TRACE_RETURN_PRINT_OVERRUN 0x1 -static struct tracer_opt trace_opts[] = { - /* Display overruns or not */ - { TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) }, - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - .val = 0, /* Don't display overruns by default */ - .opts = trace_opts -}; - - -static int return_trace_init(struct trace_array *tr) -{ - int cpu; - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - - return register_ftrace_return(&trace_function_return); -} - -static void return_trace_reset(struct trace_array *tr) -{ - unregister_ftrace_return(); -} - - -enum print_line_t -print_return_function(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct ftrace_ret_entry *field; - int ret; - - if (entry->type == TRACE_FN_RET) { - trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = seq_print_ip_sym(s, field->ip, - trace_flags & TRACE_ITER_SYM_MASK); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " (%llu ns)", - field->rettime - field->calltime); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) { - ret = trace_seq_printf(s, " (Overruns: %lu)", - field->overrun); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - ret = trace_seq_printf(s, "\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -static struct tracer return_trace __read_mostly = { - .name = "return", - .init = return_trace_init, - .reset = return_trace_reset, - .print_line = print_return_function, - .flags = &tracer_flags, -}; - -static __init int init_return_trace(void) -{ - return register_tracer(&return_trace); -} - -device_initcall(init_return_trace); -- cgit v0.10.2 From c2324b694fa8ffee382a124198c68754088e483c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 26 Nov 2008 03:10:01 +0100 Subject: tracing: function graph tracer, fix fix return-tracer => graph-tracer namespace rename fallout. Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2b1f0f0..7def9fd 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1189,7 +1189,7 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace #ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_function + cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller #endif .globl ftrace_stub -- cgit v0.10.2 From d144d5ee6a265823d39f75ecfed351a516295183 Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Wed, 26 Nov 2008 10:29:26 +0800 Subject: ftrace: adding other non-leaving .text sections Impact: widen the scope of recordmcount.pl Besides .text section, there are three .text sections that won't be freed after kernel booting. They are: .sched.text, .spinlock.text and .kprobes.text, which contain functions we can trace. But the last section ".kprobes.text" is particular, which has been marked as "notrace", we ignore it. Thus we add other two sections. Signed-off-by: Liming Wang Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 0197e2f..0b1dc9f 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -112,6 +112,8 @@ my ($arch, $bits, $objdump, $objcopy, $cc, # Acceptable sections to record. my %text_sections = ( ".text" => 1, + ".sched.text" => 1, + ".spinlock.text" => 1, ); $objdump = "objdump" if ((length $objdump) == 0); -- cgit v0.10.2 From df4fc31558dd2a3a30292ddb3a64c2a5befcec73 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:23 -0500 Subject: ftrace: add function tracing to single thread Impact: feature to function trace a single thread This patch adds the ability to function trace a single thread. The file: /debugfs/tracing/set_ftrace_pid contains the pid to trace. Valid pids are any positive integer. Writing any negative number to this file will disable the pid tracing and the function tracer will go back to tracing all of threads. This feature works with both static and dynamic function tracing. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 35a78bc..de05042 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -127,6 +127,8 @@ of ftrace. Here is a list of some of the key files: be traced. If a function exists in both set_ftrace_filter and set_ftrace_notrace, the function will _not_ be traced. + set_ftrace_pid: Have the function tracer only trace a single thread. + available_filter_functions: This lists the functions that ftrace has processed and can trace. These are the function names that you can pass to "set_ftrace_filter" or @@ -1073,6 +1075,83 @@ For simple one time traces, the above is sufficent. For anything else, a search through /proc/mounts may be needed to find where the debugfs file-system is mounted. + +Single thread tracing +--------------------- + +By writing into /debug/tracing/set_ftrace_pid you can trace a +single thread. For example: + +# cat /debug/tracing/set_ftrace_pid +no pid +# echo 3111 > /debug/tracing/set_ftrace_pid +# cat /debug/tracing/set_ftrace_pid +3111 +# echo function > /debug/tracing/current_tracer +# cat /debug/tracing/trace | head + # tracer: function + # + # TASK-PID CPU# TIMESTAMP FUNCTION + # | | | | | + yum-updatesd-3111 [003] 1637.254676: finish_task_switch <-thread_return + yum-updatesd-3111 [003] 1637.254681: hrtimer_cancel <-schedule_hrtimeout_range + yum-updatesd-3111 [003] 1637.254682: hrtimer_try_to_cancel <-hrtimer_cancel + yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel + yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll + yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll +# echo -1 > /debug/tracing/set_ftrace_pid +# cat /debug/tracing/trace |head + # tracer: function + # + # TASK-PID CPU# TIMESTAMP FUNCTION + # | | | | | + ##### CPU 3 buffer started #### + yum-updatesd-3111 [003] 1701.957688: free_poll_entry <-poll_freewait + yum-updatesd-3111 [003] 1701.957689: remove_wait_queue <-free_poll_entry + yum-updatesd-3111 [003] 1701.957691: fput <-free_poll_entry + yum-updatesd-3111 [003] 1701.957692: audit_syscall_exit <-sysret_audit + yum-updatesd-3111 [003] 1701.957693: path_put <-audit_syscall_exit + +If you want to trace a function when executing, you could use +something like this simple program: + +#include +#include +#include +#include +#include +#include + +int main (int argc, char **argv) +{ + if (argc < 1) + exit(-1); + + if (fork() > 0) { + int fd, ffd; + char line[64]; + int s; + + ffd = open("/debug/tracing/current_tracer", O_WRONLY); + if (ffd < 0) + exit(-1); + write(ffd, "nop", 3); + + fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY); + s = sprintf(line, "%d\n", getpid()); + write(fd, line, s); + + write(ffd, "function", 8); + + close(fd); + close(ffd); + + execvp(argv[1], argv+1); + } + + return 0; +} + dynamic ftrace -------------- diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7e2d3b9..00d98c6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -47,6 +47,9 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* ftrace_pid_trace >= 0 will only trace threads with this pid */ +static int ftrace_pid_trace = -1; + /* Quick disabling of function tracer. */ int function_trace_stop; @@ -61,6 +64,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); +static DEFINE_MUTEX(ftrace_start_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { @@ -70,6 +74,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { @@ -86,6 +91,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) }; } +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +{ + if (current->pid != ftrace_pid_trace) + return; + + ftrace_pid_function(ip, parent_ip); +} + +static void set_ftrace_pid_function(ftrace_func_t func) +{ + /* do not set ftrace_pid_function to itself! */ + if (func != ftrace_pid_func) + ftrace_pid_function = func; +} + /** * clear_ftrace_function - reset the ftrace function * @@ -96,6 +116,7 @@ void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; __ftrace_trace_function = ftrace_stub; + ftrace_pid_function = ftrace_stub; } #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST @@ -128,20 +149,26 @@ static int __register_ftrace_function(struct ftrace_ops *ops) ftrace_list = ops; if (ftrace_enabled) { + ftrace_func_t func; + + if (ops->next == &ftrace_list_end) + func = ops->func; + else + func = ftrace_list_func; + + if (ftrace_pid_trace >= 0) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + /* * For one func, simply call it directly. * For more than one func, call the chain. */ #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - if (ops->next == &ftrace_list_end) - ftrace_trace_function = ops->func; - else - ftrace_trace_function = ftrace_list_func; + ftrace_trace_function = func; #else - if (ops->next == &ftrace_list_end) - __ftrace_trace_function = ops->func; - else - __ftrace_trace_function = ftrace_list_func; + __ftrace_trace_function = func; ftrace_trace_function = ftrace_test_stop_func; #endif } @@ -182,8 +209,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) { /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_list->next == &ftrace_list_end) { + ftrace_func_t func = ftrace_list->func; + + if (ftrace_pid_trace >= 0) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; +#endif + } } out: @@ -192,6 +230,38 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return ret; } +static void ftrace_update_pid_func(void) +{ + ftrace_func_t func; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + if (ftrace_trace_function == ftrace_stub) + goto out; + + func = ftrace_trace_function; + + if (ftrace_pid_trace >= 0) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } else { + if (func != ftrace_pid_func) + goto out; + + set_ftrace_pid_function(func); + } + +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; +#endif + + out: + spin_unlock(&ftrace_lock); +} + #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD @@ -545,7 +615,19 @@ static void ftrace_run_update_code(int command) static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; -static DEFINE_MUTEX(ftrace_start_lock); + +static void ftrace_startup_enable(int command) +{ + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + return; + + ftrace_run_update_code(command); +} static void ftrace_startup(void) { @@ -558,16 +640,8 @@ static void ftrace_startup(void) ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; - if (saved_ftrace_func != ftrace_trace_function) { - saved_ftrace_func = ftrace_trace_function; - command |= FTRACE_UPDATE_TRACE_FUNC; - } - - if (!command || !ftrace_enabled) - goto out; + ftrace_startup_enable(command); - ftrace_run_update_code(command); - out: mutex_unlock(&ftrace_start_lock); } @@ -1262,13 +1336,10 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; -static __init int ftrace_init_debugfs(void) +static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { - struct dentry *d_tracer; struct dentry *entry; - d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("available_filter_functions", 0444, d_tracer, NULL, &ftrace_avail_fops); if (!entry) @@ -1295,8 +1366,6 @@ static __init int ftrace_init_debugfs(void) return 0; } -fs_initcall(ftrace_init_debugfs); - static int ftrace_convert_nops(struct module *mod, unsigned long *start, unsigned long *end) @@ -1382,12 +1451,100 @@ static int __init ftrace_nodyn_init(void) } device_initcall(ftrace_nodyn_init); +static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } +static inline void ftrace_startup_enable(int command) { } # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +static ssize_t +ftrace_pid_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + if (ftrace_pid_trace >= 0) + r = sprintf(buf, "%u\n", ftrace_pid_trace); + else + r = sprintf(buf, "no pid\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtol(buf, 10, &val); + if (ret < 0) + return ret; + + mutex_lock(&ftrace_start_lock); + if (ret < 0) { + /* disable pid tracing */ + if (ftrace_pid_trace < 0) + goto out; + ftrace_pid_trace = -1; + + } else { + + if (ftrace_pid_trace == val) + goto out; + + ftrace_pid_trace = val; + } + + /* update the function call */ + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + out: + mutex_unlock(&ftrace_start_lock); + + return cnt; +} + +static struct file_operations ftrace_pid_fops = { + .read = ftrace_pid_read, + .write = ftrace_pid_write, +}; + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + ftrace_init_dyn_debugfs(d_tracer); + + entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_pid' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + /** * ftrace_kill - kill ftrace * -- cgit v0.10.2 From 5a45cfe1c64862e8cd3b0d79d7c4ba71c3118915 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:24 -0500 Subject: ftrace: use code patching for ftrace graph tracer Impact: more efficient code for ftrace graph tracer This patch uses the dynamic patching, when available, to patch the function graph code into the kernel. This patch will ease the way for letting both function tracing and function graph tracing run together. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 7def9fd..958af86 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1174,6 +1174,11 @@ ftrace_call: popl %edx popl %ecx popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 26b2d92..7ef914e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -111,7 +111,6 @@ static void ftrace_mod_code(void) */ mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, MCOUNT_INSN_SIZE); - } void ftrace_nmi_enter(void) @@ -325,7 +324,51 @@ int __init ftrace_dyn_arch_init(void *data) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -#ifndef CONFIG_DYNAMIC_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); + +static int ftrace_mod_jmp(unsigned long ip, + int old_offset, int new_offset) +{ + unsigned char code[MCOUNT_INSN_SIZE]; + + if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (code[0] != 0xe9 || old_offset != *(int *)(&code[1])) + return -EINVAL; + + *(int *)(&code[1]) = new_offset; + + if (do_ftrace_mod_code(ip, &code)) + return -EPERM; + + return 0; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +#else /* CONFIG_DYNAMIC_FTRACE */ /* * These functions are picked from those used on @@ -343,6 +386,7 @@ void ftrace_nmi_exit(void) { atomic_dec(&in_nmi); } + #endif /* !CONFIG_DYNAMIC_FTRACE */ /* Add a function return address to the trace stack on thread info.*/ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fc2d549..f9792c0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -117,6 +117,11 @@ extern void ftrace_call(void); extern void mcount_call(void); #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern void ftrace_graph_caller(void); +extern int ftrace_enable_ftrace_graph_caller(void); +extern int ftrace_disable_ftrace_graph_caller(void); +#else +static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; } +static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } #endif /** diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 00d98c6..5f7c864 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -281,6 +281,8 @@ enum { FTRACE_UPDATE_TRACE_FUNC = (1 << 2), FTRACE_ENABLE_MCOUNT = (1 << 3), FTRACE_DISABLE_MCOUNT = (1 << 4), + FTRACE_START_FUNC_RET = (1 << 5), + FTRACE_STOP_FUNC_RET = (1 << 6), }; static int ftrace_filtered; @@ -465,14 +467,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (ftrace_tracing_type == FTRACE_TYPE_ENTER) - ftrace_addr = (unsigned long)ftrace_caller; - else - ftrace_addr = (unsigned long)ftrace_graph_caller; -#else ftrace_addr = (unsigned long)ftrace_caller; -#endif ip = rec->ip; @@ -605,6 +600,11 @@ static int __ftrace_modify_code(void *data) if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); + if (*command & FTRACE_START_FUNC_RET) + ftrace_enable_ftrace_graph_caller(); + else if (*command & FTRACE_STOP_FUNC_RET) + ftrace_disable_ftrace_graph_caller(); + return 0; } @@ -629,10 +629,8 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } -static void ftrace_startup(void) +static void ftrace_startup(int command) { - int command = 0; - if (unlikely(ftrace_disabled)) return; @@ -645,10 +643,8 @@ static void ftrace_startup(void) mutex_unlock(&ftrace_start_lock); } -static void ftrace_shutdown(void) +static void ftrace_shutdown(int command) { - int command = 0; - if (unlikely(ftrace_disabled)) return; @@ -1453,8 +1449,9 @@ device_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) +/* Keep as macros so we do not need to define the commands */ +# define ftrace_startup(command) do { } while (0) +# define ftrace_shutdown(command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ @@ -1585,7 +1582,7 @@ int register_ftrace_function(struct ftrace_ops *ops) } ret = __register_ftrace_function(ops); - ftrace_startup(); + ftrace_startup(0); out: mutex_unlock(&ftrace_sysctl_lock); @@ -1604,7 +1601,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - ftrace_shutdown(); + ftrace_shutdown(0); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -1751,7 +1748,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_tracing_type = FTRACE_TYPE_RETURN; ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; - ftrace_startup(); + ftrace_startup(FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_sysctl_lock); @@ -1765,7 +1762,7 @@ void unregister_ftrace_graph(void) atomic_dec(&ftrace_graph_active); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub; - ftrace_shutdown(); + ftrace_shutdown(FTRACE_STOP_FUNC_RET); /* Restore normal tracing type */ ftrace_tracing_type = FTRACE_TYPE_ENTER; -- cgit v0.10.2 From e53a6319cca69111c1643dc9f18f4465d7f1cbf0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:25 -0500 Subject: ftrace: let function tracing and function return run together Impact: feature This patch enables function tracing and function return to run together. I've tested this by enabling the stack tracer and return tracer, where both the function entry and function return are used together with dynamic ftrace. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5f7c864..cbf8b09 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -53,9 +53,6 @@ static int ftrace_pid_trace = -1; /* Quick disabling of function tracer. */ int function_trace_stop; -/* By default, current tracing type is normal tracing. */ -enum ftrace_tracing_type_t ftrace_tracing_type = FTRACE_TYPE_ENTER; - /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -1576,15 +1573,9 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); - if (ftrace_tracing_type == FTRACE_TYPE_RETURN) { - ret = -EBUSY; - goto out; - } - ret = __register_ftrace_function(ops); ftrace_startup(0); -out: mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1731,23 +1722,16 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_sysctl_lock); - /* - * Don't launch return tracing if normal function - * tracing is already running. - */ - if (ftrace_trace_function != ftrace_stub) { - ret = -EBUSY; - goto out; - } atomic_inc(&ftrace_graph_active); ret = start_graph_tracing(); if (ret) { atomic_dec(&ftrace_graph_active); goto out; } - ftrace_tracing_type = FTRACE_TYPE_RETURN; + ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; + ftrace_startup(FTRACE_START_FUNC_RET); out: @@ -1763,8 +1747,6 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); - /* Restore normal tracing type */ - ftrace_tracing_type = FTRACE_TYPE_ENTER; mutex_unlock(&ftrace_sysctl_lock); } -- cgit v0.10.2 From 660c7f9be96321fc80026d76411bd15e6f418a72 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:26 -0500 Subject: ftrace: add thread comm to function graph tracer Impact: enhancement to function graph tracer Export the trace_find_cmdline so the function graph tracer can use it to print the comms of the threads. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9d5f7c9..5811e0a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -804,7 +804,7 @@ static void trace_save_cmdline(struct task_struct *tsk) spin_unlock(&trace_cmdline_lock); } -static char *trace_find_cmdline(int pid) +char *trace_find_cmdline(int pid) { char *cmdline = "<...>"; unsigned map; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ffe1bb1..7adacf3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -450,6 +450,7 @@ struct tracer_switch_ops { struct tracer_switch_ops *next; }; +char *trace_find_cmdline(int pid); #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b6f0cc2..bbb81e7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -32,29 +32,40 @@ static pid_t last_pid = -1; static int graph_trace_init(struct trace_array *tr) { - int cpu; + int cpu, ret; + for_each_online_cpu(cpu) tracing_reset(tr, cpu); - return register_ftrace_graph(&trace_graph_return, + ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); + if (ret) + return ret; + tracing_start_cmdline_record(); + + return 0; } static void graph_trace_reset(struct trace_array *tr) { - unregister_ftrace_graph(); + tracing_stop_cmdline_record(); + unregister_ftrace_graph(); } /* If the pid changed since the last trace, output this event */ static int verif_pid(struct trace_seq *s, pid_t pid) { + char *comm; + if (last_pid != -1 && last_pid == pid) return 1; last_pid = pid; - return trace_seq_printf(s, "\n------------8<---------- thread %d" + comm = trace_find_cmdline(pid); + + return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" " ------------8<----------\n\n", - pid); + comm, pid); } static enum print_line_t -- cgit v0.10.2 From 437f24fb897d409a9978eb71ecfaf279dcd94acd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 00:16:27 -0500 Subject: ftrace: add cpu annotation for function graph tracer Impact: enhancement for function graph tracer When run on a SMP box, the function graph tracer is confusing because it shows the different CPUS as changes in the trace. This patch adds the annotation of 'CPU[###]' where ### is a three digit number. The output will look similar to this: CPU[001] dput() { CPU[000] } 726 CPU[001] } 487 CPU[000] do_softirq() { CPU[001] } 2221 CPU[000] __do_softirq() { CPU[000] __local_bh_disable() { CPU[001] unroll_tree_refs() { CPU[000] } 569 CPU[001] } 501 CPU[000] rcu_process_callbacks() { CPU[001] kfree() { What makes this nice is that now you can grep the file and produce readable format for a particular CPU. # cat /debug/tracing/trace > /tmp/trace # grep '^CPU\[000\]' /tmp/trace > /tmp/trace0 # grep '^CPU\[001\]' /tmp/trace > /tmp/trace1 Will give you: # head /tmp/trace0 CPU[000] ------------8<---------- thread sshd-3899 ------------8<---------- CPU[000] inotify_dentry_parent_queue_event() { CPU[000] } 2531 CPU[000] inotify_inode_queue_event() { CPU[000] } 505 CPU[000] } 69626 CPU[000] } 73089 CPU[000] audit_syscall_exit() { CPU[000] path_put() { CPU[000] dput() { # head /tmp/trace1 CPU[001] ------------8<---------- thread pcscd-3446 ------------8<---------- CPU[001] } 4186 CPU[001] dput() { CPU[001] } 543 CPU[001] vfs_permission() { CPU[001] inode_permission() { CPU[001] shmem_permission() { CPU[001] generic_permission() { CPU[001] } 501 CPU[001] } 2205 Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index bbb81e7..d31d695 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -28,7 +28,7 @@ static struct tracer_flags tracer_flags = { }; /* pid on the last trace processed */ -static pid_t last_pid = -1; +static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; static int graph_trace_init(struct trace_array *tr) { @@ -53,29 +53,34 @@ static void graph_trace_reset(struct trace_array *tr) } /* If the pid changed since the last trace, output this event */ -static int verif_pid(struct trace_seq *s, pid_t pid) +static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) { char *comm; - if (last_pid != -1 && last_pid == pid) + if (last_pid[cpu] != -1 && last_pid[cpu] == pid) return 1; - last_pid = pid; + last_pid[cpu] = pid; comm = trace_find_cmdline(pid); - return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" + return trace_seq_printf(s, "\nCPU[%03d]" + " ------------8<---------- thread %s-%d" " ------------8<----------\n\n", - comm, pid); + cpu, comm, pid); } static enum print_line_t print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, - struct trace_entry *ent) + struct trace_entry *ent, int cpu) { int i; int ret; - if (!verif_pid(s, ent->pid)) + if (!verif_pid(s, ent->pid, cpu)) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + if (!ret) return TRACE_TYPE_PARTIAL_LINE; for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -96,12 +101,16 @@ print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent) + struct trace_entry *ent, int cpu) { int i; int ret; - if (!verif_pid(s, ent->pid)) + if (!verif_pid(s, ent->pid, cpu)) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + if (!ret) return TRACE_TYPE_PARTIAL_LINE; for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { @@ -137,12 +146,13 @@ print_graph_function(struct trace_iterator *iter) case TRACE_GRAPH_ENT: { struct ftrace_graph_ent_entry *field; trace_assign_type(field, entry); - return print_graph_entry(&field->graph_ent, s, entry); + return print_graph_entry(&field->graph_ent, s, entry, + iter->cpu); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry); + return print_graph_return(&field->ret, s, entry, iter->cpu); } default: return TRACE_TYPE_UNHANDLED; -- cgit v0.10.2 From f3f47a6768a29448866da4422b6f6bee485c947f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 23 Nov 2008 16:49:58 -0800 Subject: tracing: add "power-tracer": C/P state tracer to help power optimization Impact: new "power-tracer" ftrace plugin This patch adds a C/P-state ftrace plugin that will generate detailed statistics about the C/P-states that are being used, so that we can look at detailed decisions that the C/P-state code is making, rather than the too high level "average" that we have today. An example way of using this is: mount -t debugfs none /sys/kernel/debug echo cstate > /sys/kernel/debug/tracing/current_tracer echo 1 > /sys/kernel/debug/tracing/tracing_enabled sleep 1 echo 0 > /sys/kernel/debug/tracing/tracing_enabled cat /sys/kernel/debug/tracing/trace | perl scripts/trace/cstate.pl > out.svg Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8e48c5d..88ea02d 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; + struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } + trace_power_mark(&it, POWER_PSTATE, next_perf_state); + switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772..c27af49 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,6 +7,7 @@ #include #include #include +#include #include unsigned long idle_halt; @@ -100,6 +101,9 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -112,6 +116,7 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(&it); } else { local_irq_enable(); /* loop is done by the caller */ @@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(ax, cx); } + trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { + struct power_trace it; if (!need_resched()) { + trace_power_start(&it, POWER_CSTATE, 1); __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(&it); } else local_irq_enable(); } @@ -183,9 +195,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); + trace_power_end(&it); } /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7854d87..0df2886 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -311,6 +311,35 @@ ftrace_init_module(struct module *mod, unsigned long *start, unsigned long *end) { } #endif +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; + +struct power_trace { +#ifdef CONFIG_POWER_TRACER + ktime_t stamp; + ktime_t end; + int type; + int state; +#endif +}; + +#ifdef CONFIG_POWER_TRACER +extern void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_end(struct power_trace *it); +#else +static inline void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_end(struct power_trace *it) { } +#endif + /* * Structure that defines a return function trace. diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 620fead..d151aab 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -217,6 +217,17 @@ config BRANCH_TRACER Say N if unsure. +config POWER_TRACER + bool "Trace power consumption behavior" + depends on DEBUG_KERNEL + depends on X86 + select TRACING + help + This tracer helps developers to analyze and optimize the kernels + power management decisions, specifically the C-state and P-state + behavior. + + config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cef4bcb..acaa065 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -32,5 +32,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_BTS_TRACER) += trace_bts.o +obj-$(CONFIG_POWER_TRACER) += trace_power.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3abd645..4c45377 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -28,6 +28,7 @@ enum trace_type { TRACE_FN_RET, TRACE_USER_STACK, TRACE_BTS, + TRACE_POWER, __TRACE_LAST_TYPE }; @@ -160,6 +161,11 @@ struct bts_entry { unsigned long to; }; +struct trace_power { + struct trace_entry ent; + struct power_trace state_data; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -266,6 +272,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ + IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c new file mode 100644 index 0000000..a7172a3 --- /dev/null +++ b/kernel/trace/trace_power.c @@ -0,0 +1,179 @@ +/* + * ring buffer based C-state tracer + * + * Arjan van de Ven + * Copyright (C) 2008 Intel Corporation + * + * Much is borrowed from trace_boot.c which is + * Copyright (C) 2008 Frederic Weisbecker + * + */ + +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *power_trace; +static int __read_mostly trace_power_enabled; + + +static void start_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 1; +} + +static void stop_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 0; +} + + +static int power_trace_init(struct trace_array *tr) +{ + int cpu; + power_trace = tr; + + trace_power_enabled = 1; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); + return 0; +} + +static enum print_line_t power_print_line(struct trace_iterator *iter) +{ + int ret = 0; + struct trace_entry *entry = iter->ent; + struct trace_power *field ; + struct power_trace *it; + struct trace_seq *s = &iter->seq; + struct timespec stamp; + struct timespec duration; + + trace_assign_type(field, entry); + it = &field->state_data; + stamp = ktime_to_timespec(it->stamp); + duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); + + if (entry->type == TRACE_POWER) { + if (it->type == POWER_CSTATE) + ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", + stamp.tv_sec, + stamp.tv_nsec, + it->state, iter->cpu, + duration.tv_sec, + duration.tv_nsec); + if (it->type == POWER_PSTATE) + ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", + stamp.tv_sec, + stamp.tv_nsec, + it->state, iter->cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer power_tracer __read_mostly = +{ + .name = "power", + .init = power_trace_init, + .start = start_power_trace, + .stop = stop_power_trace, + .reset = stop_power_trace, + .print_line = power_print_line, +}; + +static int init_power_trace(void) +{ + return register_tracer(&power_tracer); +} +device_initcall(init_power_trace); + +void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int level) +{ + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); +} +EXPORT_SYMBOL_GPL(trace_power_start); + + +void trace_power_end(struct power_trace *it) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + preempt_disable(); + it->end = ktime_get(); + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_POWER; + entry->state_data = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} +EXPORT_SYMBOL_GPL(trace_power_end); + +void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int level) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); + preempt_disable(); + it->end = it->stamp; + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_POWER; + entry->state_data = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} +EXPORT_SYMBOL_GPL(trace_power_mark); diff --git a/scripts/trace/power.pl b/scripts/trace/power.pl new file mode 100644 index 0000000..4f729b3 --- /dev/null +++ b/scripts/trace/power.pl @@ -0,0 +1,108 @@ +#!/usr/bin/perl + +# Copyright 2008, Intel Corporation +# +# This file is part of the Linux kernel +# +# This program file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program in a file named COPYING; if not, write to the +# Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, +# Boston, MA 02110-1301 USA +# +# Authors: +# Arjan van de Ven + + +# +# This script turns a cstate ftrace output into a SVG graphic that shows +# historic C-state information +# +# +# cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg +# + +my @styles; +my $base = 0; + +my @pstate_last; +my @pstate_level; + +$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; +$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; + + +print " \n"; +print "\n"; + +my $scale = 30000.0; +while (<>) { + my $line = $_; + if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) { + if ($base == 0) { + $base = $1; + } + my $time = $1 - $base; + $time = $time * $scale; + my $C = $2; + my $cpu = $3; + my $y = 400 * $cpu; + my $duration = $4 * $scale; + my $msec = int($4 * 100000)/100.0; + my $height = $C * 20; + $style = $styles[$C]; + + $y = $y + 140 - $height; + + $x2 = $time + 4; + $y2 = $y + 4; + + + print "\n"; + print "C$C $msec\n"; + } + if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) { + my $time = $1 - $base; + my $state = $2; + my $cpu = $3; + + if (defined($pstate_last[$cpu])) { + my $from = $pstate_last[$cpu]; + my $oldstate = $pstate_state[$cpu]; + my $duration = ($time-$from) * $scale; + + $from = $from * $scale; + my $to = $from + $duration; + my $height = 140 - ($oldstate * (140/8)); + + my $y = 400 * $cpu + 200 + $height; + my $y2 = $y+4; + my $style = $styles[8]; + + print "\n"; + print "P$oldstate (cpu $cpu)\n"; + }; + + $pstate_last[$cpu] = $time; + $pstate_state[$cpu] = $state; + } +} + + +print "\n"; -- cgit v0.10.2 From 5f3ea37c7716db4e894a480e0c18b24399595b6b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 30 Oct 2008 08:34:33 +0100 Subject: blktrace: port to tracepoints This was a forward port of work done by Mathieu Desnoyers, I changed it to encode the 'what' parameter on the tracepoint name, so that one can register interest in specific events and not on classes of events to then check the 'what' parameter. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Jens Axboe Signed-off-by: Ingo Molnar diff --git a/block/Kconfig b/block/Kconfig index 1ab7c15..290b219 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -47,6 +47,7 @@ config BLK_DEV_IO_TRACE depends on SYSFS select RELAY select DEBUG_FS + select TRACEPOINTS help Say Y here if you want to be able to trace the block layer actions on a given queue. Tracing allows you to see any traffic happening diff --git a/block/blk-core.c b/block/blk-core.c index 10e8a64..04267d6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "blk.h" @@ -205,7 +206,7 @@ void blk_plug_device(struct request_queue *q) if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); - blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); + trace_block_plug(q); } } EXPORT_SYMBOL(blk_plug_device); @@ -292,9 +293,7 @@ void blk_unplug_work(struct work_struct *work) struct request_queue *q = container_of(work, struct request_queue, unplug_work); - blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, - q->rq.count[READ] + q->rq.count[WRITE]); - + trace_block_unplug_io(q); q->unplug_fn(q); } @@ -302,9 +301,7 @@ void blk_unplug_timeout(unsigned long data) { struct request_queue *q = (struct request_queue *)data; - blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, - q->rq.count[READ] + q->rq.count[WRITE]); - + trace_block_unplug_timer(q); kblockd_schedule_work(q, &q->unplug_work); } @@ -314,9 +311,7 @@ void blk_unplug(struct request_queue *q) * devices don't necessarily have an ->unplug_fn defined */ if (q->unplug_fn) { - blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, - q->rq.count[READ] + q->rq.count[WRITE]); - + trace_block_unplug_io(q); q->unplug_fn(q); } } @@ -822,7 +817,7 @@ rq_starved: if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; - blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); + trace_block_getrq(q, bio, rw); out: return rq; } @@ -848,7 +843,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); + trace_block_sleeprq(q, bio, rw); __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); @@ -928,7 +923,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) { blk_delete_timer(rq); blk_clear_rq_complete(rq); - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); + trace_block_rq_requeue(q, rq); if (blk_rq_tagged(rq)) blk_queue_end_tag(q, rq); @@ -1167,7 +1162,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!ll_back_merge_fn(q, req, bio)) break; - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); + trace_block_bio_backmerge(q, bio); req->biotail->bi_next = bio; req->biotail = bio; @@ -1186,7 +1181,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!ll_front_merge_fn(q, req, bio)) break; - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); + trace_block_bio_frontmerge(q, bio); bio->bi_next = req->bio; req->bio = bio; @@ -1269,7 +1264,7 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; - blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio, + trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, bdev->bd_dev, bio->bi_sector, bio->bi_sector - p->start_sect); } @@ -1441,10 +1436,10 @@ end_io: goto end_io; if (old_sector != -1) - blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, + trace_block_remap(q, bio, old_dev, bio->bi_sector, old_sector); - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); + trace_block_bio_queue(q, bio); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; @@ -1656,7 +1651,7 @@ static int __end_that_request_first(struct request *req, int error, int total_bytes, bio_nbytes, next_idx = 0; struct bio *bio; - blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); + trace_block_rq_complete(req->q, req); /* * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual diff --git a/block/blktrace.c b/block/blktrace.c index 85049a7..b0a2cae 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -23,10 +23,18 @@ #include #include #include +#include #include static unsigned int blktrace_seq __read_mostly = 1; +/* Global reference count of probes */ +static DEFINE_MUTEX(blk_probe_mutex); +static atomic_t blk_probes_ref = ATOMIC_INIT(0); + +static int blk_register_tracepoints(void); +static void blk_unregister_tracepoints(void); + /* * Send out a notify message. */ @@ -119,7 +127,7 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK * The worker for the various blk_add_trace*() types. Fills out a * blk_io_trace structure and places it in a per-cpu subbuffer. */ -void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, +static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int rw, u32 what, int error, int pdu_len, void *pdu_data) { struct task_struct *tsk = current; @@ -177,8 +185,6 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(__blk_add_trace); - static struct dentry *blk_tree_root; static DEFINE_MUTEX(blk_tree_mutex); static unsigned int root_users; @@ -237,6 +243,10 @@ static void blk_trace_cleanup(struct blk_trace *bt) free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); } int blk_trace_remove(struct request_queue *q) @@ -428,6 +438,14 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) { + ret = blk_register_tracepoints(); + if (ret) + goto probe_err; + } + mutex_unlock(&blk_probe_mutex); + ret = -EBUSY; old_bt = xchg(&q->blk_trace, bt); if (old_bt) { @@ -436,6 +454,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; +probe_err: + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); err: if (dir) blk_remove_tree(dir); @@ -562,3 +583,308 @@ void blk_trace_shutdown(struct request_queue *q) blk_trace_remove(q); } } + +/* + * blktrace probes + */ + +/** + * blk_add_trace_rq - Add a trace for a request oriented action + * @q: queue the io is for + * @rq: the source request + * @what: the action + * + * Description: + * Records an action against a request. Will log the bio offset + size. + * + **/ +static void blk_add_trace_rq(struct request_queue *q, struct request *rq, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + int rw = rq->cmd_flags & 0x03; + + if (likely(!bt)) + return; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + if (blk_pc_request(rq)) { + what |= BLK_TC_ACT(BLK_TC_PC); + __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, + sizeof(rq->cmd), rq->cmd); + } else { + what |= BLK_TC_ACT(BLK_TC_FS); + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + rw, what, rq->errors, 0, NULL); + } +} + +static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ABORT); +} + +static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_INSERT); +} + +static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ISSUE); +} + +static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); +} + +static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); +} + +/** + * blk_add_trace_bio - Add a trace for a bio oriented action + * @q: queue the io is for + * @bio: the source bio + * @what: the action + * + * Description: + * Records an action against a bio. Will log the bio offset + size. + * + **/ +static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, + !bio_flagged(bio, BIO_UPTODATE), 0, NULL); +} + +static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); +} + +static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); +} + +static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); +} + +static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); +} + +static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_QUEUE); +} + +static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_GETRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + } +} + + +static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL); + } +} + +static void blk_add_trace_plug(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); +} + +static void blk_add_trace_unplug_io(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_unplug_timer(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_split(struct request_queue *q, struct bio *bio, + unsigned int pdu) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + sizeof(rpdu), &rpdu); + } +} + +/** + * blk_add_trace_remap - Add a trace for a remap operation + * @q: queue the io is for + * @bio: the source bio + * @dev: target device + * @from: source sector + * @to: target sector + * + * Description: + * Device mapper or raid target sometimes need to split a bio because + * it spans a stripe (or similar). Add a trace for that action. + * + **/ +static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from, sector_t to) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device = cpu_to_be32(dev); + r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector = cpu_to_be64(to); + + __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); +} + +/** + * blk_add_driver_data - Add binary message with driver-specific data + * @q: queue the io is for + * @rq: io request + * @data: driver-specific data + * @len: length of driver-specific data + * + * Description: + * Some drivers might want to write driver-specific data per request. + * + **/ +void blk_add_driver_data(struct request_queue *q, + struct request *rq, + void *data, size_t len) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (blk_pc_request(rq)) + __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, + rq->errors, len, data); + else + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + 0, BLK_TA_DRV_DATA, rq->errors, len, data); +} +EXPORT_SYMBOL_GPL(blk_add_driver_data); + +static int blk_register_tracepoints(void) +{ + int ret; + + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); + WARN_ON(ret); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); + WARN_ON(ret); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); + WARN_ON(ret); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); + WARN_ON(ret); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); + WARN_ON(ret); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); + WARN_ON(ret); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); + WARN_ON(ret); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + WARN_ON(ret); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + WARN_ON(ret); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); + WARN_ON(ret); + ret = register_trace_block_getrq(blk_add_trace_getrq); + WARN_ON(ret); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); + WARN_ON(ret); + ret = register_trace_block_plug(blk_add_trace_plug); + WARN_ON(ret); + ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); + WARN_ON(ret); + ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); + WARN_ON(ret); + ret = register_trace_block_split(blk_add_trace_split); + WARN_ON(ret); + ret = register_trace_block_remap(blk_add_trace_remap); + WARN_ON(ret); + return 0; +} + +static void blk_unregister_tracepoints(void) +{ + unregister_trace_block_remap(blk_add_trace_remap); + unregister_trace_block_split(blk_add_trace_split); + unregister_trace_block_unplug_io(blk_add_trace_unplug_io); + unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); + unregister_trace_block_plug(blk_add_trace_plug); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq); + unregister_trace_block_getrq(blk_add_trace_getrq); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort); + + tracepoint_synchronize_unregister(); +} diff --git a/block/elevator.c b/block/elevator.c index 9ac82dd..530fcfe 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -586,7 +587,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) unsigned ordseq; int unplug_it = 1; - blk_add_trace_rq(q, rq, BLK_TA_INSERT); + trace_block_rq_insert(q, rq); rq->q = q; @@ -772,7 +773,7 @@ struct request *elv_next_request(struct request_queue *q) * not be passed by new incoming requests */ rq->cmd_flags |= REQ_STARTED; - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); + trace_block_rq_issue(q, rq); } if (!q->boundary_rq || q->boundary_rq == rq) { @@ -921,7 +922,7 @@ void elv_abort_queue(struct request_queue *q) while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); rq->cmd_flags |= REQ_QUIET; - blk_add_trace_rq(q, rq, BLK_TA_ABORT); + trace_block_rq_abort(q, rq); __blk_end_request(rq, -EIO, blk_rq_bytes(rq)); } } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c99e4728..d23fda1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -21,6 +21,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "core" @@ -504,8 +505,7 @@ static void dec_pending(struct dm_io *io, int error) end_io_acct(io); if (io->error != DM_ENDIO_REQUEUE) { - blk_add_trace_bio(io->md->queue, io->bio, - BLK_TA_COMPLETE); + trace_block_bio_complete(io->md->queue, io->bio); bio_endio(io->bio, io->error); } @@ -598,7 +598,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ - blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, + trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, tio->io->bio->bi_bdev->bd_dev, clone->bi_sector, sector); diff --git a/fs/bio.c b/fs/bio.c index 77a55bc..060859c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -26,6 +26,7 @@ #include #include #include +#include #include /* for struct sg_iovec */ static struct kmem_cache *bio_slab __read_mostly; @@ -1263,7 +1264,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) if (!bp) return bp; - blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi, + trace_block_split(bdev_get_queue(bi->bi_bdev), bi, bi->bi_sector + first_sectors); BUG_ON(bi->bi_vcnt != 1); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index bdf505d..1dba349 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -160,7 +160,6 @@ struct blk_trace { extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); extern int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_user_trace_setup *buts); extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); @@ -186,168 +185,8 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); } while (0) #define BLK_TN_MAX_MSG 128 -/** - * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for - * @rq: the source request - * @what: the action - * - * Description: - * Records an action against a request. Will log the bio offset + size. - * - **/ -static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; - - if (likely(!bt)) - return; - - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); - - if (blk_pc_request(rq)) { - what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd); - } else { - what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL); - } -} - -/** - * blk_add_trace_bio - Add a trace for a bio oriented action - * @q: queue the io is for - * @bio: the source bio - * @what: the action - * - * Description: - * Records an action against a bio. Will log the bio offset + size. - * - **/ -static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); -} - -/** - * blk_add_trace_generic - Add a trace for a generic action - * @q: queue the io is for - * @bio: the source bio - * @rw: the data direction - * @what: the action - * - * Description: - * Records a simple trace - * - **/ -static inline void blk_add_trace_generic(struct request_queue *q, - struct bio *bio, int rw, u32 what) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (bio) - blk_add_trace_bio(q, bio, what); - else - __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL); -} - -/** - * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload - * @q: queue the io is for - * @what: the action - * @bio: the source bio - * @pdu: the integer payload - * - * Description: - * Adds a trace with some integer payload. This might be an unplug - * option given as the action, with the depth at unplug time given - * as the payload - * - **/ -static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what, - struct bio *bio, unsigned int pdu) -{ - struct blk_trace *bt = q->blk_trace; - __be64 rpdu = cpu_to_be64(pdu); - - if (likely(!bt)) - return; - - if (bio) - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); - else - __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); -} - -/** - * blk_add_trace_remap - Add a trace for a remap operation - * @q: queue the io is for - * @bio: the source bio - * @dev: target device - * @from: source sector - * @to: target sector - * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. - * - **/ -static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) -{ - struct blk_trace *bt = q->blk_trace; - struct blk_io_trace_remap r; - - if (likely(!bt)) - return; - - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); - - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); -} - -/** - * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for - * @rq: io request - * @data: driver-specific data - * @len: length of driver-specific data - * - * Description: - * Some drivers might want to write driver-specific data per request. - * - **/ -static inline void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); - else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); -} - +extern void blk_add_driver_data(struct request_queue *q, struct request *rq, + void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); @@ -356,13 +195,8 @@ extern int blk_trace_remove(struct request_queue *q); #else /* !CONFIG_BLK_DEV_IO_TRACE */ #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) #define blk_trace_shutdown(q) do { } while (0) -#define blk_add_trace_rq(q, rq, what) do { } while (0) -#define blk_add_trace_bio(q, rq, what) do { } while (0) -#define blk_add_trace_generic(q, rq, rw, what) do { } while (0) -#define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0) -#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0) -#define blk_add_driver_data(q, rq, data, len) do {} while (0) #define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY) +#define blk_add_driver_data(q, rq, data, len) do {} while (0) #define blk_trace_setup(q, name, dev, arg) (-ENOTTY) #define blk_trace_startstop(q, start) (-ENOTTY) #define blk_trace_remove(q) (-ENOTTY) diff --git a/include/trace/block.h b/include/trace/block.h new file mode 100644 index 0000000..3cc2675 --- /dev/null +++ b/include/trace/block.h @@ -0,0 +1,60 @@ +#ifndef _TRACE_BLOCK_H +#define _TRACE_BLOCK_H + +#include +#include + +DEFINE_TRACE(block_rq_abort, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); +DEFINE_TRACE(block_rq_insert, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); +DEFINE_TRACE(block_rq_issue, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); +DEFINE_TRACE(block_rq_requeue, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); +DEFINE_TRACE(block_rq_complete, + TPPROTO(struct request_queue *q, struct request *rq), + TPARGS(q, rq)); +DEFINE_TRACE(block_bio_bounce, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); +DEFINE_TRACE(block_bio_complete, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); +DEFINE_TRACE(block_bio_backmerge, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); +DEFINE_TRACE(block_bio_frontmerge, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); +DEFINE_TRACE(block_bio_queue, + TPPROTO(struct request_queue *q, struct bio *bio), + TPARGS(q, bio)); +DEFINE_TRACE(block_getrq, + TPPROTO(struct request_queue *q, struct bio *bio, int rw), + TPARGS(q, bio, rw)); +DEFINE_TRACE(block_sleeprq, + TPPROTO(struct request_queue *q, struct bio *bio, int rw), + TPARGS(q, bio, rw)); +DEFINE_TRACE(block_plug, + TPPROTO(struct request_queue *q), + TPARGS(q)); +DEFINE_TRACE(block_unplug_timer, + TPPROTO(struct request_queue *q), + TPARGS(q)); +DEFINE_TRACE(block_unplug_io, + TPPROTO(struct request_queue *q), + TPARGS(q)); +DEFINE_TRACE(block_split, + TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), + TPARGS(q, bio, pdu)); +DEFINE_TRACE(block_remap, + TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from, sector_t to), + TPARGS(q, bio, dev, from, to)); + +#endif diff --git a/mm/bounce.c b/mm/bounce.c index 06722c4..bd1caaa 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #define POOL_SIZE 64 @@ -222,7 +223,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, if (!bio) return; - blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + trace_block_bio_bounce(q, *bio_orig); /* * at least one page was bounced, fill in possible non-highmem -- cgit v0.10.2 From 0bfc24559d7945506184d86739fe365a181f06b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 26 Nov 2008 11:59:56 +0100 Subject: blktrace: port to tracepoints, update Port to the new tracepoints API: split DEFINE_TRACE() and DECLARE_TRACE() sites. Spread them out to the usage sites, as suggested by Mathieu Desnoyers. Signed-off-by: Ingo Molnar Acked-by: Mathieu Desnoyers diff --git a/block/blk-core.c b/block/blk-core.c index 04267d6..0c06cf5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -32,6 +32,19 @@ #include "blk.h" +DEFINE_TRACE(block_plug); +DEFINE_TRACE(block_unplug_io); +DEFINE_TRACE(block_unplug_timer); +DEFINE_TRACE(block_getrq); +DEFINE_TRACE(block_sleeprq); +DEFINE_TRACE(block_rq_requeue); +DEFINE_TRACE(block_bio_backmerge); +DEFINE_TRACE(block_bio_frontmerge); +DEFINE_TRACE(block_bio_queue); +DEFINE_TRACE(block_rq_complete); +DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ +EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); + static int __make_request(struct request_queue *q, struct bio *bio); /* diff --git a/block/elevator.c b/block/elevator.c index 530fcfe..e5677fe 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -42,6 +42,8 @@ static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); +DEFINE_TRACE(block_rq_abort); + /* * Merge hash stuff. */ @@ -53,6 +55,9 @@ static const int elv_hash_shift = 6; #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) +DEFINE_TRACE(block_rq_insert); +DEFINE_TRACE(block_rq_issue); + /* * Query io scheduler to see if the current process issuing bio may be * merged with rq. diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d23fda1..343094c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -52,6 +52,8 @@ struct dm_target_io { union map_info info; }; +DEFINE_TRACE(block_bio_complete); + union map_info *dm_get_mapinfo(struct bio *bio) { if (bio && bio->bi_private) diff --git a/fs/bio.c b/fs/bio.c index 060859c..df99c88 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -29,6 +29,8 @@ #include #include /* for struct sg_iovec */ +DEFINE_TRACE(block_split); + static struct kmem_cache *bio_slab __read_mostly; static mempool_t *bio_split_pool __read_mostly; diff --git a/include/trace/block.h b/include/trace/block.h index 3cc2675..25c6a1f 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -4,57 +4,73 @@ #include #include -DEFINE_TRACE(block_rq_abort, +DECLARE_TRACE(block_rq_abort, TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); -DEFINE_TRACE(block_rq_insert, + TPARGS(q, rq)); + +DECLARE_TRACE(block_rq_insert, TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); -DEFINE_TRACE(block_rq_issue, + TPARGS(q, rq)); + +DECLARE_TRACE(block_rq_issue, TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); -DEFINE_TRACE(block_rq_requeue, + TPARGS(q, rq)); + +DECLARE_TRACE(block_rq_requeue, TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); -DEFINE_TRACE(block_rq_complete, + TPARGS(q, rq)); + +DECLARE_TRACE(block_rq_complete, TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); -DEFINE_TRACE(block_bio_bounce, + TPARGS(q, rq)); + +DECLARE_TRACE(block_bio_bounce, TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); -DEFINE_TRACE(block_bio_complete, + TPARGS(q, bio)); + +DECLARE_TRACE(block_bio_complete, TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); -DEFINE_TRACE(block_bio_backmerge, + TPARGS(q, bio)); + +DECLARE_TRACE(block_bio_backmerge, TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); -DEFINE_TRACE(block_bio_frontmerge, + TPARGS(q, bio)); + +DECLARE_TRACE(block_bio_frontmerge, TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); -DEFINE_TRACE(block_bio_queue, + TPARGS(q, bio)); + +DECLARE_TRACE(block_bio_queue, TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); -DEFINE_TRACE(block_getrq, + TPARGS(q, bio)); + +DECLARE_TRACE(block_getrq, TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); -DEFINE_TRACE(block_sleeprq, + TPARGS(q, bio, rw)); + +DECLARE_TRACE(block_sleeprq, TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); -DEFINE_TRACE(block_plug, + TPARGS(q, bio, rw)); + +DECLARE_TRACE(block_plug, TPPROTO(struct request_queue *q), - TPARGS(q)); -DEFINE_TRACE(block_unplug_timer, + TPARGS(q)); + +DECLARE_TRACE(block_unplug_timer, TPPROTO(struct request_queue *q), - TPARGS(q)); -DEFINE_TRACE(block_unplug_io, + TPARGS(q)); + +DECLARE_TRACE(block_unplug_io, TPPROTO(struct request_queue *q), - TPARGS(q)); -DEFINE_TRACE(block_split, + TPARGS(q)); + +DECLARE_TRACE(block_split, TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), - TPARGS(q, bio, pdu)); -DEFINE_TRACE(block_remap, + TPARGS(q, bio, pdu)); + +DECLARE_TRACE(block_remap, TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, sector_t from, sector_t to), - TPARGS(q, bio, dev, from, to)); + TPARGS(q, bio, dev, from, to)); #endif diff --git a/mm/bounce.c b/mm/bounce.c index bd1caaa..bf0cf7c 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -22,6 +22,8 @@ static mempool_t *page_pool, *isa_page_pool; +DEFINE_TRACE(block_bio_bounce); + #ifdef CONFIG_HIGHMEM static __init int init_emergency_pool(void) { -- cgit v0.10.2 From 83a8df618eb04bd2819a758f3b409b1449862434 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 27 Nov 2008 01:46:33 +0100 Subject: tracing/function-graph-tracer: enhancements for the trace output Impact: enhance the output of the graph-tracer This patch applies some ideas of Ingo Molnar and Steven Rostedt. * Output leaf functions in one line with parenthesis, semicolon and duration output. * Add a second column (after cpu) for an overhead sign. if duration > 100 us, "!" if duration > 10 us, "+" else " " * Print output in us with remaining nanosec: u.n * Print duration on the right end, following the indentation of the functions. Use also visual clues: "-" on entry call (no duration to output) and "+" on return (duration output). The name of the tracer has been fixed as well: function-branch becomes function_branch. Here is an example of the new output: CPU[000] dequeue_entity() { - CPU[000] update_curr() { - CPU[000] update_min_vruntime(); + 0.512 us CPU[000] } + 1.504 us CPU[000] clear_buddies(); + 0.481 us CPU[000] update_min_vruntime(); + 0.504 us CPU[000] } + 4.557 us CPU[000] hrtick_update() { - CPU[000] hrtick_start_fair(); + 0.489 us CPU[000] } + 1.443 us CPU[000] + } + 14.655 us CPU[000] + } + 15.678 us CPU[000] + } + 16.686 us CPU[000] msecs_to_jiffies(); + 0.481 us CPU[000] put_prev_task_fair(); + 0.504 us CPU[000] pick_next_task_fair(); + 0.482 us CPU[000] pick_next_task_rt(); + 0.504 us CPU[000] pick_next_task_fair(); + 0.481 us CPU[000] pick_next_task_idle(); + 0.489 us CPU[000] _spin_trylock(); + 0.655 us CPU[000] _spin_unlock(); + 0.609 us CPU[000] ------------8<---------- thread bash-2794 ------------8<---------- CPU[000] finish_task_switch() { - CPU[000] _spin_unlock_irq(); + 0.722 us CPU[000] } + 2.369 us CPU[000] ! } + 501972.605 us CPU[000] ! } + 501973.763 us CPU[000] copy_from_read_buf() { - CPU[000] _spin_lock_irqsave(); + 0.670 us CPU[000] _spin_unlock_irqrestore(); + 0.699 us CPU[000] copy_to_user() { - CPU[000] might_fault() { - CPU[000] __might_sleep(); + 0.503 us CPU[000] } + 1.632 us CPU[000] __copy_to_user_ll(); + 0.542 us CPU[000] } + 3.858 us CPU[000] tty_audit_add_data() { - CPU[000] _spin_lock_irq(); + 0.609 us CPU[000] _spin_unlock_irq(); + 0.624 us CPU[000] } + 3.196 us CPU[000] _spin_lock_irqsave(); + 0.624 us CPU[000] _spin_unlock_irqrestore(); + 0.625 us CPU[000] + } + 13.611 us CPU[000] copy_from_read_buf() { - CPU[000] _spin_lock_irqsave(); + 0.624 us CPU[000] _spin_unlock_irqrestore(); + 0.616 us CPU[000] } + 2.820 us CPU[000] Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d31d695..b958d60 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,6 +14,10 @@ #include "trace.h" #define TRACE_GRAPH_INDENT 2 +/* Spaces between function call and time duration */ +#define TRACE_GRAPH_TIMESPACE_ENTRY " " +/* Spaces between function call and closing braces */ +#define TRACE_GRAPH_TIMESPACE_RET " " #define TRACE_GRAPH_PRINT_OVERRUN 0x1 static struct tracer_opt trace_opts[] = { @@ -63,26 +67,130 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) last_pid[cpu] = pid; comm = trace_find_cmdline(pid); - return trace_seq_printf(s, "\nCPU[%03d]" + return trace_seq_printf(s, "\nCPU[%03d] " " ------------8<---------- thread %s-%d" " ------------8<----------\n\n", cpu, comm, pid); } +static bool +trace_branch_is_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *curr) +{ + struct ring_buffer_iter *ring_iter; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *next; + + ring_iter = iter->buffer_iter[iter->cpu]; + + if (!ring_iter) + return false; + + event = ring_buffer_iter_peek(iter->buffer_iter[iter->cpu], NULL); + + if (!event) + return false; + + next = ring_buffer_event_data(event); + + if (next->ent.type != TRACE_GRAPH_RET) + return false; + + if (curr->ent.pid != next->ent.pid || + curr->graph_ent.func != next->ret.func) + return false; + + return true; +} + + +static inline int +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + unsigned long nsecs_rem = do_div(duration, 1000); + return trace_seq_printf(s, "+ %llu.%lu us\n", duration, nsecs_rem); +} + +/* Signal a overhead of time execution to the output */ +static int +print_graph_overhead(unsigned long long duration, struct trace_seq *s) +{ + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + return trace_seq_printf(s, "! "); + + /* Duration exceeded 10 msecs */ + if (duration > 10000ULL) + return trace_seq_printf(s, "+ "); + + return trace_seq_printf(s, " "); +} + +/* Case of a leaf function on its call entry */ static enum print_line_t -print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, - struct trace_entry *ent, int cpu) +print_graph_entry_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, struct trace_seq *s) { + struct ftrace_graph_ret_entry *ret_entry; + struct ftrace_graph_ret *graph_ret; + struct ring_buffer_event *event; + struct ftrace_graph_ent *call; + unsigned long long duration; int i; int ret; - if (!verif_pid(s, ent->pid, cpu)) + event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + ret_entry = ring_buffer_event_data(event); + graph_ret = &ret_entry->ret; + call = &entry->graph_ent; + duration = graph_ret->rettime - graph_ret->calltime; + + /* Overhead */ + ret = print_graph_overhead(duration, s); + if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = seq_print_ip_sym(s, call->func, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "();"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + /* Duration */ + ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_duration(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, + struct trace_seq *s) +{ + int i; + int ret; + struct ftrace_graph_ent *call = &entry->graph_ent; + + /* No overhead */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); if (!ret) @@ -93,26 +201,62 @@ print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "() {\n"); + ret = trace_seq_printf(s, "() {"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + + /* No duration to print at this state */ + ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY "-\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter, int cpu) +{ + int ret; + struct trace_entry *ent = iter->ent; + + if (!verif_pid(s, ent->pid, cpu)) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "CPU[%03d] ", cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (trace_branch_is_leaf(iter, field)) + return print_graph_entry_leaf(iter, field, s); + else + return print_graph_entry_nested(field, s); + +} + +static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, struct trace_entry *ent, int cpu) { int i; int ret; + unsigned long long duration = trace->rettime - trace->calltime; + /* Pid */ if (!verif_pid(s, ent->pid, cpu)) return TRACE_TYPE_PARTIAL_LINE; + /* Cpu */ ret = trace_seq_printf(s, "CPU[%03d] ", cpu); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + /* Overhead */ + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); if (!ret) @@ -123,10 +267,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "%llu\n", trace->rettime - trace->calltime); + /* Duration */ + ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_RET); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_duration(duration, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + /* Overrun */ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { ret = trace_seq_printf(s, " (Overruns: %lu)\n", trace->overrun); @@ -146,7 +296,7 @@ print_graph_function(struct trace_iterator *iter) case TRACE_GRAPH_ENT: { struct ftrace_graph_ent_entry *field; trace_assign_type(field, entry); - return print_graph_entry(&field->graph_ent, s, entry, + return print_graph_entry(field, s, iter, iter->cpu); } case TRACE_GRAPH_RET: { @@ -160,7 +310,7 @@ print_graph_function(struct trace_iterator *iter) } static struct tracer graph_trace __read_mostly = { - .name = "function-graph", + .name = "function_graph", .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, -- cgit v0.10.2 From 1a056155edd458eb93ef383fa8e5741d7e7c6360 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Nov 2008 00:42:46 +0100 Subject: tracing/function-graph-tracer: adjustments of the trace informations Impact: increase the visual qualities of the call-graph-tracer output This patch applies various trace output formatting changes: - CPU is now a decimal number, followed by a parenthesis. - Overhead is now on the second column (gives a good visibility) - Cost is now on the third column, can't exceed 9999.99 us. It is followed by a virtual line based on a "|" character. - Functions calls are now the last column on the right. This way, we haven't dynamic column (which flow is harder to follow) on its right. - CPU and Overhead have their own option flag. They are default-on but you can disable them easily: echo nofuncgraph-cpu > trace_options echo nofuncgraph-overhead > trace_options TODO: _ Refactoring of the thread switch output. _ Give a default-off option to output the thread and its pid on each row. _ Provide headers _ .... Here is an example of the new trace style: 0) | mutex_unlock() { 0) 0.639 us | __mutex_unlock_slowpath(); 0) 1.607 us | } 0) | remove_wait_queue() { 0) 0.616 us | _spin_lock_irqsave(); 0) 0.616 us | _spin_unlock_irqrestore(); 0) 2.779 us | } 0) 0.495 us | n_tty_set_room(); 0) ! 9999.999 us | } 0) | tty_ldisc_deref() { 0) 0.615 us | _spin_lock_irqsave(); 0) 0.616 us | _spin_unlock_irqrestore(); 0) 2.793 us | } 0) | current_fs_time() { 0) 0.488 us | current_kernel_time(); 0) 0.495 us | timespec_trunc(); 0) 2.486 us | } 0) ! 9999.999 us | } 0) ! 9999.999 us | } 0) ! 9999.999 us | } 0) | sys_read() { 0) 0.796 us | fget_light(); 0) | vfs_read() { 0) | rw_verify_area() { 0) | security_file_permission() { 0) 0.488 us | cap_file_permission(); 0) 1.720 us | } 0) 3. 4 us | } 0) | tty_read() { 0) 0.488 us | tty_paranoia_check(); 0) | tty_ldisc_ref_wait() { 0) | tty_ldisc_try() { 0) 0.615 us | _spin_lock_irqsave(); 0) 0.615 us | _spin_unlock_irqrestore(); 0) 5.436 us | } 0) 6.427 us | } Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b958d60..596a3ee 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,20 +14,25 @@ #include "trace.h" #define TRACE_GRAPH_INDENT 2 -/* Spaces between function call and time duration */ -#define TRACE_GRAPH_TIMESPACE_ENTRY " " -/* Spaces between function call and closing braces */ -#define TRACE_GRAPH_TIMESPACE_RET " " +/* Flag options */ #define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 + static struct tracer_opt trace_opts[] = { - /* Display overruns or not */ - { TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display overruns ? */ + { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display CPU ? */ + { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, + /* Display Overhead ? */ + { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - .val = 0, /* Don't display overruns by default */ + /* Don't display overruns by default */ + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, .opts = trace_opts }; @@ -56,6 +61,36 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } +static inline int log10_cpu(int nb) +{ + if (nb / 100) + return 3; + if (nb / 10) + return 2; + return 1; +} + +static enum print_line_t +print_graph_cpu(struct trace_seq *s, int cpu) +{ + int i; + int ret; + int log10_this = log10_cpu(cpu); + int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + + + for (i = 0; i < log10_all - log10_this; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + ret = trace_seq_printf(s, "%d) ", cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + + /* If the pid changed since the last trace, output this event */ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) { @@ -67,8 +102,7 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) last_pid[cpu] = pid; comm = trace_find_cmdline(pid); - return trace_seq_printf(s, "\nCPU[%03d] " - " ------------8<---------- thread %s-%d" + return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" " ------------8<----------\n\n", cpu, comm, pid); } @@ -86,7 +120,7 @@ trace_branch_is_leaf(struct trace_iterator *iter, if (!ring_iter) return false; - event = ring_buffer_iter_peek(iter->buffer_iter[iter->cpu], NULL); + event = ring_buffer_iter_peek(ring_iter, NULL); if (!event) return false; @@ -108,7 +142,7 @@ static inline int print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); - return trace_seq_printf(s, "+ %llu.%lu us\n", duration, nsecs_rem); + return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); } /* Signal a overhead of time execution to the output */ @@ -136,8 +170,8 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ring_buffer_event *event; struct ftrace_graph_ent *call; unsigned long long duration; - int i; int ret; + int i; event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); ret_entry = ring_buffer_event_data(event); @@ -145,8 +179,19 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; + /* Must not exceed 8 characters: 9999.999 us */ + if (duration > 10000000ULL) + duration = 9999999ULL; + /* Overhead */ - ret = print_graph_overhead(duration, s); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -161,16 +206,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "();"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Duration */ - ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(duration, s); + ret = trace_seq_printf(s, "();\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -186,9 +222,14 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ent *call = &entry->graph_ent; /* No overhead */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No time */ + ret = trace_seq_printf(s, " | "); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -201,12 +242,7 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "() {"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* No duration to print at this state */ - ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_ENTRY "-\n"); + ret = trace_seq_printf(s, "() {\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -220,12 +256,16 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, int ret; struct trace_entry *ent = iter->ent; + /* Pid */ if (!verif_pid(s, ent->pid, cpu)) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "CPU[%03d] ", cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } if (trace_branch_is_leaf(iter, field)) return print_graph_entry_leaf(iter, field, s); @@ -242,17 +282,30 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; unsigned long long duration = trace->rettime - trace->calltime; + /* Must not exceed 8 characters: xxxx.yyy us */ + if (duration > 10000000ULL) + duration = 9999999ULL; + /* Pid */ if (!verif_pid(s, ent->pid, cpu)) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ - ret = trace_seq_printf(s, "CPU[%03d] ", cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Overhead */ - ret = print_graph_overhead(duration, s); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Duration */ + ret = print_graph_duration(duration, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -263,16 +316,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "} "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Duration */ - ret = trace_seq_printf(s, TRACE_GRAPH_TIMESPACE_RET); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(duration, s); + ret = trace_seq_printf(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v0.10.2 From d51090b34602a20984ab0312ef04e47069c0aec6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Nov 2008 09:55:16 +0100 Subject: tracing/function-graph-tracer: more output tweaks Impact: prettify the output some more Before: 0) | sys_read() { 0) 0.796 us | fget_light(); 0) | vfs_read() { 0) | rw_verify_area() { 0) | security_file_permission() { ------------8<---------- thread sshd-1755 ------------8<---------- After: 0) | sys_read() { 0) 0.796 us | fget_light(); 0) | vfs_read() { 0) | rw_verify_area() { 0) | security_file_permission() { ------------------------------------------ | 1) migration/0--1 => sshd-1755 ------------------------------------------ Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 596a3ee..894b50b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -79,6 +79,19 @@ print_graph_cpu(struct trace_seq *s, int cpu) int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); + /* + * Start with a space character - to make it stand out + * to the right a bit when trace output is pasted into + * email: + */ + ret = trace_seq_printf(s, " "); + + /* + * Tricky - we space the CPU field according to the max + * number of online CPUs. On a 2-cpu system it would take + * a maximum of 1 digit - on a 128 cpu system it would + * take up to 3 digits: + */ for (i = 0; i < log10_all - log10_this; i++) { ret = trace_seq_printf(s, " "); if (!ret) @@ -86,7 +99,8 @@ print_graph_cpu(struct trace_seq *s, int cpu) } ret = trace_seq_printf(s, "%d) ", cpu); if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } @@ -94,17 +108,34 @@ print_graph_cpu(struct trace_seq *s, int cpu) /* If the pid changed since the last trace, output this event */ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) { - char *comm; + char *comm, *prev_comm; + pid_t prev_pid; + int ret; if (last_pid[cpu] != -1 && last_pid[cpu] == pid) return 1; + prev_pid = last_pid[cpu]; last_pid[cpu] = pid; + comm = trace_find_cmdline(pid); + prev_comm = trace_find_cmdline(prev_pid); - return trace_seq_printf(s, "\n------------8<---------- thread %s-%d" - " ------------8<----------\n\n", - cpu, comm, pid); +/* + * Context-switch trace line: + + ------------------------------------------ + | 1) migration/0--1 => sshd-1755 + ------------------------------------------ + + */ + ret = trace_seq_printf(s, + " ------------------------------------------\n"); + ret += trace_seq_printf(s, " | %d) %s-%d => %s-%d\n", + cpu, prev_comm, prev_pid, comm, pid); + ret += trace_seq_printf(s, + " ------------------------------------------\n\n"); + return ret; } static bool @@ -142,7 +173,7 @@ static inline int print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); - return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); + return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); } /* Signal a overhead of time execution to the output */ @@ -229,7 +260,7 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, } /* No time */ - ret = trace_seq_printf(s, " | "); + ret = trace_seq_printf(s, " | "); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { -- cgit v0.10.2 From c7425acb42fff1e723b05fbf4ea11e9a455d95dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Fri, 28 Nov 2008 11:17:56 +0200 Subject: tracing, alpha: fix build: add missing #ifdef CONFIG_STACKTRACE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are architectures that still have no stacktrace support. Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5811e0a..91887a2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -983,6 +983,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, int pc) { +#ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -1008,6 +1009,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, save_stack_trace_user(&trace); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +#endif } void __trace_userstack(struct trace_array *tr, -- cgit v0.10.2 From 50cdaf08a8ec1d7f43987705da7aff7cf949708f Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Fri, 28 Nov 2008 12:13:21 +0800 Subject: ftrace: improve seq_operation of ftrace Impact: make ftrace position computing more sane First remove useless ->pos field. Then we needn't check seq_printf in .show like other place. Signed-off-by: Liming Wang Reviewed-by: Bruce Ashfield Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cbf8b09..08b536a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -786,7 +786,6 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - loff_t pos; struct ftrace_page *pg; unsigned idx; unsigned flags; @@ -811,6 +810,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; + } else { + iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -833,8 +834,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } spin_unlock(&ftrace_lock); - iter->pos = *pos; - return rec; } @@ -842,13 +841,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; - loff_t l = -1; - if (*pos > iter->pos) - *pos = iter->pos; + if (*pos > 0) { + if (iter->idx < 0) + return p; + (*pos)--; + iter->idx--; + } - l = *pos; - p = t_next(m, p, &l); + p = t_next(m, p, pos); return p; } @@ -859,21 +860,15 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { - struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; - int ret = 0; if (!rec) return 0; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - ret = seq_printf(m, "%s\n", str); - if (ret < 0) { - iter->pos--; - iter->idx--; - } + seq_printf(m, "%s\n", str); return 0; } @@ -899,7 +894,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; - iter->pos = 0; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -986,7 +980,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->pos = 0; iter->flags = enable ? FTRACE_ITER_FILTER : FTRACE_ITER_NOTRACE; -- cgit v0.10.2 From c072c24975ec4f0ccfcb6f5c8a8040b6eb75ef8f Mon Sep 17 00:00:00 2001 From: walimis Date: Fri, 28 Nov 2008 12:21:19 +0800 Subject: ftrace: improve documentation Impact: extend documentation with notice of using wild cards correctly We know that we can use wild cards to set set_ftrace_filter, but there's problem when using them naively such as: echo h* > /debug/tracing/set_ftrace_filter If there are files named with "h" prefix in current directory, echo "h*" will echo these filenames to set_ftrace_filter, not the intended "h*". For example: $ cat /debug/tracing/available_filter_functions |grep ^hr |wc -l 23 $ ls $ touch hraa hrdd $ ls hraa hrdd $ echo hr* > /debug/tracing/set_ftrace_filter $ cat /debug/tracing/set_ftrace_filter No output in /debug/tracing/set_ftrace_filter! If we use '' to escape wild cards, it works: $ ls hraa hrdd $ echo "hr*" > /debug/tracing/set_ftrace_filter $ cat /debug/tracing/set_ftrace_filter |wc -l 23 This problem can lead to unexpected result if current directory has a lot of files. Signed-off-by: walimis Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index de05042..803b131 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -1251,7 +1251,11 @@ These are the only wild cards which are supported. * will not work. - # echo hrtimer_* > /debug/tracing/set_ftrace_filter +Note: It is better to use quotes to enclose the wild cards, otherwise + the shell may expand the parameters into names of files in the local + directory. + + # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter Produces: @@ -1306,7 +1310,7 @@ Again, now we want to append. # echo sys_nanosleep > /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter sys_nanosleep - # echo hrtimer_* >> /debug/tracing/set_ftrace_filter + # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter hrtimer_run_queues hrtimer_run_pending -- cgit v0.10.2 From c7b0d17366d6e04a11470fc8d85f9fbac02671b9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Nov 2008 13:18:55 -0800 Subject: powerpc: ftrace, do nothing in mcount call for dyn ftrace Impact: quicken mcount calls that are not replaced by dyn ftrace Dynamic ftrace no longer does on the fly recording of mcount locations. The mcount locations are now found at compile time. The mcount function no longer needs to store registers and call a stub function. It can now just simply return. Since there are some functions that do not get converted to a nop (.init sections and other code that may disappear), this patch should help speed up that code. Also, the stub for mcount on PowerPC 32 can not be a simple branch link register like it is on PowerPC 64. According to the ABI specification: "The _mcount routine is required to restore the link register from the stack so that the profiling code can be inserted transparently, whether or not the profiled function saves the link register itself." This means that we must restore the link register that was used to make the call to mcount. The minimal mcount function for PPC32 ends up being: mcount: mflr r0 mtctr r0 lwz r0, 4(r1) mtlr r0 bctr Where we move the link register used to call mcount into the ctr register, and then restore the link register from the stack. Then we use the ctr register to jump back to the mcount caller. The r0 register is free for us to use. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 7ecc0d1..6f7eb7e 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1162,39 +1162,17 @@ machine_check_in_rtas: #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) - stwu r1,-48(r1) - stw r3, 12(r1) - stw r4, 16(r1) - stw r5, 20(r1) - stw r6, 24(r1) - mflr r3 - stw r7, 28(r1) - mfcr r5 - stw r8, 32(r1) - stw r9, 36(r1) - stw r10,40(r1) - stw r3, 44(r1) - stw r5, 8(r1) - subi r3, r3, MCOUNT_INSN_SIZE - .globl mcount_call -mcount_call: - bl ftrace_stub - nop - lwz r6, 8(r1) - lwz r0, 44(r1) - lwz r3, 12(r1) + /* + * It is required that _mcount on PPC32 must preserve the + * link register. But we have r0 to play with. We use r0 + * to push the return address back to the caller of mcount + * into the ctr register, restore the link register and + * then jump back using the ctr register. + */ + mflr r0 mtctr r0 - lwz r4, 16(r1) - mtcr r6 - lwz r5, 20(r1) - lwz r6, 24(r1) - lwz r0, 52(r1) - lwz r7, 28(r1) - lwz r8, 32(r1) + lwz r0, 4(r1) mtlr r0 - lwz r9, 36(r1) - lwz r10,40(r1) - addi r1, r1, 48 bctr _GLOBAL(ftrace_caller) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e6d5284..b00982e 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -888,18 +888,6 @@ _GLOBAL(enter_prom) #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) - /* Taken from output of objdump from lib64/glibc */ - mflr r3 - stdu r1, -112(r1) - std r3, 128(r1) - subi r3, r3, MCOUNT_INSN_SIZE - .globl mcount_call -mcount_call: - bl ftrace_stub - nop - ld r0, 128(r1) - mtlr r0 - addi r1, r1, 112 blr _GLOBAL(ftrace_caller) -- cgit v0.10.2 From d9af12b72bfe2a4efc1d347e0ac1c669b85dcea9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2008 06:39:18 -0800 Subject: powerpc: ftrace, fix cast aliasing and add code verification Impact: clean up and robustness addition This patch addresses the comments made by Paul Mackerras. It removes the type casting between unsigned int and unsigned char pointers, and replaces them with a use of all unsigned int. Verification that the jump is indeed made to a trampoline has also been added. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 3271cd6..ea454a0 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -162,26 +162,25 @@ static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned char replaced[MCOUNT_INSN_SIZE * 2]; - unsigned int *op = (unsigned *)&replaced; - unsigned char jmp[8]; - unsigned long *ptr = (unsigned long *)&jmp; + unsigned int op; + unsigned int jmp[5]; + unsigned long ptr; unsigned long ip = rec->ip; unsigned long tramp; int offset; /* read where this goes */ - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (probe_kernel_read(&op, (void *)ip, sizeof(int))) return -EFAULT; /* Make sure that that this is still a 24bit jump */ - if (!is_bl_op(*op)) { - printk(KERN_ERR "Not expected bl: opcode is %x\n", *op); + if (!is_bl_op(op)) { + printk(KERN_ERR "Not expected bl: opcode is %x\n", op); return -EINVAL; } /* lets find where the pointer goes */ - tramp = find_bl_target(ip, *op); + tramp = find_bl_target(ip, op); /* * On PPC64 the trampoline looks like: @@ -200,19 +199,25 @@ __ftrace_make_nop(struct module *mod, DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc); /* Find where the trampoline jumps to */ - if (probe_kernel_read(jmp, (void *)tramp, 8)) { + if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { printk(KERN_ERR "Failed to read %lx\n", tramp); return -EFAULT; } - DEBUGP(" %08x %08x", - (unsigned)(*ptr >> 32), - (unsigned)*ptr); + DEBUGP(" %08x %08x", jmp[0], jmp[1]); + + /* verify that this is what we expect it to be */ + if (((jmp[0] & 0xffff0000) != 0x3d820000) || + ((jmp[1] & 0xffff0000) != 0x398c0000) || + (jmp[2] != 0xf8410028) || + (jmp[3] != 0xe96c0020) || + (jmp[4] != 0xe84c0028)) { + printk(KERN_ERR "Not a trampoline\n"); + return -EINVAL; + } - offset = (unsigned)jmp[2] << 24 | - (unsigned)jmp[3] << 16 | - (unsigned)jmp[6] << 8 | - (unsigned)jmp[7]; + offset = (unsigned)((unsigned short)jmp[0]) << 16 | + (unsigned)((unsigned short)jmp[1]); DEBUGP(" %x ", offset); @@ -225,13 +230,13 @@ __ftrace_make_nop(struct module *mod, return -EFAULT; } - DEBUGP(" %08x %08x\n", - (unsigned)(*ptr >> 32), - (unsigned)*ptr); + DEBUGP(" %08x %08x\n", jmp[0], jmp[1]); + + ptr = ((unsigned long)jmp[0] << 32) + jmp[1]; /* This should match what was called */ - if (*ptr != GET_ADDR(addr)) { - printk(KERN_ERR "addr does not match %lx\n", *ptr); + if (ptr != GET_ADDR(addr)) { + printk(KERN_ERR "addr does not match %lx\n", ptr); return -EINVAL; } @@ -240,11 +245,11 @@ __ftrace_make_nop(struct module *mod, * 0xe8, 0x41, 0x00, 0x28 ld r2,40(r1) * This needs to be turned to a nop too. */ - if (probe_kernel_read(replaced, (void *)(ip+4), MCOUNT_INSN_SIZE)) + if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) return -EFAULT; - if (*op != 0xe8410028) { - printk(KERN_ERR "Next line is not ld! (%08x)\n", *op); + if (op != 0xe8410028) { + printk(KERN_ERR "Next line is not ld! (%08x)\n", op); return -EINVAL; } @@ -261,9 +266,9 @@ __ftrace_make_nop(struct module *mod, * ld r2,40(r1) * 1: */ - op[0] = 0x48000008; /* b +8 */ + op = 0x48000008; /* b +8 */ - if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) + if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; return 0; @@ -274,46 +279,52 @@ static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned char replaced[MCOUNT_INSN_SIZE]; - unsigned int *op = (unsigned *)&replaced; - unsigned char jmp[8]; - unsigned int *ptr = (unsigned int *)&jmp; + unsigned int op; + unsigned int jmp[4]; unsigned long ip = rec->ip; unsigned long tramp; - int offset; - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) return -EFAULT; /* Make sure that that this is still a 24bit jump */ - if (!is_bl_op(*op)) { - printk(KERN_ERR "Not expected bl: opcode is %x\n", *op); + if (!is_bl_op(op)) { + printk(KERN_ERR "Not expected bl: opcode is %x\n", op); return -EINVAL; } /* lets find where the pointer goes */ - tramp = find_bl_target(ip, *op); + tramp = find_bl_target(ip, op); /* * On PPC32 the trampoline looks like: - * lis r11,sym@ha - * addi r11,r11,sym@l - * mtctr r11 - * bctr + * 0x3d, 0x60, 0x00, 0x00 lis r11,sym@ha + * 0x39, 0x6b, 0x00, 0x00 addi r11,r11,sym@l + * 0x7d, 0x69, 0x03, 0xa6 mtctr r11 + * 0x4e, 0x80, 0x04, 0x20 bctr */ DEBUGP("ip:%lx jumps to %lx", ip, tramp); /* Find where the trampoline jumps to */ - if (probe_kernel_read(jmp, (void *)tramp, 8)) { + if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { printk(KERN_ERR "Failed to read %lx\n", tramp); return -EFAULT; } - DEBUGP(" %08x %08x ", ptr[0], ptr[1]); + DEBUGP(" %08x %08x ", jmp[0], jmp[1]); + + /* verify that this is what we expect it to be */ + if (((jmp[0] & 0xffff0000) != 0x3d600000) || + ((jmp[1] & 0xffff0000) != 0x396b0000) || + (jmp[2] != 0x7d6903a6) || + (jmp[3] != 0x4e800420)) { + printk(KERN_ERR "Not a trampoline\n"); + return -EINVAL; + } - tramp = (ptr[1] & 0xffff) | - ((ptr[0] & 0xffff) << 16); + tramp = (jmp[1] & 0xffff) | + ((jmp[0] & 0xffff) << 16); if (tramp & 0x8000) tramp -= 0x10000; @@ -326,9 +337,9 @@ __ftrace_make_nop(struct module *mod, return -EINVAL; } - op[0] = PPC_NOP_INSTR; + op = PPC_NOP_INSTR; - if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) + if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; return 0; @@ -384,13 +395,12 @@ int ftrace_make_nop(struct module *mod, static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned char replaced[MCOUNT_INSN_SIZE * 2]; - unsigned int *op = (unsigned *)&replaced; + unsigned int op[2]; unsigned long ip = rec->ip; unsigned long offset; /* read where this goes */ - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE * 2)) + if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2)) return -EFAULT; /* @@ -425,7 +435,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) DEBUGP("write to %lx\n", rec->ip); - if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE * 2)) + if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2)) return -EPERM; return 0; @@ -434,18 +444,17 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned char replaced[MCOUNT_INSN_SIZE]; - unsigned int *op = (unsigned *)&replaced; + unsigned int op; unsigned long ip = rec->ip; unsigned long offset; /* read where this goes */ - if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) return -EFAULT; /* It should be pointing to a nop */ - if (op[0] != PPC_NOP_INSTR) { - printk(KERN_ERR "Expected NOP but have %x\n", op[0]); + if (op != PPC_NOP_INSTR) { + printk(KERN_ERR "Expected NOP but have %x\n", op); return -EINVAL; } @@ -465,11 +474,11 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) } /* Set to "bl addr" */ - op[0] = branch_offset(offset); + op = branch_offset(offset); DEBUGP("write to %lx\n", rec->ip); - if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE)) + if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; return 0; -- cgit v0.10.2 From ec682cef2d2c1a25a198d32a87fe2649da671d1e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2008 10:22:48 -0800 Subject: powerpc: ftrace, added missing icache flush Impact: fix to PowerPC code modification After modifying code it is essential to flush the icache. This patch adds the missing flush. Reported-by: Paul Mackerras Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index ea454a0..a4640e4 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -271,6 +271,9 @@ __ftrace_make_nop(struct module *mod, if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; + + flush_icache_range(ip, ip + 8); + return 0; } @@ -342,6 +345,8 @@ __ftrace_make_nop(struct module *mod, if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; + flush_icache_range(ip, ip + 8); + return 0; } #endif /* PPC64 */ @@ -438,6 +443,8 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2)) return -EPERM; + flush_icache_range(ip, ip + 8); + return 0; } #else @@ -481,6 +488,8 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) return -EPERM; + flush_icache_range(ip, ip + 8); + return 0; } #endif /* CONFIG_PPC64 */ -- cgit v0.10.2 From 0029ff87529dff01a4b9c5bf380a0caacb5f7418 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 25 Nov 2008 14:06:19 -0800 Subject: powerpc: ftrace, use create_branch Impact: clean up Paul Mackerras pointed out that the code to determine if the branch can reach the destination is incorrect. Michael Ellerman suggested to pull out the code from create_branch and use that. Simply using create_branch is probably the best. Reported-by: Michael Ellerman Reported-by: Paul Mackerras Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index a4640e4..5355244 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -114,19 +114,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, */ static int test_24bit_addr(unsigned long ip, unsigned long addr) { - long diff; - /* - * Can we get to addr from ip in 24 bits? - * (26 really, since we mulitply by 4 for 4 byte alignment) - */ - diff = addr - ip; - - /* - * Return true if diff is less than 1 << 25 - * and greater than -1 << 26. - */ - return (diff < (1 << 25)) && (diff > (-1 << 26)); + /* use the create_branch to verify that this offset can be branched */ + return create_branch((unsigned int *)ip, addr, 0); } static int is_bl_op(unsigned int op) @@ -134,11 +124,6 @@ static int is_bl_op(unsigned int op) return (op & 0xfc000003) == 0x48000001; } -static int test_offset(unsigned long offset) -{ - return (offset + 0x2000000 > 0x3ffffff) || ((offset & 3) != 0); -} - static unsigned long find_bl_target(unsigned long ip, unsigned int op) { static int offset; @@ -151,12 +136,6 @@ static unsigned long find_bl_target(unsigned long ip, unsigned int op) return ip + (long)offset; } -static unsigned int branch_offset(unsigned long offset) -{ - /* return "bl ip+offset" */ - return 0x48000001 | (offset & 0x03fffffc); -} - #ifdef CONFIG_PPC64 static int __ftrace_make_nop(struct module *mod, @@ -402,7 +381,6 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned int op[2]; unsigned long ip = rec->ip; - unsigned long offset; /* read where this goes */ if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2)) @@ -424,17 +402,14 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return -EINVAL; } - /* now calculate a jump to the ftrace caller trampoline */ - offset = rec->arch.mod->arch.tramp - ip; - - if (test_offset(offset)) { - printk(KERN_ERR "REL24 %li out of range!\n", - (long int)offset); + /* create the branch to the trampoline */ + op[0] = create_branch((unsigned int *)ip, + rec->arch.mod->arch.tramp, BRANCH_SET_LINK); + if (!op[0]) { + printk(KERN_ERR "REL24 out of range!\n"); return -EINVAL; } - /* Set to "bl addr" */ - op[0] = branch_offset(offset); /* ld r2,40(r1) */ op[1] = 0xe8410028; @@ -453,7 +428,6 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned int op; unsigned long ip = rec->ip; - unsigned long offset; /* read where this goes */ if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) @@ -471,18 +445,14 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return -EINVAL; } - /* now calculate a jump to the ftrace caller trampoline */ - offset = rec->arch.mod->arch.tramp - ip; - - if (test_offset(offset)) { - printk(KERN_ERR "REL24 %li out of range!\n", - (long int)offset); + /* create the branch to the trampoline */ + op = create_branch((unsigned int *)ip, + rec->arch.mod->arch.tramp, BRANCH_SET_LINK); + if (!op) { + printk(KERN_ERR "REL24 out of range!\n"); return -EINVAL; } - /* Set to "bl addr" */ - op = branch_offset(offset); - DEBUGP("write to %lx\n", rec->ip); if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE)) -- cgit v0.10.2 From f1eecf0e4f0796911cc076f38fcf05fea0b353d5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 15:54:46 -0500 Subject: powerpc/ppc32: static ftrace fixes for PPC32 Impact: fix for PowerPC 32 code There were some early init code that was not safe for static ftrace to boot on my PowerBook. This code must only use relative addressing, and static mcount performs a compare of the ftrace_trace_function pointer, and gets that with an absolute address. In the early init boot up code, this will cause a fault. This patch removes tracing from the files containing the offending functions. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 92673b4..d17edb4 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -17,6 +17,7 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog +CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog ifdef CONFIG_DYNAMIC_FTRACE # dynamic ftrace setup. diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index d69912c..8db3527 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -6,6 +6,9 @@ ifeq ($(CONFIG_PPC64),y) EXTRA_CFLAGS += -mno-minimal-toc endif +CFLAGS_REMOVE_code-patching.o = -pg +CFLAGS_REMOVE_feature-fixups.o = -pg + obj-y := string.o alloc.o \ checksum_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o -- cgit v0.10.2 From a838c2ec6ea1f18431da74dfe4978c57355b95f3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 27 Nov 2008 16:14:44 +0800 Subject: markers: comment marker_synchronize_unregister() on data dependency Add document and comments on marker_synchronize_unregister(): it should be called before freeing resources that the probes depend on. Based on comments from Lai Jiangshan and Mathieu Desnoyers. Signed-off-by: Wu Fengguang Reviewed-by: Mathieu Desnoyers Reviewed-by: Lai Jiangshan Signed-off-by: Ingo Molnar diff --git a/Documentation/markers.txt b/Documentation/markers.txt index 6d275e4..d2b3d0e 100644 --- a/Documentation/markers.txt +++ b/Documentation/markers.txt @@ -51,11 +51,16 @@ to call) for the specific marker through marker_probe_register() and can be activated by calling marker_arm(). Marker deactivation can be done by calling marker_disarm() as many times as marker_arm() has been called. Removing a probe is done through marker_probe_unregister(); it will disarm the probe. -marker_synchronize_unregister() must be called before the end of the module exit -function to make sure there is no caller left using the probe. This, and the -fact that preemption is disabled around the probe call, make sure that probe -removal and module unload are safe. See the "Probe example" section below for a -sample probe module. + +marker_synchronize_unregister() must be called between probe unregistration and +the first occurrence of +- the end of module exit function, + to make sure there is no caller left using the probe; +- the free of any resource used by the probes, + to make sure the probes wont be accessing invalid data. +This, and the fact that preemption is disabled around the probe call, make sure +that probe removal and module unload are safe. See the "Probe example" section +below for a sample probe module. The marker mechanism supports inserting multiple instances of the same marker. Markers can be put in inline functions, inlined static functions, and diff --git a/include/linux/marker.h b/include/linux/marker.h index 34c14bc..b85e74c 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -211,8 +211,10 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe, /* * marker_synchronize_unregister must be called between the last marker probe - * unregistration and the end of module exit to make sure there is no caller - * executing a probe when it is freed. + * unregistration and the first one of + * - the end of module exit function + * - the free of any resource used by the probes + * to ensure the code and data are valid for any possibly running probes. */ #define marker_synchronize_unregister() synchronize_sched() -- cgit v0.10.2 From 604094f4615180f71da799e7e5b191f5c2a42a28 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Nov 2008 18:03:22 +0100 Subject: vfs, seqfile: export mangle_path() generally mangle_path() is trivial enough to make export restrictions on it pointless - so change the export from EXPORT_SYMBOL_GPL to EXPORT_SYMBOL. Signed-off-by: Ingo Molnar Acked-by: Al Viro diff --git a/fs/seq_file.c b/fs/seq_file.c index f03220d7..16c2115 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -387,7 +387,7 @@ char *mangle_path(char *s, char *p, char *esc) } return NULL; } -EXPORT_SYMBOL_GPL(mangle_path); +EXPORT_SYMBOL(mangle_path); /* * return the absolute path of 'dentry' residing in mount 'mnt'. -- cgit v0.10.2 From 65c6dc6adbe7ee0acf207445243400a68c77af15 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 29 Nov 2008 04:12:46 +0100 Subject: tracing/branch-tracer: include missing irqflags.h Impact: fix build error on branch tracer This should fix a build error reported on alpha in linux-next: CC kernel/trace/trace_branch.o kernel/trace/trace_branch.c: In function 'probe_likely_condition': kernel/trace/trace_branch.c:44: error: implicit declaration of function 'raw_local_irq_save' kernel/trace/trace_branch.c:76: error: implicit declaration of function 'raw_local_irq_restore' Unfortunately, I can't test it since I don't have any Alpha build environment. Reported-by: Alexey Dobriyan Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 877ee88..bc97275 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include -- cgit v0.10.2 From f08340c5d68ab621f377c108637e2d8e95b3e5d4 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Sat, 29 Nov 2008 15:43:32 +0530 Subject: tracepoints: Documentation TPPROTO misspelt in Documentation/tracepoints.txt Impact: fix typo in documentation TPPROTO is misspelt in Documentation/tracepoints.txt Kept me wondering what was wrong, when I was trying to add a new tracepoint subsystem. Signed-off-by: Nikanth Karthikesan Signed-off-by: Ingo Molnar diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 2d42241..6f0a044 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -45,7 +45,7 @@ In include/trace/subsys.h : #include DECLARE_TRACE(subsys_eventname, - TPPTOTO(int firstarg, struct task_struct *p), + TPPROTO(int firstarg, struct task_struct *p), TPARGS(firstarg, p)); In subsys/file.c (where the tracing statement must be added) : @@ -66,7 +66,7 @@ Where : - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the +- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the function called by this tracepoint. - TPARGS(firstarg, p) are the parameters names, same as found in the -- cgit v0.10.2 From 66eafebc1086014709dc38f52ddcb3d67d9b346c Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Tue, 2 Dec 2008 10:33:08 +0800 Subject: function trace: fix a bug of single thread function trace Impact: fix "no output from tracer" bug caused by ftrace_update_pid_func() When disabling single thread function trace using "echo -1 > set_ftrace_pid", the normal function trace has to restore to original function, otherwise the normal function trace will not work well. Without this commit, something like below: $ ps |grep 850 850 root 2556 S -/bin/sh $ echo 850 > /debug/tracing/set_ftrace_pid $ echo function > /debug/tracing/current_tracer $ echo 1 > /debug/tracing/tracing_enabled $ sleep 1 $ echo 0 > /debug/tracing/tracing_enabled $ cat /debug/tracing/trace_pipe |wc -l 59704 $ echo -1 > /debug/tracing/set_ftrace_pid $ echo 1 > /debug/tracing/tracing_enabled $ sleep 1 $ echo 0 > /debug/tracing/tracing_enabled $ more /debug/tracing/trace_pipe <====== nothing output now! it should output trace record. Signed-off-by: Liming Wang Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08b536a..6d89ab4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -243,10 +243,8 @@ static void ftrace_update_pid_func(void) set_ftrace_pid_function(func); func = ftrace_pid_func; } else { - if (func != ftrace_pid_func) - goto out; - - set_ftrace_pid_function(func); + if (func == ftrace_pid_func) + func = ftrace_pid_function; } #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -- cgit v0.10.2 From 48d68b20d00865035b8b65e69af343d0f53fac9d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Dec 2008 00:20:39 +0100 Subject: tracing/function-graph-tracer: support for x86-64 Impact: extend and enable the function graph tracer to 64-bit x86 This patch implements the support for function graph tracer under x86-64. Both static and dynamic tracing are supported. This causes some small CPP conditional asm on arch/x86/kernel/ftrace.c I wanted to use probe_kernel_read/write to make the return address saving/patching code more generic but it causes tracing recursion. That would be perhaps useful to implement a notrace version of these function for other archs ports. Note that arch/x86/process_64.c is not traced, as in X86-32. I first thought __switch_to() was responsible of crashes during tracing because I believed current task were changed inside but that's actually not the case (actually yes, but not the "current" pointer). So I will have to investigate to find the functions that harm here, to enable tracing of the other functions inside (but there is no issue at this time, while process_64.c stays out of -pg flags). A little possible race condition is fixed inside this patch too. When the tracer allocate a return stack dynamically, the current depth is not initialized before but after. An interrupt could occur at this time and, after seeing that the return stack is allocated, the tracer could try to trace it with a random uninitialized depth. It's a prevention, even if I hadn't problems with it. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tim Bird Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0842b11..45c86fb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,7 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_GRAPH_TRACER if X86_32 + select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 64939a0..d274425 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,6 +17,7 @@ endif ifdef CONFIG_FUNCTION_GRAPH_TRACER # Don't trace __switch_to() but let it for function tracer CFLAGS_REMOVE_process_32.o = -pg +CFLAGS_REMOVE_process_64.o = -pg endif # diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 08aa6b1..2aa0526 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -98,6 +98,12 @@ ftrace_call: movq (%rsp), %rax addq $0x38, %rsp +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + .globl ftrace_stub ftrace_stub: retq @@ -110,6 +116,12 @@ ENTRY(mcount) cmpq $ftrace_stub, ftrace_trace_function jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller +#endif + .globl ftrace_stub ftrace_stub: retq @@ -145,6 +157,68 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + + call prepare_ftrace_return + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: + subq $80, %rsp + + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + movq %r11, 64(%rsp) + + call ftrace_return_to_handler + + movq %rax, 72(%rsp) + movq 64(%rsp), %r11 + movq 56(%rsp), %r10 + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $72, %rsp + retq +#endif + + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7ef914e..5883247 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -467,8 +467,13 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. */ asm volatile( +#ifdef CONFIG_X86_64 + "1: movq (%[parent_old]), %[old]\n" + "2: movq %[return_hooker], (%[parent_replaced])\n" +#else "1: movl (%[parent_old]), %[old]\n" "2: movl %[return_hooker], (%[parent_replaced])\n" +#endif " movl $0, %[faulted]\n" ".section .fixup, \"ax\"\n" @@ -476,8 +481,13 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ".previous\n" ".section __ex_table, \"a\"\n" +#ifdef CONFIG_X86_64 + " .quad 1b, 3b\n" + " .quad 2b, 3b\n" +#else " .long 1b, 3b\n" " .long 2b, 3b\n" +#endif ".previous\n" : [parent_replaced] "=r" (parent), [old] "=r" (old), @@ -509,5 +519,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ftrace_graph_entry(&trace); } - #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08b536a..f724996 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1673,8 +1673,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } if (t->ret_stack == NULL) { - t->ret_stack = ret_stack_list[start++]; t->curr_ret_stack = -1; + /* Make sure IRQs see the -1 first: */ + barrier(); + t->ret_stack = ret_stack_list[start++]; atomic_set(&t->trace_overrun, 0); } } while_each_thread(g, t); -- cgit v0.10.2 From a5e25883a445dce94a087ca479b21a5959cd5c18 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:05 -0500 Subject: ftrace: replace raw_local_irq_save with local_irq_save Impact: fix for lockdep and ftrace The raw_local_irq_save/restore confuses lockdep. This patch converts them to the local_irq_save/restore variants. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 46a4041..74b1878 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -25,6 +25,7 @@ * Thanks to Arjan van de Ven for coming up with the initial idea of * mapping lock dependencies runtime. */ +#define DISABLE_BRANCH_PROFILING #include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 91887a2..380de63 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1209,7 +1209,7 @@ void trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -1218,7 +1218,7 @@ void trace_graph_entry(struct ftrace_graph_ent *trace) __trace_graph_entry(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } void trace_graph_return(struct ftrace_graph_ret *trace) @@ -1230,7 +1230,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) int cpu; int pc; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -1239,7 +1239,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) __trace_graph_return(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -2645,7 +2645,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - raw_local_irq_disable(); + local_irq_disable(); __raw_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { /* @@ -2662,7 +2662,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, } } __raw_spin_unlock(&ftrace_max_lock); - raw_local_irq_enable(); + local_irq_enable(); tracing_cpumask = tracing_cpumask_new; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index bc97275..6c00feb 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -42,7 +42,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (unlikely(!tr)) return; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; @@ -74,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) out: atomic_dec(&tr->data[cpu]->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static inline diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index fde3be1..06a1611 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -48,7 +48,7 @@ static inline void check_stack(void) if (!object_is_on_stack(&this_size)) return; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); /* a race could have already updated it */ @@ -96,7 +96,7 @@ static inline void check_stack(void) out: __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static void @@ -162,11 +162,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - raw_local_irq_save(flags); + local_irq_save(flags); __raw_spin_lock(&max_stack_lock); *ptr = val; __raw_spin_unlock(&max_stack_lock); - raw_local_irq_restore(flags); + local_irq_restore(flags); return count; } -- cgit v0.10.2 From abc9b56d66fbd4d93302ef4bf6fa726e1b8255f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:06 -0500 Subject: ring-buffer: move some metadata into buffer page Impact: get ready for splice changes This patch moves the commit and timestamp into the beginning of each data page of the buffer. This change will allow the page to be moved to another location (disk, network, etc) and still have information in the page to be able to read it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e206951..8619c53 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -195,20 +195,24 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) -/* - * This hack stolen from mm/slob.c. - * We can store per page timing information in the page frame of the page. - * Thanks to Peter Zijlstra for suggesting this idea. - */ -struct buffer_page { +struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t write; /* index for next write */ local_t commit; /* write commited index */ + unsigned char data[]; /* data of buffer page */ +}; + +struct buffer_page { + local_t write; /* index for next write */ unsigned read; /* index for next read */ struct list_head list; /* list of free pages */ - void *page; /* Actual data page */ + struct buffer_data_page *page; /* Actual data page */ }; +static void rb_init_page(struct buffer_data_page *page) +{ + local_set(&page->commit, 0); +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -230,7 +234,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE PAGE_SIZE +#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) /* * head_page == tail_page && head == tail then buffer is empty. @@ -333,6 +337,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (!addr) goto free_pages; page->page = (void *)addr; + rb_init_page(page->page); } list_splice(&pages, head); @@ -378,6 +383,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) if (!addr) goto fail_free_reader; page->page = (void *)addr; + rb_init_page(page->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -647,6 +653,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (!addr) goto free_pages; page->page = (void *)addr; + rb_init_page(page->page); } } @@ -682,7 +689,7 @@ static inline int rb_null_event(struct ring_buffer_event *event) static inline void *__rb_page_index(struct buffer_page *page, unsigned index) { - return page->page + index; + return page->page->data + index; } static inline struct ring_buffer_event * @@ -712,7 +719,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage) static inline unsigned rb_page_commit(struct buffer_page *bpage) { - return local_read(&bpage->commit); + return local_read(&bpage->page->commit); } /* Size is determined by what has been commited */ @@ -804,14 +811,15 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, cpu_buffer->commit_page == cpu_buffer->tail_page)) return; - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; } /* Now set the commit to the event's index */ - local_set(&cpu_buffer->commit_page->commit, index); + local_set(&cpu_buffer->commit_page->page->commit, index); } static inline void @@ -826,16 +834,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. */ while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; barrier(); } @@ -843,7 +852,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; cpu_buffer->reader_page->read = 0; } @@ -862,7 +871,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter) else rb_inc_page(cpu_buffer, &iter->head_page); - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; iter->head = 0; } @@ -998,12 +1007,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (tail_page == cpu_buffer->tail_page) { local_set(&next_page->write, 0); - local_set(&next_page->commit, 0); + local_set(&next_page->page->commit, 0); cpu_buffer->tail_page = next_page; /* reread the time stamp */ *ts = ring_buffer_time_stamp(cpu_buffer->cpu); - cpu_buffer->tail_page->time_stamp = *ts; + cpu_buffer->tail_page->page->time_stamp = *ts; } /* @@ -1048,7 +1057,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * this page's time stamp. */ if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; return event; @@ -1099,7 +1108,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = *delta & TS_MASK; event->array[0] = *delta >> TS_SHIFT; } else { - cpu_buffer->commit_page->time_stamp = *ts; + cpu_buffer->commit_page->page->time_stamp = *ts; event->time_delta = 0; event->array[0] = 0; } @@ -1552,7 +1561,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else - iter->read_stamp = iter->head_page->time_stamp; + iter->read_stamp = iter->head_page->page->time_stamp; } /** @@ -1696,7 +1705,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ reader->list.prev->next = &cpu_buffer->reader_page->list; @@ -2088,7 +2097,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); - local_set(&cpu_buffer->head_page->commit, 0); + local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2097,7 +2106,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->commit, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; cpu_buffer->overrun = 0; -- cgit v0.10.2 From 8789a9e7df6bf9b93739c4c7d4e380725bc9e936 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:07 -0500 Subject: ring-buffer: read page interface Impact: new API to ring buffer This patch adds a new interface into the ring buffer that allows a page to be read from the ring buffer on a given CPU. For every page read, one must also be given to allow for a "swap" of the pages. rpage = ring_buffer_alloc_read_page(buffer); if (!rpage) goto err; ret = ring_buffer_read_page(buffer, &rpage, cpu, full); if (!ret) goto empty; process_page(rpage); ring_buffer_free_read_page(rpage); The caller of these functions must handle any waits that are needed to wait for new data. The ring_buffer_read_page will simply return 0 if there is no data, or if "full" is set and the writer is still on the current page. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3bb87a7..1a350a8 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -124,6 +124,11 @@ void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, int cpu, int full); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8619c53..50b74d3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -687,6 +687,12 @@ static inline int rb_null_event(struct ring_buffer_event *event) return event->type == RINGBUF_TYPE_PADDING; } +static inline void * +__rb_data_page_index(struct buffer_data_page *page, unsigned index) +{ + return page->data + index; +} + static inline void *__rb_page_index(struct buffer_page *page, unsigned index) { return page->page->data + index; @@ -2232,6 +2238,166 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return 0; } +static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_data_page *page) +{ + struct ring_buffer_event *event; + unsigned long head; + + __raw_spin_lock(&cpu_buffer->lock); + for (head = 0; head < local_read(&page->commit); + head += rb_event_length(event)) { + + event = __rb_data_page_index(page, head); + if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) + return; + /* Only count data entries */ + if (event->type != RINGBUF_TYPE_DATA) + continue; + cpu_buffer->entries--; + } + __raw_spin_unlock(&cpu_buffer->lock); +} + +/** + * ring_buffer_alloc_read_page - allocate a page to read from buffer + * @buffer: the buffer to allocate for. + * + * This function is used in conjunction with ring_buffer_read_page. + * When reading a full page from the ring buffer, these functions + * can be used to speed up the process. The calling function should + * allocate a few pages first with this function. Then when it + * needs to get pages from the ring buffer, it passes the result + * of this function into ring_buffer_read_page, which will swap + * the page that was allocated, with the read page of the buffer. + * + * Returns: + * The page allocated, or NULL on error. + */ +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +{ + unsigned long addr; + struct buffer_data_page *page; + + addr = __get_free_page(GFP_KERNEL); + if (!addr) + return NULL; + + page = (void *)addr; + + return page; +} + +/** + * ring_buffer_free_read_page - free an allocated read page + * @buffer: the buffer the page was allocate for + * @data: the page to free + * + * Free a page allocated from ring_buffer_alloc_read_page. + */ +void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +{ + free_page((unsigned long)data); +} + +/** + * ring_buffer_read_page - extract a page from the ring buffer + * @buffer: buffer to extract from + * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @cpu: the cpu of the buffer to extract + * @full: should the extraction only happen when the page is full. + * + * This function will pull out a page from the ring buffer and consume it. + * @data_page must be the address of the variable that was returned + * from ring_buffer_alloc_read_page. This is because the page might be used + * to swap with a page in the ring buffer. + * + * for example: + * rpage = ring_buffer_alloc_page(buffer); + * if (!rpage) + * return error; + * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * if (ret) + * process_page(rpage); + * + * When @full is set, the function will not return true unless + * the writer is off the reader page. + * + * Note: it is up to the calling functions to handle sleeps and wakeups. + * The ring buffer can be used anywhere in the kernel and can not + * blindly call wake_up. The layer that uses the ring buffer must be + * responsible for that. + * + * Returns: + * 1 if data has been transferred + * 0 if no data has been transferred. + */ +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + struct buffer_data_page *page; + unsigned long flags; + int ret = 0; + + if (!data_page) + return 0; + + page = *data_page; + if (!page) + return 0; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + /* + * rb_buffer_peek will get the next ring buffer if + * the current reader page is empty. + */ + event = rb_buffer_peek(buffer, cpu, NULL); + if (!event) + goto out; + + /* check for data */ + if (!local_read(&cpu_buffer->reader_page->page->commit)) + goto out; + /* + * If the writer is already off of the read page, then simply + * switch the read page with the given page. Otherwise + * we need to copy the data from the reader to the writer. + */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) { + unsigned int read = cpu_buffer->reader_page->read; + + if (full) + goto out; + /* The writer is still on the reader page, we must copy */ + page = cpu_buffer->reader_page->page; + memcpy(page->data, + cpu_buffer->reader_page->page->data + read, + local_read(&page->commit) - read); + + /* consume what was read */ + cpu_buffer->reader_page += read; + + } else { + /* swap the pages */ + rb_init_page(page); + page = cpu_buffer->reader_page->page; + cpu_buffer->reader_page->page = *data_page; + cpu_buffer->reader_page->read = 0; + *data_page = page; + } + ret = 1; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, page); + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} + static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) -- cgit v0.10.2 From 347fdd9dd4e5d3f3a4e415925c35bdff1d59c3a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:08 -0500 Subject: ftrace: clean up function graph asm Impact: clean up There exists macros for x86 asm to handle x86_64 and i386. This patch updates function graph asm to use them. Signed-off-by: Steven Rostedt Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5883247..1a5b8f8 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -467,28 +467,16 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. */ asm volatile( -#ifdef CONFIG_X86_64 - "1: movq (%[parent_old]), %[old]\n" - "2: movq %[return_hooker], (%[parent_replaced])\n" -#else - "1: movl (%[parent_old]), %[old]\n" - "2: movl %[return_hooker], (%[parent_replaced])\n" -#endif + "1: " _ASM_MOV " (%[parent_old]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" " movl $0, %[faulted]\n" ".section .fixup, \"ax\"\n" "3: movl $1, %[faulted]\n" ".previous\n" - ".section __ex_table, \"a\"\n" -#ifdef CONFIG_X86_64 - " .quad 1b, 3b\n" - " .quad 2b, 3b\n" -#else - " .long 1b, 3b\n" - " .long 2b, 3b\n" -#endif - ".previous\n" + _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE(2b, 3b) : [parent_replaced] "=r" (parent), [old] "=r" (old), [faulted] "=r" (faulted) -- cgit v0.10.2 From bb4304c71c97bf727ec43cd2f195c2c237c27fd3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 15:34:09 -0500 Subject: ftrace: have function graph use mcount caller address Impact: consistency change for function graph This patch makes function graph record the mcount caller address the same way the function tracer does. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 958af86..826682a 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1230,6 +1230,7 @@ ENTRY(ftrace_graph_caller) pushl %edx movl 0xc(%esp), %edx lea 0x4(%ebp), %eax + subl $MCOUNT_INSN_SIZE, %edx call prepare_ftrace_return popl %edx popl %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2aa0526..9060ba6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -173,6 +173,7 @@ ENTRY(ftrace_graph_caller) leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi call prepare_ftrace_return -- cgit v0.10.2 From 14a866c567e040ccf6240d68b083dd1dbbde63e6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:02 -0500 Subject: ftrace: add ftrace_graph_stop() Impact: new ftrace_graph_stop function While developing more features of function graph, I hit a bug that caused the WARN_ON to trigger in the prepare_ftrace_return function. Well, it was hard for me to find out that was happening because the bug would not print, it would just cause a hard lockup or reboot. The reason is that it is not safe to call printk from this function. Looking further, I also found that it calls unregister_ftrace_graph, which grabs a mutex and calls kstop machine. This would definitely lock the box up if it were to trigger. This patch adds a fast and safe ftrace_graph_stop() which will stop the function tracer. Then it is safe to call the WARN ON. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1a5b8f8..adba8e9 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -484,14 +484,16 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) : "memory" ); - if (WARN_ON(faulted)) { - unregister_ftrace_graph(); + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); return; } - if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_graph(); + if (unlikely(!__kernel_text_address(old))) { + ftrace_graph_stop(); *parent = old; + WARN_ON(1); return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index afba918..58ca1c3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -376,6 +376,8 @@ typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc); +extern void ftrace_graph_stop(void); + /* The current handlers in use */ extern trace_func_graph_ret_t ftrace_graph_return; extern trace_func_graph_ent_t ftrace_graph_entry; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e78628..a44af05 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1769,5 +1769,10 @@ void ftrace_graph_exit_task(struct task_struct *t) kfree(ret_stack); } + +void ftrace_graph_stop(void) +{ + ftrace_stop(); +} #endif -- cgit v0.10.2 From 044fa782ebb9472cf5253e95d9a625fd4c0bdd99 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:03 -0500 Subject: ring-buffer: change "page" variable names to "bpage" Impact: clean up Andrew Morton pointed out that the kernel convention of a variable named page should be of type page struct. The ring buffer uses a variable named "page" for a pointer to something else. This patch converts those to be called "bpage" (as in "buffer page"). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 50b74d3..7f69cfea 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -208,9 +208,9 @@ struct buffer_page { struct buffer_data_page *page; /* Actual data page */ }; -static void rb_init_page(struct buffer_data_page *page) +static void rb_init_page(struct buffer_data_page *bpage) { - local_set(&page->commit, 0); + local_set(&bpage->commit, 0); } /* @@ -298,19 +298,19 @@ struct ring_buffer_iter { static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) return -1; if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) return -1; - list_for_each_entry_safe(page, tmp, head, list) { + list_for_each_entry_safe(bpage, tmp, head, list) { if (RB_WARN_ON(cpu_buffer, - page->list.next->prev != &page->list)) + bpage->list.next->prev != &bpage->list)) return -1; if (RB_WARN_ON(cpu_buffer, - page->list.prev->next != &page->list)) + bpage->list.prev->next != &bpage->list)) return -1; } @@ -321,23 +321,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; for (i = 0; i < nr_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; - rb_init_page(page->page); + bpage->page = (void *)addr; + rb_init_page(bpage->page); } list_splice(&pages, head); @@ -347,9 +347,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, return 0; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } return -ENOMEM; } @@ -358,7 +358,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_page *page; + struct buffer_page *bpage; unsigned long addr; int ret; @@ -373,17 +373,17 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); - page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if (!page) + if (!bpage) goto fail_free_buffer; - cpu_buffer->reader_page = page; + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) goto fail_free_reader; - page->page = (void *)addr; - rb_init_page(page->page); + bpage->page = (void *)addr; + rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -408,14 +408,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = &cpu_buffer->pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; list_del_init(&cpu_buffer->reader_page->list); free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(page, tmp, head, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } kfree(cpu_buffer); } @@ -512,7 +512,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct buffer_page *page; + struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -523,9 +523,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) return; p = cpu_buffer->pages.next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - free_buffer_page(page); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + free_buffer_page(bpage); } if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) return; @@ -542,7 +542,7 @@ static void rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *pages, unsigned nr_pages) { - struct buffer_page *page; + struct buffer_page *bpage; struct list_head *p; unsigned i; @@ -553,9 +553,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, list_empty(pages))) return; p = pages->next; - page = list_entry(p, struct buffer_page, list); - list_del_init(&page->list); - list_add_tail(&page->list, &cpu_buffer->pages); + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + list_add_tail(&bpage->list, &cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); @@ -582,7 +582,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) { struct ring_buffer_per_cpu *cpu_buffer; unsigned nr_pages, rm_pages, new_pages; - struct buffer_page *page, *tmp; + struct buffer_page *bpage, *tmp; unsigned long buffer_size; unsigned long addr; LIST_HEAD(pages); @@ -643,17 +643,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) for_each_buffer_cpu(buffer, cpu) { for (i = 0; i < new_pages; i++) { - page = kzalloc_node(ALIGN(sizeof(*page), + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); - if (!page) + if (!bpage) goto free_pages; - list_add(&page->list, &pages); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); if (!addr) goto free_pages; - page->page = (void *)addr; - rb_init_page(page->page); + bpage->page = (void *)addr; + rb_init_page(bpage->page); } } @@ -674,9 +674,9 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return size; free_pages: - list_for_each_entry_safe(page, tmp, &pages, list) { - list_del_init(&page->list); - free_buffer_page(page); + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); } mutex_unlock(&buffer->mutex); return -ENOMEM; @@ -688,14 +688,14 @@ static inline int rb_null_event(struct ring_buffer_event *event) } static inline void * -__rb_data_page_index(struct buffer_data_page *page, unsigned index) +__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { - return page->data + index; + return bpage->data + index; } -static inline void *__rb_page_index(struct buffer_page *page, unsigned index) +static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { - return page->page->data + index; + return bpage->page->data + index; } static inline struct ring_buffer_event * @@ -771,14 +771,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) } static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **page) + struct buffer_page **bpage) { - struct list_head *p = (*page)->list.next; + struct list_head *p = (*bpage)->list.next; if (p == &cpu_buffer->pages) p = p->next; - *page = list_entry(p, struct buffer_page, list); + *bpage = list_entry(p, struct buffer_page, list); } static inline unsigned @@ -2239,16 +2239,16 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, } static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *page) + struct buffer_data_page *bpage) { struct ring_buffer_event *event; unsigned long head; __raw_spin_lock(&cpu_buffer->lock); - for (head = 0; head < local_read(&page->commit); + for (head = 0; head < local_read(&bpage->commit); head += rb_event_length(event)) { - event = __rb_data_page_index(page, head); + event = __rb_data_page_index(bpage, head); if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ @@ -2277,15 +2277,15 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) { unsigned long addr; - struct buffer_data_page *page; + struct buffer_data_page *bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) return NULL; - page = (void *)addr; + bpage = (void *)addr; - return page; + return bpage; } /** @@ -2337,15 +2337,15 @@ int ring_buffer_read_page(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; - struct buffer_data_page *page; + struct buffer_data_page *bpage; unsigned long flags; int ret = 0; if (!data_page) return 0; - page = *data_page; - if (!page) + bpage = *data_page; + if (!bpage) return 0; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2372,26 +2372,26 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (full) goto out; /* The writer is still on the reader page, we must copy */ - page = cpu_buffer->reader_page->page; - memcpy(page->data, + bpage = cpu_buffer->reader_page->page; + memcpy(bpage->data, cpu_buffer->reader_page->page->data + read, - local_read(&page->commit) - read); + local_read(&bpage->commit) - read); /* consume what was read */ cpu_buffer->reader_page += read; } else { /* swap the pages */ - rb_init_page(page); - page = cpu_buffer->reader_page->page; + rb_init_page(bpage); + bpage = cpu_buffer->reader_page->page; cpu_buffer->reader_page->page = *data_page; cpu_buffer->reader_page->read = 0; - *data_page = page; + *data_page = bpage; } ret = 1; /* update the entry counter */ - rb_remove_entries(cpu_buffer, page); + rb_remove_entries(cpu_buffer, bpage); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v0.10.2 From 7ee991fbc6f947e9b04f29c9c6c1d057d0671a16 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:04 -0500 Subject: ftrace: print real return in dumpstack for function graph Impact: better dumpstack output I noticed in my crash dumps and even in the stack tracer that a lot of functions listed in the stack trace are simply return_to_handler which is ftrace graphs way to insert its own call into the return of a function. But we lose out where the actually function was called from. This patch adds in hooks to the dumpstack mechanism that detects this and finds the real function to print. Both are printed to let the user know that a hook is still in place. This does give a funny side effect in the stack tracer output: Depth Size Location (80 entries) ----- ---- -------- 0) 4144 48 save_stack_trace+0x2f/0x4d 1) 4096 128 ftrace_call+0x5/0x2b 2) 3968 16 mempool_alloc_slab+0x16/0x18 3) 3952 384 return_to_handler+0x0/0x73 4) 3568 -240 stack_trace_call+0x11d/0x209 5) 3808 144 return_to_handler+0x0/0x73 6) 3664 -128 mempool_alloc+0x4d/0xfe 7) 3792 128 return_to_handler+0x0/0x73 8) 3664 -32 scsi_sg_alloc+0x48/0x4a [scsi_mod] As you can see, the real functions are now negative. This is due to them not being found inside the stack. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 5962176..6b1f6f6 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -30,6 +30,37 @@ void printk_address(unsigned long address, int reliable) reliable ? "" : "? ", (void *) address); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ + struct task_struct *task = tinfo->task; + unsigned long ret_addr; + int index = task->curr_ret_stack; + + if (addr != (unsigned long)return_to_handler) + return; + + if (!task->ret_stack || index < *graph) + return; + + index -= *graph; + ret_addr = task->ret_stack[index].ret; + + ops->address(data, ret_addr, 1); + + (*graph)++; +} +#else +static inline void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ } +#endif + /* * x86-64 can have up to three kernel stacks: * process stack @@ -54,7 +85,7 @@ unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end) + unsigned long *end, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -70,6 +101,7 @@ print_context_stack(struct thread_info *tinfo, } else { ops->address(data, addr, bp == 0); } + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); } stack++; } diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 3119a80..da87590b 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -18,7 +18,7 @@ extern unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end); + unsigned long *end, int *graph); extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 7b031b1..d593cd1 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -23,6 +23,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + int graph = 0; + if (!task) task = current; @@ -50,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data, NULL); + bp = print_context_stack(context, stack, bp, ops, + data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 33ff102..c302d07 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -109,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; + int graph = 0; if (!task) task = current; @@ -149,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, break; bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); + data, estack_end, &graph); ops->stack(data, ""); /* * We link to the next stack via the @@ -168,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); + ops, data, irqstack_end, &graph); /* * We link to the next stack (which would be * the process stack normally) the last @@ -186,7 +187,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); -- cgit v0.10.2 From e49dc19c6a19ea112fcb94b7c62ec62cdd5c08aa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:05 -0500 Subject: ftrace: function graph return for function entry Impact: feature, let entry function decide to trace or not This patch lets the graph tracer entry function decide if the tracing should be done at the end as well. This requires all function graph entry functions return 1 if it should trace, or 0 if the return should not be traced. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 826682a..43ceb3f 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1196,6 +1196,9 @@ ENTRY(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9060ba6..54e0bbd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -120,6 +120,9 @@ ENTRY(mcount) #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpq $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller #endif .globl ftrace_stub diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index adba8e9..d278ad2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -425,6 +425,7 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index; + barrier(); current->curr_ret_stack--; } @@ -506,7 +507,11 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } trace.func = self_addr; - ftrace_graph_entry(&trace); + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + current->curr_ret_stack--; + *parent = old; + } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 58ca1c3..469ceb3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -371,7 +371,7 @@ struct ftrace_graph_ret { #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ -typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ +typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a44af05..65b9e86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1636,11 +1636,15 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, static atomic_t ftrace_graph_active; +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + /* The callbacks that hook a function */ trace_func_graph_ret_t ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; -trace_func_graph_ent_t ftrace_graph_entry = - (trace_func_graph_ent_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -1738,7 +1742,7 @@ void unregister_ftrace_graph(void) atomic_dec(&ftrace_graph_active); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; - ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); mutex_unlock(&ftrace_sysctl_lock); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 380de63..8b6409a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1200,7 +1200,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -void trace_graph_entry(struct ftrace_graph_ent *trace) +int trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1219,6 +1219,8 @@ void trace_graph_entry(struct ftrace_graph_ent *trace) } atomic_dec(&data->disabled); local_irq_restore(flags); + + return 1; } void trace_graph_return(struct ftrace_graph_ret *trace) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f96f4e7..0565ae9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -412,7 +412,7 @@ void trace_function(struct trace_array *tr, unsigned long flags, int pc); void trace_graph_return(struct ftrace_graph_ret *trace); -void trace_graph_entry(struct ftrace_graph_ent *trace); +int trace_graph_entry(struct ftrace_graph_ent *trace); void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to); -- cgit v0.10.2 From 62679efe0a5f02987a621942afc5979a80a6ca5a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2008 23:50:06 -0500 Subject: ftrace: add checks on ret stack in function graph Import: robustness checks Add more checks in the function graph code to detect errors and perhaps print out better information if a bug happens. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d278ad2..f98c407 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -420,6 +420,15 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) int index; index = current->curr_ret_stack; + + if (unlikely(index < 0)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + *ret = current->ret_stack[index].ret; trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; @@ -427,6 +436,7 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->depth = index; barrier(); current->curr_ret_stack--; + } /* @@ -442,6 +452,13 @@ unsigned long ftrace_return_to_handler(void) trace.rettime = cpu_clock(raw_smp_processor_id()); ftrace_graph_return(&trace); + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + return ret; } -- cgit v0.10.2 From 11e84acc400921743cc8d488e4a265cd98a655c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Dec 2008 02:30:37 +0100 Subject: tracing/function-graph-tracer: display unified style cmdline and pid Impact: extend function-graph output: let one know which thread called a function This patch implements a helper function to print the couple cmdline/pid. Its output is provided during task switching and on each row if the new "funcgraph-proc" defualt-off option is set through trace_options file. The output is center aligned and never exceeds 14 characters. The cmdline is truncated over 7 chars. But note that if the pid exceeds 6 characters, the column will overflow (but the situation is abnormal). Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 894b50b..23e19d2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -19,6 +19,7 @@ #define TRACE_GRAPH_PRINT_OVERRUN 0x1 #define TRACE_GRAPH_PRINT_CPU 0x2 #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 static struct tracer_opt trace_opts[] = { /* Display overruns ? */ @@ -27,11 +28,13 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, /* Display Overhead ? */ { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, + /* Display proc name/pid */ + { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - /* Don't display overruns by default */ + /* Don't display overruns and proc by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, .opts = trace_opts }; @@ -104,23 +107,63 @@ print_graph_cpu(struct trace_seq *s, int cpu) return TRACE_TYPE_HANDLED; } +#define TRACE_GRAPH_PROCINFO_LENGTH 14 + +static enum print_line_t +print_graph_proc(struct trace_seq *s, pid_t pid) +{ + int i; + int ret; + int len; + char comm[8]; + int spaces = 0; + /* sign + log10(MAX_INT) + '\0' */ + char pid_str[11]; + + strncpy(comm, trace_find_cmdline(pid), 7); + comm[7] = '\0'; + sprintf(pid_str, "%d", pid); + + /* 1 stands for the "-" character */ + len = strlen(comm) + strlen(pid_str) + 1; + + if (len < TRACE_GRAPH_PROCINFO_LENGTH) + spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; + + /* First spaces to align center */ + for (i = 0; i < spaces / 2; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "%s-%s", comm, pid_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Last spaces to align center */ + for (i = 0; i < spaces - (spaces / 2); i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + /* If the pid changed since the last trace, output this event */ -static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) +static enum print_line_t +verif_pid(struct trace_seq *s, pid_t pid, int cpu) { - char *comm, *prev_comm; pid_t prev_pid; int ret; if (last_pid[cpu] != -1 && last_pid[cpu] == pid) - return 1; + return TRACE_TYPE_HANDLED; prev_pid = last_pid[cpu]; last_pid[cpu] = pid; - comm = trace_find_cmdline(pid); - prev_comm = trace_find_cmdline(prev_pid); - /* * Context-switch trace line: @@ -130,11 +173,31 @@ static int verif_pid(struct trace_seq *s, pid_t pid, int cpu) */ ret = trace_seq_printf(s, - " ------------------------------------------\n"); - ret += trace_seq_printf(s, " | %d) %s-%d => %s-%d\n", - cpu, prev_comm, prev_pid, comm, pid); - ret += trace_seq_printf(s, - " ------------------------------------------\n\n"); + "\n ------------------------------------------\n |"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, prev_pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " => "); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, + "\n ------------------------------------------\n\n"); + if (!ret) + TRACE_TYPE_PARTIAL_LINE; + return ret; } @@ -288,12 +351,23 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_entry *ent = iter->ent; /* Pid */ - if (!verif_pid(s, ent->pid, cpu)) + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -318,12 +392,23 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, duration = 9999999ULL; /* Pid */ - if (!verif_pid(s, ent->pid, cpu)) + if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } -- cgit v0.10.2 From 166d3c7994d79ab3f78f420607283361ff5cce79 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Dec 2008 02:32:12 +0100 Subject: tracing/function-graph-tracer: improve duration output Impact: better trace output of duration for long calls The old duration output didn't exceeded 9999.999 us to fit the column and the nanosecs were always 3 numbers. As Ingo suggested, it's better to have the whole microseconds elapsed time and shift the nanosecs precision if needed to fit the maximum 7 numbers. And usec need more number, the case should be rare and important enough to break a bit the column alignment to show it. So, depending of the duration value, we now have these patterns: u.nnn us uu.nnn us uuu.nnn us uuuu.nnn us uuuuu.nn us uuuuuu.n us uuuuuuuu..... us Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 23e19d2..c66578f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -232,11 +232,50 @@ trace_branch_is_leaf(struct trace_iterator *iter, } -static inline int +static enum print_line_t print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); - return trace_seq_printf(s, "%4llu.%3lu us | ", duration, nsecs_rem); + /* log10(ULONG_MAX) + '\0' */ + char msecs_str[21]; + char nsecs_str[5]; + int ret, len; + int i; + + sprintf(msecs_str, "%lu", (unsigned long) duration); + + /* Print msecs */ + ret = trace_seq_printf(s, msecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + len = strlen(msecs_str); + + /* Print nsecs (we don't want to exceed 7 numbers) */ + if (len < 7) { + snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + ret = trace_seq_printf(s, ".%s", nsecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + len += strlen(nsecs_str); + } + + ret = trace_seq_printf(s, " us "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Print remaining spaces to fit the row's width */ + for (i = len; i < 7; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "| "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } /* Signal a overhead of time execution to the output */ @@ -273,10 +312,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; - /* Must not exceed 8 characters: 9999.999 us */ - if (duration > 10000000ULL) - duration = 9999999ULL; - /* Overhead */ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { ret = print_graph_overhead(duration, s); @@ -286,7 +321,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, /* Duration */ ret = print_graph_duration(duration, s); - if (!ret) + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Function */ @@ -387,10 +422,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; unsigned long long duration = trace->rettime - trace->calltime; - /* Must not exceed 8 characters: xxxx.yyy us */ - if (duration > 10000000ULL) - duration = 9999999ULL; - /* Pid */ if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -422,7 +453,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Duration */ ret = print_graph_duration(duration, s); - if (!ret) + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Closing brace */ -- cgit v0.10.2 From 764f3b95131a7ce5c992e3d00caf590fcada2f7b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Dec 2008 10:33:58 +0100 Subject: tracing/function-graph-tracer: enabled by default CONFIG_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER already, (turning it non-default) so it so making it default-n is pointless. So enable it by default - it's a nice extension of the function tracer. Signed-off-by: Ingo Molnar diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b6b673..bde6f03 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -67,6 +67,7 @@ config FUNCTION_GRAPH_TRACER bool "Kernel Function Graph Tracer" depends on HAVE_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER + default y help Enable the kernel to trace a function at both its return and its entry. -- cgit v0.10.2 From 0a37119d963e876ca86912497346ec50dea2541b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 11:04:50 -0500 Subject: trace: fix output of stack trace Impact: fix to output of stack trace If a function is not found in the stack of the stack tracer, the number printed is quite strange. This fixes the algorithm to handle missing functions better. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 06a1611..0b863f2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -78,6 +78,7 @@ static inline void check_stack(void) * on a new max, so it is far from a fast path. */ while (i < max_stack_trace.nr_entries) { + int found = 0; stack_dump_index[i] = this_size; p = start; @@ -86,12 +87,14 @@ static inline void check_stack(void) if (*p == stack_dump_trace[i]) { this_size = stack_dump_index[i++] = (top - p) * sizeof(unsigned long); + found = 1; /* Start the search from here */ start = p + 1; } } - i++; + if (!found) + i++; } out: -- cgit v0.10.2 From e8e1abe92fd7ea9d823a3aaf81d10e2cba593b6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 11:04:51 -0500 Subject: ftrace: fix race in function graph during fork Impact: graph tracer race/crash fix There is a nasy race in startup of a new process running the function graph tracer. In fork.c: total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); return p; The new task is free to run as soon as the tasklist_lock is released. This is before the ftrace_graph_init_task. If the task does run it will be using the same ret_stack and curr_ret_stack as the parent. This will cause crashes that are difficult to debug. This patch moves the ftrace_graph_init_task to just after the alloc_pid code. This fixes the above race. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/fork.c b/kernel/fork.c index 5f82a99..7407ab3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1137,6 +1137,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } + ftrace_graph_init_task(p); + p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -1145,7 +1147,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) - goto bad_fork_free_pid; + goto bad_fork_free_graph; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; @@ -1238,7 +1240,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; + goto bad_fork_free_graph; } if (clone_flags & CLONE_THREAD) { @@ -1271,11 +1273,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); return p; +bad_fork_free_graph: + ftrace_graph_exit_task(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); -- cgit v0.10.2 From ea4e2bc4d9f7370e57a343ccb5e7c0ad3222ec3c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 15:36:57 -0500 Subject: ftrace: graph of a single function This patch adds the file: /debugfs/tracing/set_graph_function which can be used along with the function graph tracer. When this file is empty, the function graph tracer will act as usual. When the file has a function in it, the function graph tracer will only trace that function. For example: # echo blk_unplug > /debugfs/tracing/set_graph_function # cat /debugfs/tracing/trace [...] ------------------------------------------ | 2) make-19003 => kjournald-2219 ------------------------------------------ 2) | blk_unplug() { 2) | dm_unplug_all() { 2) | dm_get_table() { 2) 1.381 us | _read_lock(); 2) 0.911 us | dm_table_get(); 2) 1. 76 us | _read_unlock(); 2) + 12.912 us | } 2) | dm_table_unplug_all() { 2) | blk_unplug() { 2) 0.778 us | generic_unplug_device(); 2) 2.409 us | } 2) 5.992 us | } 2) 0.813 us | dm_table_put(); 2) + 29. 90 us | } 2) + 34.532 us | } You can add up to 32 functions into this file. Currently we limit it to 32, but this may change with later improvements. To add another function, use the append '>>': # echo sys_read >> /debugfs/tracing/set_graph_function # cat /debugfs/tracing/set_graph_function blk_unplug sys_read Using the '>' will clear out the function and write anew: # echo sys_write > /debug/tracing/set_graph_function # cat /debug/tracing/set_graph_function sys_write Note, if you have function graph running while doing this, the small time between clearing it and updating it will cause the graph to record all functions. This should not be an issue because after it sets the filter, only those functions will be recorded from then on. If you need to only record a particular function then set this file first before starting the function graph tracer. In the future this side effect may be corrected. The set_graph_function file is similar to the set_ftrace_filter but it does not take wild cards nor does it allow for more than one function to be set with a single write. There is no technical reason why this is the case, I just do not have the time yet to implement that. Note, dynamic ftrace must be enabled for this to appear because it uses the dynamic ftrace records to match the name to the mcount call sites. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 469ceb3..b295d31 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -7,6 +7,7 @@ #include #include #include +#include #ifdef CONFIG_FUNCTION_TRACER @@ -391,4 +392,49 @@ static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } #endif +#ifdef CONFIG_TRACING +#include + +/* flags for current->trace */ +enum { + TSK_TRACE_FL_TRACE_BIT = 0, + TSK_TRACE_FL_GRAPH_BIT = 1, +}; +enum { + TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT, + TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT, +}; + +static inline void set_tsk_trace_trace(struct task_struct *tsk) +{ + set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); +} + +static inline void clear_tsk_trace_trace(struct task_struct *tsk) +{ + clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace); +} + +static inline int test_tsk_trace_trace(struct task_struct *tsk) +{ + return tsk->trace & TSK_TRACE_FL_TRACE; +} + +static inline void set_tsk_trace_graph(struct task_struct *tsk) +{ + set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); +} + +static inline void clear_tsk_trace_graph(struct task_struct *tsk) +{ + clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace); +} + +static inline int test_tsk_trace_graph(struct task_struct *tsk) +{ + return tsk->trace & TSK_TRACE_FL_GRAPH; +} + +#endif /* CONFIG_TRACING */ + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2d0a93c..4c152e0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1380,6 +1380,10 @@ struct task_struct { */ atomic_t trace_overrun; #endif +#ifdef CONFIG_TRACING + /* state flags for use by tracers */ + unsigned long trace; +#endif }; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65b9e86..b17a303 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1320,6 +1320,224 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static DEFINE_MUTEX(graph_lock); + +int ftrace_graph_count; +unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned long *array = m->private; + int index = *pos; + + (*pos)++; + + if (index >= ftrace_graph_count) + return NULL; + + return &array[index]; +} + +static void *g_start(struct seq_file *m, loff_t *pos) +{ + void *p = NULL; + + mutex_lock(&graph_lock); + + p = g_next(m, p, pos); + + return p; +} + +static void g_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&graph_lock); +} + +static int g_show(struct seq_file *m, void *v) +{ + unsigned long *ptr = v; + char str[KSYM_SYMBOL_LEN]; + + if (!ptr) + return 0; + + kallsyms_lookup(*ptr, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations ftrace_graph_seq_ops = { + .start = g_start, + .next = g_next, + .stop = g_stop, + .show = g_show, +}; + +static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&graph_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) { + ftrace_graph_count = 0; + memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &ftrace_graph_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = ftrace_graph_funcs; + } + } else + file->private_data = ftrace_graph_funcs; + mutex_unlock(&graph_lock); + + return ret; +} + +static ssize_t +ftrace_graph_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static int +ftrace_set_func(unsigned long *array, int idx, char *buffer) +{ + char str[KSYM_SYMBOL_LEN]; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int found = 0; + int i; + + if (ftrace_disabled) + return -ENODEV; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) + continue; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + if (strcmp(str, buffer) == 0) { + found = 1; + array[idx] = rec->ip; + break; + } + } + } + spin_unlock(&ftrace_lock); + + return found ? 0 : -EINVAL; +} + +static ssize_t +ftrace_graph_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned long *array; + size_t read = 0; + ssize_t ret; + int index = 0; + char ch; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&graph_lock); + + if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { + ret = -EBUSY; + goto out; + } + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + array = m->private; + } else + array = file->private_data; + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + while (cnt && !isspace(ch)) { + if (index < FTRACE_BUFF_MAX) + buffer[index++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + buffer[index] = 0; + + /* we allow only one at a time */ + ret = ftrace_set_func(array, ftrace_graph_count, buffer); + if (ret) + goto out; + + ftrace_graph_count++; + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&graph_lock); + + return ret; +} + +static const struct file_operations ftrace_graph_fops = { + .open = ftrace_graph_open, + .read = ftrace_graph_read, + .write = ftrace_graph_write, +}; +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { struct dentry *entry; @@ -1347,6 +1565,15 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + NULL, + &ftrace_graph_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_graph_function' entry\n"); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8b6409a..710b39a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1209,6 +1209,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; + if (!ftrace_graph_addr(trace->func)) + return 0; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -1217,6 +1220,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) pc = preempt_count(); __trace_graph_entry(tr, data, trace, flags, pc); } + /* Only do the atomic if it is not already set */ + if (!test_tsk_trace_graph(current)) + set_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -1240,6 +1246,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) pc = preempt_count(); __trace_graph_return(tr, data, trace, flags, pc); } + if (!trace->depth) + clear_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0565ae9..41f026b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -505,13 +505,41 @@ extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); + +#ifdef CONFIG_DYNAMIC_FTRACE +/* TODO: make this variable */ +#define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_count; +extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; + +static inline int ftrace_graph_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_count || test_tsk_trace_graph(current)) + return 1; + + for (i = 0; i < ftrace_graph_count; i++) { + if (addr == ftrace_graph_funcs[i]) + return 1; + } + + return 0; +} #else +static inline int ftrace_trace_addr(unsigned long addr) +{ + return 1 +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } -#endif +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* * trace_iterator_flags is an enumeration that defines bit -- cgit v0.10.2 From 0ef8cde56ab92ab3f65221246dc1622c6b5068b3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 15:36:58 -0500 Subject: ftrace: use task struct trace flag to filter on pid Impact: clean up Use the new task struct trace flags to determine if a process should be traced or not. Note: this moves the searching of the pid to the slow path of setting the pid field. This needs to be converted to the pid name space. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b17a303..c5049f5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -47,7 +47,7 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; -/* ftrace_pid_trace >= 0 will only trace threads with this pid */ +/* set when tracing only a pid */ static int ftrace_pid_trace = -1; /* Quick disabling of function tracer. */ @@ -90,7 +90,7 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) { - if (current->pid != ftrace_pid_trace) + if (!test_tsk_trace_trace(current)) return; ftrace_pid_function(ip, parent_ip); @@ -1714,11 +1714,33 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, ftrace_pid_trace = -1; } else { + struct task_struct *p; + int found = 0; if (ftrace_pid_trace == val) goto out; - ftrace_pid_trace = val; + /* + * Find the task that matches this pid. + * TODO: use pid namespaces instead. + */ + rcu_read_lock(); + for_each_process(p) { + if (p->pid == val) { + found = 1; + set_tsk_trace_trace(p); + } else if (test_tsk_trace_trace(p)) + clear_tsk_trace_trace(p); + } + rcu_read_unlock(); + + if (found) + ftrace_pid_trace = val; + else { + if (ftrace_pid_trace < 0) + goto out; + ftrace_pid_trace = -1; + } } /* update the function call */ -- cgit v0.10.2 From 804a685162a7080386714166776f57255a75238e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Dec 2008 15:36:59 -0500 Subject: ftrace: trace single pid for function graph tracer Impact: New feature This patch makes the changes to set_ftrace_pid apply to the function graph tracer. # echo $$ > /debugfs/tracing/set_ftrace_pid # echo function_graph > /debugfs/tracing/current_tracer Will cause only the current task to be traced. Note, the trace flags are also inherited by child processes, so the children of the shell will also be traced. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c5049f5..57592a9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -48,7 +48,7 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; /* set when tracing only a pid */ -static int ftrace_pid_trace = -1; +int ftrace_pid_trace = -1; /* Quick disabling of function tracer. */ int function_trace_stop; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 710b39a..1bd9574 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1209,6 +1209,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; + if (!ftrace_trace_task(current)) + return 0; + if (!ftrace_graph_addr(trace->func)) return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 41f026b..95fff37 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -541,6 +541,16 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +extern int ftrace_pid_trace; + +static inline int ftrace_trace_task(struct task_struct *task) +{ + if (ftrace_pid_trace < 0) + return 1; + + return test_tsk_trace_trace(task); +} + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. -- cgit v0.10.2 From 5ef6476190d24419a9a537baa0b5641845136989 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Dec 2008 00:26:39 -0500 Subject: pid: fix the do_each_pid_task() macro Impact: macro side-effects fix This patch adds parenthesis around 'pid' in the do_each_pid_task macro to allow callers to pass in more complex parameters. e.g. do_each_pid_task(*pid, type, task) Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/pid.h b/include/linux/pid.h index d7e98ff..bb206c5 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -147,9 +147,9 @@ pid_t pid_vnr(struct pid *pid); #define do_each_pid_task(pid, type, task) \ do { \ struct hlist_node *pos___; \ - if (pid != NULL) \ + if ((pid) != NULL) \ hlist_for_each_entry_rcu((task), pos___, \ - &pid->tasks[type], pids[type].node) { + &(pid)->tasks[type], pids[type].node) { /* * Both old and new leaders may be attached to -- cgit v0.10.2 From 978f3a45d9499c7a447ca7615455cefb63d44165 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Dec 2008 00:26:40 -0500 Subject: ftrace: use struct pid Impact: clean up, extend PID filtering to PID namespaces Eric Biederman suggested using the struct pid for filtering on pids in the kernel. This patch is based off of a demonstration of an implementation that Eric sent me in an email. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57592a9..10b1d7c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -48,7 +48,7 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; /* set when tracing only a pid */ -int ftrace_pid_trace = -1; +struct pid *ftrace_pid_trace; /* Quick disabling of function tracer. */ int function_trace_stop; @@ -153,7 +153,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) else func = ftrace_list_func; - if (ftrace_pid_trace >= 0) { + if (ftrace_pid_trace) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -209,7 +209,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list->next == &ftrace_list_end) { ftrace_func_t func = ftrace_list->func; - if (ftrace_pid_trace >= 0) { + if (ftrace_pid_trace) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -239,7 +239,7 @@ static void ftrace_update_pid_func(void) func = ftrace_trace_function; - if (ftrace_pid_trace >= 0) { + if (ftrace_pid_trace) { set_ftrace_pid_function(func); func = ftrace_pid_func; } else { @@ -1678,18 +1678,40 @@ ftrace_pid_read(struct file *file, char __user *ubuf, char buf[64]; int r; - if (ftrace_pid_trace >= 0) - r = sprintf(buf, "%u\n", ftrace_pid_trace); + if (ftrace_pid_trace) + r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); else r = sprintf(buf, "no pid\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +static void clear_ftrace_pid_task(struct pid **pid) +{ + struct task_struct *p; + + do_each_pid_task(*pid, PIDTYPE_PID, p) { + clear_tsk_trace_trace(p); + } while_each_pid_task(*pid, PIDTYPE_PID, p); + put_pid(*pid); + + *pid = NULL; +} + +static void set_ftrace_pid_task(struct pid *pid) +{ + struct task_struct *p; + + do_each_pid_task(pid, PIDTYPE_PID, p) { + set_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct pid *pid; char buf[64]; long val; int ret; @@ -1707,40 +1729,30 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, return ret; mutex_lock(&ftrace_start_lock); - if (ret < 0) { + if (val < 0) { /* disable pid tracing */ - if (ftrace_pid_trace < 0) + if (!ftrace_pid_trace) goto out; - ftrace_pid_trace = -1; + + clear_ftrace_pid_task(&ftrace_pid_trace); } else { - struct task_struct *p; - int found = 0; + pid = find_get_pid(val); - if (ftrace_pid_trace == val) + if (pid == ftrace_pid_trace) { + put_pid(pid); goto out; - - /* - * Find the task that matches this pid. - * TODO: use pid namespaces instead. - */ - rcu_read_lock(); - for_each_process(p) { - if (p->pid == val) { - found = 1; - set_tsk_trace_trace(p); - } else if (test_tsk_trace_trace(p)) - clear_tsk_trace_trace(p); } - rcu_read_unlock(); - if (found) - ftrace_pid_trace = val; - else { - if (ftrace_pid_trace < 0) - goto out; - ftrace_pid_trace = -1; - } + if (ftrace_pid_trace) + clear_ftrace_pid_task(&ftrace_pid_trace); + + if (!pid) + goto out; + + ftrace_pid_trace = pid; + + set_ftrace_pid_task(ftrace_pid_trace); } /* update the function call */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 95fff37..8b81b4d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -541,11 +541,11 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -extern int ftrace_pid_trace; +extern struct pid *ftrace_pid_trace; static inline int ftrace_trace_task(struct task_struct *task) { - if (ftrace_pid_trace < 0) + if (ftrace_pid_trace) return 1; return test_tsk_trace_trace(task); -- cgit v0.10.2 From e32d89569128e76bdf84867be0928902ca9f7555 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Dec 2008 00:26:41 -0500 Subject: ftrace: add ability to only trace swapper tasks Impact: new feature This patch lets the swapper tasks of all CPUS be filtered by the set_ftrace_pid file. If '0' is echoed into this file, then all the idle tasks (aka swapper) is flagged to be traced. This affects all CPU idle tasks. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 10b1d7c..eb57dc1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -49,6 +49,7 @@ static int last_ftrace_enabled; /* set when tracing only a pid */ struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = (struct pid *)1; /* Quick disabling of function tracer. */ int function_trace_stop; @@ -1678,7 +1679,9 @@ ftrace_pid_read(struct file *file, char __user *ubuf, char buf[64]; int r; - if (ftrace_pid_trace) + if (ftrace_pid_trace == ftrace_swapper_pid) + r = sprintf(buf, "swapper tasks\n"); + else if (ftrace_pid_trace) r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); else r = sprintf(buf, "no pid\n"); @@ -1686,19 +1689,43 @@ ftrace_pid_read(struct file *file, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static void clear_ftrace_pid_task(struct pid **pid) +static void clear_ftrace_swapper(void) { struct task_struct *p; + int cpu; - do_each_pid_task(*pid, PIDTYPE_PID, p) { + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); clear_tsk_trace_trace(p); - } while_each_pid_task(*pid, PIDTYPE_PID, p); - put_pid(*pid); + } + put_online_cpus(); +} - *pid = NULL; +static void set_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + set_tsk_trace_trace(p); + } + put_online_cpus(); } -static void set_ftrace_pid_task(struct pid *pid) +static void clear_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + do_each_pid_task(pid, PIDTYPE_PID, p) { + clear_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); + put_pid(pid); +} + +static void set_ftrace_pid(struct pid *pid) { struct task_struct *p; @@ -1707,6 +1734,24 @@ static void set_ftrace_pid_task(struct pid *pid) } while_each_pid_task(pid, PIDTYPE_PID, p); } +static void clear_ftrace_pid_task(struct pid **pid) +{ + if (*pid == ftrace_swapper_pid) + clear_ftrace_swapper(); + else + clear_ftrace_pid(*pid); + + *pid = NULL; +} + +static void set_ftrace_pid_task(struct pid *pid) +{ + if (pid == ftrace_swapper_pid) + set_ftrace_swapper(); + else + set_ftrace_pid(pid); +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1737,11 +1782,18 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, clear_ftrace_pid_task(&ftrace_pid_trace); } else { - pid = find_get_pid(val); + /* swapper task is special */ + if (!val) { + pid = ftrace_swapper_pid; + if (pid == ftrace_pid_trace) + goto out; + } else { + pid = find_get_pid(val); - if (pid == ftrace_pid_trace) { - put_pid(pid); - goto out; + if (pid == ftrace_pid_trace) { + put_pid(pid); + goto out; + } } if (ftrace_pid_trace) -- cgit v0.10.2 From 6b2539302bee8e88c99e3c7d80c16a04dbe5e2ad Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 4 Dec 2008 09:18:28 +0100 Subject: tracing: fix typo and missing inline function Impact: fix build bugs Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b81b4d..b4b7b73 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -529,7 +529,11 @@ static inline int ftrace_graph_addr(unsigned long addr) #else static inline int ftrace_trace_addr(unsigned long addr) { - return 1 + return 1; +} +static inline int ftrace_graph_addr(unsigned long addr) +{ + return 1; } #endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v0.10.2 From faec2ec505d397e9426754722b6e80d519c4938f Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Thu, 4 Dec 2008 14:24:49 +0800 Subject: ftrace: avoid duplicated function when writing set_graph_function Impact: fix a bug in function filter setting when writing function to set_graph_function, we should check whether it has existed in set_graph_function to avoid duplicating. Signed-off-by: Liming Wang Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb57dc1..d2b1565 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1425,7 +1425,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) struct dyn_ftrace *rec; struct ftrace_page *pg; int found = 0; - int i; + int i, j; if (ftrace_disabled) return -ENODEV; @@ -1443,7 +1443,13 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); if (strcmp(str, buffer) == 0) { found = 1; - array[idx] = rec->ip; + for (j = 0; j < idx; j++) + if (array[j] == rec->ip) { + found = 0; + break; + } + if (found) + array[idx] = rec->ip; break; } } -- cgit v0.10.2 From 1fd8f2a3f9a91b287a876cef830b21baafc8a799 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Dec 2008 23:45:11 +0100 Subject: tracing/function-graph-tracer: handle ftrace_printk entries Handle the TRACE_PRINT entries from the function grapg tracer and output them as a C comment just below the function that called it, as if it was a comment inside this function. Example with an ftrace_printk inside might_sleep() function: void __might_sleep(char *file, int line) { static unsigned long prev_jiffy; /* ratelimiting */ ftrace_printk("Hi I'm a comment in might_sleep() :-)"); A chunk of a resulting trace: 0) | _reiserfs_free_block() { 0) | reiserfs_read_bitmap_block() { 0) | __bread() { 0) | __getblk() { 0) | __find_get_block() { 0) 0.698 us | mark_page_accessed(); 0) 2.267 us | } 0) | __might_sleep() { 0) | /* Hi I'm a comment in might_sleep() :-) */ 0) 1.321 us | } 0) 5.872 us | } 0) 7.313 us | } 0) 8.718 us | } And this patch brings two minor fixes: - The newline after a switch-out task has disappeared - The "|" sign just before the cpu number on task-switch has been deleted. 0) 0.616 us | pick_next_task_rt(); 0) 1.457 us | _spin_trylock(); 0) 0.653 us | _spin_unlock(); 0) 0.728 us | _spin_trylock(); 0) 0.631 us | _spin_unlock(); 0) 0.729 us | native_load_sp0(); 0) 0.593 us | native_load_tls(); ------------------------------------------ 0) cat-2834 => migrati-3 ------------------------------------------ 0) | finish_task_switch() { 0) 0.841 us | _spin_unlock_irq(); 0) 0.616 us | post_schedule_rt(); 0) 3.882 us | } Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8b6409a..1ca74c0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3335,7 +3335,7 @@ static int mark_printk(const char *fmt, ...) int ret; va_list args; va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); + ret = trace_vprintk(0, -1, fmt, args); va_end(args); return ret; } @@ -3564,9 +3564,16 @@ static __init int tracer_init_debugfs(void) return 0; } -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { - static DEFINE_SPINLOCK(trace_buf_lock); + /* + * Raw Spinlock because a normal spinlock would be traced here + * and append an irrelevant couple spin_lock_irqsave/ + * spin_unlock_irqrestore traced by ftrace around this + * TRACE_PRINTK trace. + */ + static raw_spinlock_t trace_buf_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ring_buffer_event *event; @@ -3587,7 +3594,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) if (unlikely(atomic_read(&data->disabled))) goto out; - spin_lock_irqsave(&trace_buf_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&trace_buf_lock); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); @@ -3601,13 +3609,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_PRINT; entry->ip = ip; + entry->depth = depth; memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); + __raw_spin_unlock(&trace_buf_lock); + local_irq_restore(flags); out: preempt_enable_notrace(); @@ -3625,7 +3635,13 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, fmt, ap); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + ret = trace_vprintk(ip, current->curr_ret_stack, fmt, ap); +#else + ret = trace_vprintk(ip, -1, fmt, ap); +#endif + va_end(ap); return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0565ae9..fce9889 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -117,6 +117,7 @@ struct userstack_entry { struct print_entry { struct trace_entry ent; unsigned long ip; + int depth; char buf[]; }; @@ -498,7 +499,8 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); -extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c66578f..32b7fb9 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -173,7 +173,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu) */ ret = trace_seq_printf(s, - "\n ------------------------------------------\n |"); + " ------------------------------------------\n"); if (!ret) TRACE_TYPE_PARTIAL_LINE; @@ -477,6 +477,71 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_HANDLED; } +static enum print_line_t +print_graph_comment(struct print_entry *trace, struct trace_seq *s, + struct trace_entry *ent, struct trace_iterator *iter) +{ + int i; + int ret; + + /* Pid */ + if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, iter->cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No time */ + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Indentation */ + if (trace->depth > 0) + for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* The comment */ + ret = trace_seq_printf(s, "/* %s", trace->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (ent->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + ret = trace_seq_printf(s, " */\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + enum print_line_t print_graph_function(struct trace_iterator *iter) { @@ -495,6 +560,11 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter->cpu); } + case TRACE_PRINT: { + struct print_entry *field; + trace_assign_type(field, entry); + return print_graph_comment(field, s, entry, iter); + } default: return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2a98a20..2fb6da6 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -366,5 +366,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map) int mmio_trace_printk(const char *fmt, va_list args) { - return trace_vprintk(0, fmt, args); + return trace_vprintk(0, -1, fmt, args); } -- cgit v0.10.2 From ff32504fdc56407654584ef187b20022c94a3486 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Dec 2008 23:47:35 +0100 Subject: tracing/ftrace: don't insert TRACE_PRINT during selftests Impact: fix tracer selfstests false results After setting a ftrace_printk somewhere in th kernel, I saw the Function tracer selftest failing. When a selftest occurs, the ring buffer is lurked to see if some entries were inserted. But concurrent insertion such as ftrace_printk could occured at the same time and could give false positive or negative results. This patch prevent prevent from TRACE_PRINT entries insertion during selftests. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ea38652..5dca6ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -44,6 +44,14 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +/* We need to change this state when a selftest is running. + * A selftest will lurk into the ring-buffer to count the + * entries inserted during the selftest although some concurrent + * insertions into the ring-buffer such as ftrace_printk could occurred + * at the same time, giving false positive or negative results. + */ +static atomic_t tracing_selftest_running = ATOMIC_INIT(0); + /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } @@ -589,6 +597,8 @@ int register_tracer(struct tracer *type) struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; int i; + + atomic_set(&tracing_selftest_running, 1); /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current @@ -603,6 +613,7 @@ int register_tracer(struct tracer *type) /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); + atomic_set(&tracing_selftest_running, 0); /* the test is responsible for resetting too */ current_trace = saved_tracer; if (ret) { @@ -3594,7 +3605,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) unsigned long flags, irq_flags; int cpu, len = 0, size, pc; - if (tracing_disabled) + if (tracing_disabled || atomic_read(&tracing_selftest_running)) return 0; pc = preempt_count(); -- cgit v0.10.2 From 77d683f3e0258d522c5506e7b5fd05c9411184d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Dec 2008 23:49:47 +0100 Subject: tracing/ftrace: fix the check of ftrace_trace_task Impact: fix default empty traces on function-graph-tracer The actual ftrace_trace_task() checks if ftrace_pid_trace is allocated and return 1 if it is true. If it is NULL, it will check the bit of pid tracing flag for the current task (which are not set by default). So by default, a task is not traced. Actually all tasks should be traced by default and filter_by_pid when ftrace_pid_trace is allocated. The appropriate condition should be to return 1 if filter_by_pid is set. Signed-off-by: Frederic Weisbecker Acke-dby: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a71bbe0..5ac6970 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -551,7 +551,7 @@ extern struct pid *ftrace_pid_trace; static inline int ftrace_trace_task(struct task_struct *task) { - if (ftrace_pid_trace) + if (!ftrace_pid_trace) return 1; return test_tsk_trace_trace(task); -- cgit v0.10.2 From 21a8c466f99063eeb8567318b4e305eda9015408 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Dec 2008 23:51:23 +0100 Subject: tracing/ftrace: provide the macro task_curr_ret_stack() Impact: cleanup As suggested by Steven Rostedt, this patch provide a new macro task_curr_ret_stack() to move the cpp conditionnal CONFIG into the linux/ftrace.h headers. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b295d31..b9b4d0a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_FUNCTION_TRACER @@ -387,9 +388,19 @@ extern void unregister_ftrace_graph(void); extern void ftrace_graph_init_task(struct task_struct *t); extern void ftrace_graph_exit_task(struct task_struct *t); + +static inline int task_curr_ret_stack(struct task_struct *t) +{ + return t->curr_ret_stack; +} #else static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } + +static inline int task_curr_ret_stack(struct task_struct *tsk) +{ + return -1; +} #endif #ifdef CONFIG_TRACING diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5dca6ef..7a93c66 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3657,13 +3657,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - ret = trace_vprintk(ip, current->curr_ret_stack, fmt, ap); -#else - ret = trace_vprintk(ip, -1, fmt, ap); -#endif - + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); va_end(ap); return ret; } -- cgit v0.10.2 From 21bbecdaaef3a6acc19905ab88c0587817318870 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Dec 2008 23:30:56 -0500 Subject: ftrace: use init_struct_pid as swapper pid Impact: clean up Using (struct pid *)-1 as the pointer for ftrace_swapper_pid is a little confusing for others. This patch uses the address of the actual init pid structure instead. This change is only for clarity. It does not affect the code itself. Hopefully soon the swapper tasks will all have their own pid structure and then we can clean up the code a bit more. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d2b1565..2971fe4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -49,7 +49,7 @@ static int last_ftrace_enabled; /* set when tracing only a pid */ struct pid *ftrace_pid_trace; -static struct pid * const ftrace_swapper_pid = (struct pid *)1; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; /* Quick disabling of function tracer. */ int function_trace_stop; -- cgit v0.10.2 From decbec3838d10ecd7aabdb4c0e05aac0e5f5dc0c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 8 Dec 2008 01:56:06 +0100 Subject: tracing/function-graph-tracer: implement a print_headers function Impact: provide trace headers to explain a bit the output This patch implements the print_headers callback for the function graph tracer. These headers are output according to the current trace options. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 32b7fb9..af60eef 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -570,11 +570,36 @@ print_graph_function(struct trace_iterator *iter) } } +static void print_graph_headers(struct seq_file *s) +{ + /* 1st line */ + seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, "CPU "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, "TASK/PID "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) + seq_printf(s, "OVERHEAD/"); + seq_printf(s, "DURATION FUNCTION CALLS\n"); + + /* 2nd line */ + seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, "| "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, "| | "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + seq_printf(s, "| "); + seq_printf(s, "| | | | |\n"); + } else + seq_printf(s, " | | | | |\n"); +} static struct tracer graph_trace __read_mostly = { - .name = "function_graph", - .init = graph_trace_init, - .reset = graph_trace_reset, - .print_line = print_graph_function, + .name = "function_graph", + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .print_header = print_graph_headers, .flags = &tracer_flags, }; -- cgit v0.10.2 From 361b73d5c34f59c3fd107bb9dbe7a1fbff2c2517 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Dec 2008 10:58:08 +0800 Subject: ring_buffer: fix comments Impact: comments cleanup fix incorrect comments for enum ring_buffer_type Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 1a350a8..d363467 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -28,17 +28,19 @@ struct ring_buffer_event { * size = 8 bytes * * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock - * array[0] = tv_nsec - * array[1] = tv_sec + * array[0] = tv_nsec + * array[1..2] = tv_sec * size = 16 bytes * * @RINGBUF_TYPE_DATA: Data record * If len is zero: * array[0] holds the actual length - * array[1..(length+3)/4-1] holds data + * array[1..(length+3)/4] holds data + * size = 4 + 4 + length (bytes) * else * length = len << 2 - * array[0..(length+3)/4] holds data + * array[0..(length+3)/4-1] holds data + * size = 4 + length (bytes) */ enum ring_buffer_type { RINGBUF_TYPE_PADDING, -- cgit v0.10.2 From 8b96f0119818964e4944fd1c423bf6770027d3ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Dec 2008 03:40:00 +0100 Subject: tracing/function-graph-tracer: introduce __notrace_funcgraph to filter special functions Impact: trace more functions When the function graph tracer is configured, three more files are not traced to prevent only four functions to be traced. And this impacts the normal function tracer too. arch/x86/kernel/process_64/32.c: I had crashes when I let this file traced. After some debugging, I saw that the "current" task point was changed inside__swtich_to(), ie: "write_pda(pcurrent, next_p);" inside process_64.c Since the tracer store the original return address of the function inside current, we had crashes. Only __switch_to() has to be excluded from tracing. kernel/module.c and kernel/extable.c: Because of a function used internally by the function graph tracer: __kernel_text_address() To let the other functions inside these files to be traced, this patch introduces the __notrace_funcgraph function prefix which is __notrace if function graph tracer is configured and nothing if not. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a3049da..1cad931 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,12 +14,6 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif -ifdef CONFIG_FUNCTION_GRAPH_TRACER -# Don't trace __switch_to() but let it for function tracer -CFLAGS_REMOVE_process_32.o = -pg -CFLAGS_REMOVE_process_64.o = -pg -endif - # # vsyscalls (which work on the user stack) should have # no stack-protector checks: diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0a1302f..24c2276 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c958120f..fbb321d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. */ -struct task_struct * +__notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b9b4d0a..449fa8e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -369,6 +369,14 @@ struct ftrace_graph_ret { }; #ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* + * Sometimes we don't want to trace a function with the function + * graph tracer but we want them to keep traced by the usual function + * tracer if the function graph tracer is not configured. + */ +#define __notrace_funcgraph notrace + #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of the callback handlers for tracing function graph*/ @@ -394,6 +402,9 @@ static inline int task_curr_ret_stack(struct task_struct *t) return t->curr_ret_stack; } #else + +#define __notrace_funcgraph + static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } diff --git a/kernel/Makefile b/kernel/Makefile index 703cf3b..19fad00 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,10 +21,6 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -pg endif -ifdef CONFIG_FUNCTION_GRAPH_TRACER -CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() -CFLAGS_REMOVE_module.o = -pg # For __module_text_address() -endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o diff --git a/kernel/extable.c b/kernel/extable.c index a26cb2e..feb0317 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -17,6 +17,7 @@ */ #include #include +#include #include #include @@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -int core_kernel_text(unsigned long addr) +__notrace_funcgraph int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr) return 0; } -int __kernel_text_address(unsigned long addr) +__notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; diff --git a/kernel/module.c b/kernel/module.c index 89bcf7c..dd2a541 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2704,7 +2704,7 @@ int is_module_address(unsigned long addr) /* Is this a valid kernel address? */ -struct module *__module_text_address(unsigned long addr) +__notrace_funcgraph struct module *__module_text_address(unsigned long addr) { struct module *mod; -- cgit v0.10.2 From 8e1b82e0866befaa0b2920be296c6e4c3fc7f422 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Dec 2008 03:41:33 +0100 Subject: tracing/function-graph-tracer: turn tracing_selftest_running into an int Impact: cleanup Apply some suggestions of Steven Rostedt: _turn tracing_selftest_running into a simple int (no need of an atomic_t) _set it __read_mostly _fix a comment style Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7a93c66..3354953 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -44,13 +44,14 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; -/* We need to change this state when a selftest is running. +/* + * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent * insertions into the ring-buffer such as ftrace_printk could occurred * at the same time, giving false positive or negative results. */ -static atomic_t tracing_selftest_running = ATOMIC_INIT(0); +static bool __read_mostly tracing_selftest_running; /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { @@ -574,6 +575,8 @@ int register_tracer(struct tracer *type) unlock_kernel(); mutex_lock(&trace_types_lock); + tracing_selftest_running = true; + for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -598,7 +601,6 @@ int register_tracer(struct tracer *type) struct trace_array *tr = &global_trace; int i; - atomic_set(&tracing_selftest_running, 1); /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current @@ -613,7 +615,6 @@ int register_tracer(struct tracer *type) /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); - atomic_set(&tracing_selftest_running, 0); /* the test is responsible for resetting too */ current_trace = saved_tracer; if (ret) { @@ -635,6 +636,7 @@ int register_tracer(struct tracer *type) max_tracer_type_len = len; out: + tracing_selftest_running = false; mutex_unlock(&trace_types_lock); lock_kernel(); @@ -3605,7 +3607,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) unsigned long flags, irq_flags; int cpu, len = 0, size, pc; - if (tracing_disabled || atomic_read(&tracing_selftest_running)) + if (tracing_disabled || tracing_selftest_running) return 0; pc = preempt_count(); -- cgit v0.10.2 From 380c4b1411ccd6885f92b2c8ceb08433a720f44e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Dec 2008 03:43:41 +0100 Subject: tracing/function-graph-tracer: append the tracing_graph_flag Impact: Provide a way to pause the function graph tracer As suggested by Steven Rostedt, the previous patch that prevented from spinlock function tracing shouldn't use the raw_spinlock to fix it. It's much better to follow lockdep with normal spinlock, so this patch adds a new flag for each task to make the function graph tracer able to be paused. We also can send an ftrace_printk whithout worrying of the irrelevant traced spinlock during insertion. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index f98c407..1b43086 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -476,7 +476,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) &return_to_handler; /* Nmi's are currently unsupported */ - if (atomic_read(&in_nmi)) + if (unlikely(atomic_read(&in_nmi))) + return; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 449fa8e..11cac81 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -401,6 +401,16 @@ static inline int task_curr_ret_stack(struct task_struct *t) { return t->curr_ret_stack; } + +static inline void pause_graph_tracing(void) +{ + atomic_inc(¤t->tracing_graph_pause); +} + +static inline void unpause_graph_tracing(void) +{ + atomic_dec(¤t->tracing_graph_pause); +} #else #define __notrace_funcgraph @@ -412,6 +422,9 @@ static inline int task_curr_ret_stack(struct task_struct *tsk) { return -1; } + +static inline void pause_graph_tracing(void) { } +static inline void unpause_graph_tracing(void) { } #endif #ifdef CONFIG_TRACING diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c152e0..4b81fc5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1379,6 +1379,8 @@ struct task_struct { * because of depth overrun. */ atomic_t trace_overrun; + /* Pause for the tracing */ + atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* state flags for use by tracers */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2971fe4..a12f80e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1998,6 +1998,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) /* Make sure IRQs see the -1 first: */ barrier(); t->ret_stack = ret_stack_list[start++]; + atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); } } while_each_thread(g, t); @@ -2077,6 +2078,7 @@ void ftrace_graph_init_task(struct task_struct *t) if (!t->ret_stack) return; t->curr_ret_stack = -1; + atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); } else t->ret_stack = NULL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3354953..0b8659b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3590,14 +3590,7 @@ static __init int tracer_init_debugfs(void) int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { - /* - * Raw Spinlock because a normal spinlock would be traced here - * and append an irrelevant couple spin_lock_irqsave/ - * spin_unlock_irqrestore traced by ftrace around this - * TRACE_PRINTK trace. - */ - static raw_spinlock_t trace_buf_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(trace_buf_lock); static char trace_buf[TRACE_BUF_SIZE]; struct ring_buffer_event *event; @@ -3618,8 +3611,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) if (unlikely(atomic_read(&data->disabled))) goto out; - local_irq_save(flags); - __raw_spin_lock(&trace_buf_lock); + pause_graph_tracing(); + spin_lock_irqsave(&trace_buf_lock, irq_flags); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); @@ -3640,9 +3633,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) ring_buffer_unlock_commit(tr->buffer, event, irq_flags); out_unlock: - __raw_spin_unlock(&trace_buf_lock); - local_irq_restore(flags); - + spin_unlock_irqrestore(&trace_buf_lock, irq_flags); + unpause_graph_tracing(); out: preempt_enable_notrace(); -- cgit v0.10.2 From e726f5f91effd8944c76475a2688093a03ba0d10 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Dec 2008 16:55:53 +0100 Subject: tracing/function-graph-tracer: fix 'flags' variable mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/trace/trace.c: In function ‘trace_vprintk’: kernel/trace/trace.c:3626: warning: ‘flags’ may be used uninitialized in this function shows some confusion about irq_flags / flags use here. We already have irq_flags so remove the extra flags variable. Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0b8659b..8ebe007 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3596,9 +3596,9 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct print_entry *entry; - unsigned long flags, irq_flags; int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; if (tracing_disabled || tracing_selftest_running) return 0; @@ -3623,7 +3623,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); + tracing_generic_entry_update(&entry->ent, irq_flags, pc); entry->ent.type = TRACE_PRINT; entry->ip = ip; entry->depth = depth; -- cgit v0.10.2 From b0884e25fe361f2ca228808fb5fd1b74cb04e711 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:45:23 +0100 Subject: x86, bts: turn BUG_ON into WARN_ON_ONCE Impact: make the ds code more debuggable Turn BUG_ON's into WARN_ON_ONCE. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 19a8c2c..0953069 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -452,7 +452,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) { - BUG_ON(tracer->context->owner[qual] != tracer); + WARN_ON_ONCE(tracer->context->owner[qual] != tracer); tracer->context->owner[qual] = NULL; put_tracer(tracer->context->task); @@ -774,7 +774,7 @@ ds_configure(const struct ds_configuration *cfg) printk(KERN_INFO "DS available\n"); - BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); + WARN_ON_ONCE(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2c8ec1b..b2998fe 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -878,7 +878,8 @@ static int ptrace_bts_write_record(struct task_struct *child, { unsigned char bts_record[BTS_MAX_RECORD_SIZE]; - BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); + if (BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts) + return -EOVERFLOW; memset(bts_record, 0, bts_cfg.sizeof_bts); switch (in->qualifier) { @@ -1133,7 +1134,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) ret = ds_get_bts_index(child->bts, &size); if (ret == 0) { - BUG_ON(size != (int) size); + WARN_ON_ONCE(size != (int) size); ret = (int) size; } break; -- cgit v0.10.2 From c2724775ce57c98b8af9694857b941dc61056516 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:49:59 +0100 Subject: x86, bts: provide in-kernel branch-trace interface Impact: cleanup Move the BTS bits from ptrace.c into ds.c. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 99b6c39..ee0ea3a 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -6,13 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS + * - DS and BTS hardware configuration * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all traced tasks - * - current is allowed to trace tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -31,6 +31,7 @@ #ifdef CONFIG_X86_DS struct task_struct; +struct ds_context; struct ds_tracer; struct bts_tracer; struct pebs_tracer; @@ -38,6 +39,38 @@ struct pebs_tracer; typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); + +/* + * A list of features plus corresponding macros to talk about them in + * the ds_request function's flags parameter. + * + * We use the enum to index an array of corresponding control bits; + * we use the macro to index a flags bit-vector. + */ +enum ds_feature { + dsf_bts = 0, + dsf_bts_kernel, +#define BTS_KERNEL (1 << dsf_bts_kernel) + /* trace kernel-mode branches */ + + dsf_bts_user, +#define BTS_USER (1 << dsf_bts_user) + /* trace user-mode branches */ + + dsf_bts_overflow, + dsf_bts_max, + dsf_pebs = dsf_bts_max, + + dsf_pebs_max, + dsf_ctl_max = dsf_pebs_max, + dsf_bts_timestamps = dsf_ctl_max, +#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps) + /* add timestamps into BTS trace */ + +#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS) +}; + + /* * Request BTS or PEBS * @@ -58,92 +91,135 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); * NULL if cyclic buffer requested * th: the interrupt threshold in records from the end of the buffer; * -1 if no interrupt threshold is requested. + * flags: a bit-mask of the above flags */ extern struct bts_tracer *ds_request_bts(struct task_struct *task, void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th); + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, void *base, size_t size, pebs_ovfl_callback_t ovfl, - size_t th); + size_t th, unsigned int flags); /* * Release BTS or PEBS resources - * - * Returns 0 on success; -Eerrno otherwise + * Suspend and resume BTS or PEBS tracing * * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_release_bts(struct bts_tracer *tracer); -extern int ds_release_pebs(struct pebs_tracer *tracer); +extern void ds_release_bts(struct bts_tracer *tracer); +extern void ds_suspend_bts(struct bts_tracer *tracer); +extern void ds_resume_bts(struct bts_tracer *tracer); +extern void ds_release_pebs(struct pebs_tracer *tracer); +extern void ds_suspend_pebs(struct pebs_tracer *tracer); +extern void ds_resume_pebs(struct pebs_tracer *tracer); + /* - * Get the (array) index of the write pointer. - * (assuming an array of BTS/PEBS records) - * - * Returns 0 on success; -Eerrno on error + * The raw DS buffer state as it is used for BTS and PEBS recording. * - * tracer: the tracer handle returned from ds_request_~() - * pos (out): will hold the result + * This is the low-level, arch-dependent interface for working + * directly on the raw trace data. */ -extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); -extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos); +struct ds_trace { + /* the number of bts/pebs records */ + size_t n; + /* the size of a bts/pebs record in bytes */ + size_t size; + /* pointers into the raw buffer: + - to the first entry */ + void *begin; + /* - one beyond the last entry */ + void *end; + /* - one beyond the newest entry */ + void *top; + /* - the interrupt threshold */ + void *ith; + /* flags given on ds_request() */ + unsigned int flags; +}; /* - * Get the (array) index one record beyond the end of the array. - * (assuming an array of BTS/PEBS records) - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - * pos (out): will hold the result + * An arch-independent view on branch trace data. */ -extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); -extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos); +enum bts_qualifier { + bts_invalid, +#define BTS_INVALID bts_invalid + + bts_branch, +#define BTS_BRANCH bts_branch + + bts_task_arrives, +#define BTS_TASK_ARRIVES bts_task_arrives + + bts_task_departs, +#define BTS_TASK_DEPARTS bts_task_departs + + bts_qual_bit_size = 4, + bts_qual_max = (1 << bts_qual_bit_size), +}; + +struct bts_struct { + __u64 qualifier; + union { + /* BTS_BRANCH */ + struct { + __u64 from; + __u64 to; + } lbr; + /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ + struct { + __u64 jiffies; + pid_t pid; + } timestamp; + } variant; +}; + /* - * Provide a pointer to the BTS/PEBS record at parameter index. - * (assuming an array of BTS/PEBS records) - * - * The pointer points directly into the buffer. The user is - * responsible for copying the record. - * - * Returns the size of a single record on success; -Eerrno on error + * The BTS state. * - * tracer: the tracer handle returned from ds_request_~() - * index: the index of the requested record - * record (out): pointer to the requested record + * This gives access to the raw DS state and adds functions to provide + * an arch-independent view of the BTS data. */ -extern int ds_access_bts(struct bts_tracer *tracer, - size_t index, const void **record); -extern int ds_access_pebs(struct pebs_tracer *tracer, - size_t index, const void **record); +struct bts_trace { + struct ds_trace ds; + + int (*read)(struct bts_tracer *tracer, const void *at, + struct bts_struct *out); + int (*write)(struct bts_tracer *tracer, const struct bts_struct *in); +}; + /* - * Write one or more BTS/PEBS records at the write pointer index and - * advance the write pointer. + * The PEBS state. * - * If size is not a multiple of the record size, trailing bytes are - * zeroed out. - * - * May result in one or more overflow notifications. - * - * If called during overflow handling, that is, with index >= - * interrupt threshold, the write will wrap around. + * This gives access to the raw DS state and the PEBS-specific counter + * reset value. + */ +struct pebs_trace { + struct ds_trace ds; + + /* the PEBS reset value */ + unsigned long long reset_value; +}; + + +/* + * Read the BTS or PEBS trace. * - * An overflow notification is given if and when the interrupt - * threshold is reached during or after the write. + * Returns a view on the trace collected for the parameter tracer. * - * Returns the number of bytes written or -Eerrno. + * The view remains valid as long as the traced task is not running or + * the tracer is suspended. + * Writes into the trace buffer are not reflected. * * tracer: the tracer handle returned from ds_request_~() - * buffer: the buffer to write - * size: the size of the buffer */ -extern int ds_write_bts(struct bts_tracer *tracer, - const void *buffer, size_t size); -extern int ds_write_pebs(struct pebs_tracer *tracer, - const void *buffer, size_t size); +extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer); +extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer); + /* * Reset the write pointer of the BTS/PEBS buffer. @@ -156,27 +232,6 @@ extern int ds_reset_bts(struct bts_tracer *tracer); extern int ds_reset_pebs(struct pebs_tracer *tracer); /* - * Clear the BTS/PEBS buffer and reset the write pointer. - * The entire buffer will be zeroed out. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern int ds_clear_bts(struct bts_tracer *tracer); -extern int ds_clear_pebs(struct pebs_tracer *tracer); - -/* - * Provide the PEBS counter reset value. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_pebs() - * value (out): the counter reset value - */ -extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value); - -/* * Set the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error @@ -192,35 +247,17 @@ extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); struct cpuinfo_x86; extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); - - /* - * The DS context - part of struct thread_struct. + * Context switch work */ -#define MAX_SIZEOF_DS (12 * 8) - -struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ - struct ds_tracer *owner[2]; - /* use count */ - unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ - struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ - struct task_struct *task; -}; - -/* called by exit_thread() to free leftover contexts */ -extern void ds_free(struct ds_context *context); +extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} +static inline void ds_switch_to(struct task_struct *prev, + struct task_struct *next) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e3..aa5914f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -752,6 +752,19 @@ extern void switch_to_new_gdt(void); extern void cpu_init(void); extern void init_gdt(int cpu); +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index eefb059..fbf7442 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -6,7 +6,6 @@ #include #ifdef __KERNEL__ -#include /* the DS BTS struct is used for ptrace too */ #include #endif @@ -128,34 +127,6 @@ struct pt_regs { #endif /* !__i386__ */ -#ifdef CONFIG_X86_PTRACE_BTS -/* a branch trace record entry - * - * In order to unify the interface between various processor versions, - * we use the below data structure for all processors. - */ -enum bts_qualifier { - BTS_INVALID = 0, - BTS_BRANCH, - BTS_TASK_ARRIVES, - BTS_TASK_DEPARTS -}; - -struct bts_struct { - __u64 qualifier; - union { - /* BTS_BRANCH */ - struct { - __u64 from_ip; - __u64 to_ip; - } lbr; - /* BTS_TASK_ARRIVES or - BTS_TASK_DEPARTS */ - __u64 jiffies; - } variant; -}; -#endif /* CONFIG_X86_PTRACE_BTS */ - #ifdef __KERNEL__ #include @@ -163,13 +134,6 @@ struct bts_struct { struct cpuinfo_x86; struct task_struct; -#ifdef CONFIG_X86_PTRACE_BTS -extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *); -extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier); -#else -#define ptrace_bts_init_intel(config) do {} while (0) -#endif /* CONFIG_X86_PTRACE_BTS */ - extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0921b40..bf8113d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -93,7 +93,6 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ -#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -115,7 +114,6 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) -#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -141,8 +139,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ - _TIF_NOTSC) + (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 816f27f..cd413d9 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -309,9 +308,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P3); #endif - if (cpu_has_bts) - ptrace_bts_init_intel(c); - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 0953069..f058300 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -6,13 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS + * - DS and BTS hardware configuration * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all traced tasks - * - current is allowed to trace tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -34,15 +34,30 @@ * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the size of the DS structure in bytes */ - unsigned char sizeof_ds; - /* the size of one pointer-typed field in the DS structure in bytes; - this covers the first 8 fields related to buffer management. */ + /* the name of the configuration */ + const char *name; + /* the size of one pointer-typed field in the DS structure and + in the BTS and PEBS buffers in bytes; + this covers the first 8 DS fields related to buffer management. */ unsigned char sizeof_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; + /* a series of bit-masks to control various features indexed + * by enum ds_feature */ + unsigned long ctl[dsf_ctl_max]; }; -static struct ds_configuration ds_cfg; +static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); + +#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) + +#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ +#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + +#define BTS_CONTROL \ + (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ + ds_cfg.ctl[dsf_bts_overflow]) + /* * A BTS or PEBS tracer. @@ -61,6 +76,8 @@ struct ds_tracer { struct bts_tracer { /* the common DS part */ struct ds_tracer ds; + /* the trace including the DS configuration */ + struct bts_trace trace; /* buffer overflow notification function */ bts_ovfl_callback_t ovfl; }; @@ -68,6 +85,8 @@ struct bts_tracer { struct pebs_tracer { /* the common DS part */ struct ds_tracer ds; + /* the trace including the DS configuration */ + struct pebs_trace trace; /* buffer overflow notification function */ pebs_ovfl_callback_t ovfl; }; @@ -134,13 +153,11 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, (*(unsigned long *)base) = value; } -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ - /* * Locking is done only for allocating BTS or PEBS resources. */ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); +static DEFINE_SPINLOCK(ds_lock); /* @@ -156,27 +173,32 @@ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); * >0 number of per-thread tracers * <0 number of per-cpu tracers * - * The below functions to get and put tracers and to check the - * allocation type require the ds_lock to be held by the caller. - * * Tracers essentially gives the number of ds contexts for a certain * type of allocation. */ -static long tracers; +static atomic_t tracers = ATOMIC_INIT(0); static inline void get_tracer(struct task_struct *task) { - tracers += (task ? 1 : -1); + if (task) + atomic_inc(&tracers); + else + atomic_dec(&tracers); } static inline void put_tracer(struct task_struct *task) { - tracers -= (task ? 1 : -1); + if (task) + atomic_dec(&tracers); + else + atomic_inc(&tracers); } static inline int check_tracer(struct task_struct *task) { - return (task ? (tracers >= 0) : (tracers <= 0)); + return task ? + (atomic_read(&tracers) >= 0) : + (atomic_read(&tracers) <= 0); } @@ -190,14 +212,30 @@ static inline int check_tracer(struct task_struct *task) * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. */ -static DEFINE_PER_CPU(struct ds_context *, system_context); +struct ds_context { + /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + unsigned char ds[MAX_SIZEOF_DS]; + /* the owner of the BTS and PEBS configuration, respectively */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + /* use count */ + unsigned long count; + /* a pointer to the context location inside the thread_struct + * or the per_cpu context array */ + struct ds_context **this; + /* a pointer to the task owning this context, or NULL, if the + * context is owned by a cpu */ + struct task_struct *task; +}; + +static DEFINE_PER_CPU(struct ds_context *, system_context_array); -#define this_system_context per_cpu(system_context, smp_processor_id()) +#define system_context per_cpu(system_context_array, smp_processor_id()) static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &this_system_context); + (task ? &task->thread.ds_ctx : &system_context); struct ds_context *context = *p_context; unsigned long irq; @@ -225,10 +263,22 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); } + + context->count++; + + spin_unlock_irqrestore(&ds_lock, irq); + } else { + spin_lock_irqsave(&ds_lock, irq); + + context = *p_context; + if (context) + context->count++; + spin_unlock_irqrestore(&ds_lock, irq); - } - context->count++; + if (!context) + context = ds_get_context(task); + } return context; } @@ -242,8 +292,10 @@ static inline void ds_put_context(struct ds_context *context) spin_lock_irqsave(&ds_lock, irq); - if (--context->count) - goto out; + if (--context->count) { + spin_unlock_irqrestore(&ds_lock, irq); + return; + } *(context->this) = NULL; @@ -253,14 +305,14 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - kfree(context); - out: spin_unlock_irqrestore(&ds_lock, irq); + + kfree(context); } /* - * Handle a buffer overflow + * Call the tracer's callback on a buffer overflow. * * context: the ds context * qual: the buffer type @@ -268,30 +320,244 @@ static inline void ds_put_context(struct ds_context *context) static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) { switch (qual) { - case ds_bts: { - struct bts_tracer *tracer = - container_of(context->owner[qual], - struct bts_tracer, ds); - if (tracer->ovfl) - tracer->ovfl(tracer); - } + case ds_bts: + if (context->bts_master && + context->bts_master->ovfl) + context->bts_master->ovfl(context->bts_master); + break; + case ds_pebs: + if (context->pebs_master && + context->pebs_master->ovfl) + context->pebs_master->ovfl(context->pebs_master); break; - case ds_pebs: { - struct pebs_tracer *tracer = - container_of(context->owner[qual], - struct pebs_tracer, ds); - if (tracer->ovfl) - tracer->ovfl(tracer); } +} + + +/* + * Write raw data into the BTS or PEBS buffer. + * + * The remainder of any partially written record is zeroed out. + * + * context: the DS context + * qual: the buffer type + * record: the data to write + * size: the size of the data + */ +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) +{ + int bytes_written = 0; + + if (!record) + return -EINVAL; + + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; + + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + + write_end = min(end, int_th); + + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; + + if (write_end <= index) + break; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); + + record = (const char *)record + write_size; + size -= write_size; + bytes_written += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(context, qual); + } + + return bytes_written; +} + + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + * + * We use two levels of abstraction: + * - the raw data level defined here + * - an arch-independent level defined in ds.h + */ + +enum bts_field { + bts_from, + bts_to, + bts_flags, + + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, + + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) +{ + base += (ds_cfg.sizeof_field * field); + return *(unsigned long *)base; +} + +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (ds_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; +} + + +/* + * The raw BTS data is architecture dependent. + * + * For higher-level users, we give an arch-independent view. + * - ds.h defines struct bts_struct + * - bts_read translates one raw bts record into a bts_struct + * - bts_write translates one bts_struct into the raw format and + * writes it into the top of the parameter tracer's buffer. + * + * return: bytes read/written on success; -Eerrno, otherwise + */ +static int bts_read(struct bts_tracer *tracer, const void *at, + struct bts_struct *out) +{ + if (!tracer) + return -EINVAL; + + if (at < tracer->trace.ds.begin) + return -EINVAL; + + if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) + return -EINVAL; + + memset(out, 0, sizeof(*out)); + if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { + out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); + out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); + out->variant.timestamp.pid = bts_get(at, bts_pid); + } else { + out->qualifier = bts_branch; + out->variant.lbr.from = bts_get(at, bts_from); + out->variant.lbr.to = bts_get(at, bts_to); + } + + return ds_cfg.sizeof_rec[ds_bts]; +} + +static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) +{ + unsigned char raw[MAX_SIZEOF_BTS]; + + if (!tracer) + return -EINVAL; + + if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) + return -EOVERFLOW; + + switch (in->qualifier) { + case bts_invalid: + bts_set(raw, bts_from, 0); + bts_set(raw, bts_to, 0); + bts_set(raw, bts_flags, 0); + break; + case bts_branch: + bts_set(raw, bts_from, in->variant.lbr.from); + bts_set(raw, bts_to, in->variant.lbr.to); + bts_set(raw, bts_flags, 0); + break; + case bts_task_arrives: + case bts_task_departs: + bts_set(raw, bts_qual, (bts_escape | in->qualifier)); + bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); + bts_set(raw, bts_pid, in->variant.timestamp.pid); break; + default: + return -EINVAL; } + + return ds_write(tracer->ds.context, ds_bts, raw, + ds_cfg.sizeof_rec[ds_bts]); } -static void ds_install_ds_config(struct ds_context *context, - enum ds_qualifier qual, - void *base, size_t size, size_t ith) +static void ds_write_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; + + ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); + ds_set(ds, qual, ds_index, (unsigned long)cfg->top); + ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); + ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); +} + +static void ds_read_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) { + unsigned char *ds = context->ds; + + cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); + cfg->top = (void *)ds_get(ds, qual, ds_index); + cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum); + cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); +} + +static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, + void *base, size_t size, size_t ith, + unsigned int flags) { unsigned long buffer, adj; /* adjust the buffer address and size to meet alignment @@ -308,32 +574,30 @@ static void ds_install_ds_config(struct ds_context *context, buffer += adj; size -= adj; - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; + trace->n = size / ds_cfg.sizeof_rec[qual]; + trace->size = ds_cfg.sizeof_rec[qual]; - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + size = (trace->n * trace->size); + trace->begin = (void *)buffer; + trace->top = trace->begin; + trace->end = (void *)(buffer + size); /* The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size - ith); + trace->ith = (void *)(buffer + size - ith); + + trace->flags = flags; } -static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, - struct task_struct *task, - void *base, size_t size, size_t th) + +static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, + enum ds_qualifier qual, struct task_struct *task, + void *base, size_t size, size_t th, unsigned int flags) { struct ds_context *context; - unsigned long irq; int error; - error = -EOPNOTSUPP; - if (!ds_cfg.sizeof_ds) - goto out; - error = -EINVAL; if (!base) goto out; @@ -360,43 +624,26 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, goto out; tracer->context = context; + ds_init_ds_trace(trace, qual, base, size, th, flags); - spin_lock_irqsave(&ds_lock, irq); - - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - - error = -EPERM; - if (context->owner[qual]) - goto out_put_tracer; - context->owner[qual] = tracer; - - spin_unlock_irqrestore(&ds_lock, irq); - - - ds_install_ds_config(context, qual, base, size, th); - - return 0; - - out_put_tracer: - put_tracer(task); - out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - tracer->context = NULL; + error = 0; out: return error; } struct bts_tracer *ds_request_bts(struct task_struct *task, void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th) + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; + unsigned long irq; int error; + error = -EOPNOTSUPP; + if (!ds_cfg.ctl[dsf_bts]) + goto out; + /* buffer overflow notification is not yet implemented */ error = -EOPNOTSUPP; if (ovfl) @@ -408,12 +655,40 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, goto out; tracer->ovfl = ovfl; - error = ds_request(&tracer->ds, ds_bts, task, base, size, th); + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_bts, task, base, size, th, flags); if (error < 0) goto out_tracer; + + spin_lock_irqsave(&ds_lock, irq); + + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); + + error = -EPERM; + if (tracer->ds.context->bts_master) + goto out_put_tracer; + tracer->ds.context->bts_master = tracer; + + spin_unlock_irqrestore(&ds_lock, irq); + + + tracer->trace.read = bts_read; + tracer->trace.write = bts_write; + + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_bts(tracer); + return tracer; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); out: @@ -422,9 +697,11 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, struct pebs_tracer *ds_request_pebs(struct task_struct *task, void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th) + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; + unsigned long irq; int error; /* buffer overflow notification is not yet implemented */ @@ -438,300 +715,171 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, goto out; tracer->ovfl = ovfl; - error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_pebs, task, base, size, th, flags); if (error < 0) goto out_tracer; + spin_lock_irqsave(&ds_lock, irq); + + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); + + error = -EPERM; + if (tracer->ds.context->pebs_master) + goto out_put_tracer; + tracer->ds.context->pebs_master = tracer; + + spin_unlock_irqrestore(&ds_lock, irq); + + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_pebs(tracer); + return tracer; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); out: return ERR_PTR(error); } -static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) -{ - WARN_ON_ONCE(tracer->context->owner[qual] != tracer); - tracer->context->owner[qual] = NULL; - - put_tracer(tracer->context->task); - ds_put_context(tracer->context); -} - -int ds_release_bts(struct bts_tracer *tracer) +void ds_release_bts(struct bts_tracer *tracer) { if (!tracer) - return -EINVAL; + return; - ds_release(&tracer->ds, ds_bts); - kfree(tracer); + ds_suspend_bts(tracer); - return 0; -} + WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); + tracer->ds.context->bts_master = NULL; -int ds_release_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); - ds_release(&tracer->ds, ds_pebs); kfree(tracer); - - return 0; -} - -static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) -{ - unsigned long base, index; - - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - - return (index - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos) +void ds_suspend_bts(struct bts_tracer *tracer) { - if (!tracer) - return -EINVAL; + struct task_struct *task; - if (!pos) - return -EINVAL; - - *pos = ds_get_index(tracer->ds.context, ds_bts); - - return 0; -} - -int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) -{ if (!tracer) - return -EINVAL; + return; - if (!pos) - return -EINVAL; + task = tracer->ds.context->task; - *pos = ds_get_index(tracer->ds.context, ds_pebs); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); - return 0; -} + if (task) { + task->thread.debugctlmsr &= ~BTS_CONTROL; -static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) -{ - unsigned long base, max; - - base = ds_get(context->ds, qual, ds_buffer_base); - max = ds_get(context->ds, qual, ds_absolute_maximum); - - return (max - base) / ds_cfg.sizeof_rec[qual]; + if (!task->thread.debugctlmsr) + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } } -int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos) +void ds_resume_bts(struct bts_tracer *tracer) { - if (!tracer) - return -EINVAL; - - if (!pos) - return -EINVAL; - - *pos = ds_get_end(tracer->ds.context, ds_bts); - - return 0; -} + struct task_struct *task; + unsigned long control; -int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos) -{ if (!tracer) - return -EINVAL; - - if (!pos) - return -EINVAL; - - *pos = ds_get_end(tracer->ds.context, ds_pebs); - - return 0; -} - -static int ds_access(struct ds_context *context, enum ds_qualifier qual, - size_t index, const void **record) -{ - unsigned long base, idx; - - if (!record) - return -EINVAL; - - base = ds_get(context->ds, qual, ds_buffer_base); - idx = base + (index * ds_cfg.sizeof_rec[qual]); - - if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - return -EINVAL; + return; - *record = (const void *)idx; + task = tracer->ds.context->task; - return ds_cfg.sizeof_rec[qual]; -} + control = ds_cfg.ctl[dsf_bts]; + if (!(tracer->trace.ds.flags & BTS_KERNEL)) + control |= ds_cfg.ctl[dsf_bts_kernel]; + if (!(tracer->trace.ds.flags & BTS_USER)) + control |= ds_cfg.ctl[dsf_bts_user]; -int ds_access_bts(struct bts_tracer *tracer, size_t index, - const void **record) -{ - if (!tracer) - return -EINVAL; + if (task) { + task->thread.debugctlmsr |= control; + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } - return ds_access(tracer->ds.context, ds_bts, index, record); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() | control); } -int ds_access_pebs(struct pebs_tracer *tracer, size_t index, - const void **record) +void ds_release_pebs(struct pebs_tracer *tracer) { if (!tracer) - return -EINVAL; - - return ds_access(tracer->ds.context, ds_pebs, index, record); -} - -static int ds_write(struct ds_context *context, enum ds_qualifier qual, - const void *record, size_t size) -{ - int bytes_written = 0; - - if (!record) - return -EINVAL; - - while (size) { - unsigned long base, index, end, write_end, int_th; - unsigned long write_size, adj_write_size; - - /* - * write as much as possible without producing an - * overflow interrupt. - * - * interrupt_threshold must either be - * - bigger than absolute_maximum or - * - point to a record between buffer_base and absolute_maximum - * - * index points to a valid record. - */ - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - end = ds_get(context->ds, qual, ds_absolute_maximum); - int_th = ds_get(context->ds, qual, ds_interrupt_threshold); - - write_end = min(end, int_th); - - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ - if (write_end <= index) - write_end = end; - - if (write_end <= index) - break; - - write_size = min((unsigned long) size, write_end - index); - memcpy((void *)index, record, write_size); - - record = (const char *)record + write_size; - size -= write_size; - bytes_written += write_size; - - adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; - adj_write_size *= ds_cfg.sizeof_rec[qual]; - - /* zero out trailing bytes */ - memset((char *)index + write_size, 0, - adj_write_size - write_size); - index += adj_write_size; + return; - if (index >= end) - index = base; - ds_set(context->ds, qual, ds_index, index); + ds_suspend_pebs(tracer); - if (index >= int_th) - ds_overflow(context, qual); - } + WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); + tracer->ds.context->pebs_master = NULL; - return bytes_written; -} + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); -int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) -{ - if (!tracer) - return -EINVAL; - - return ds_write(tracer->ds.context, ds_bts, record, size); + kfree(tracer); } -int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) +void ds_suspend_pebs(struct pebs_tracer *tracer) { - if (!tracer) - return -EINVAL; - return ds_write(tracer->ds.context, ds_pebs, record, size); } -static void ds_reset_or_clear(struct ds_context *context, - enum ds_qualifier qual, int clear) +void ds_resume_pebs(struct pebs_tracer *tracer) { - unsigned long base, end; - - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); - - if (clear) - memset((void *)base, 0, end - base); - ds_set(context->ds, qual, ds_index, base); } -int ds_reset_bts(struct bts_tracer *tracer) +const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) - return -EINVAL; - - ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + return NULL; - return 0; + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + return &tracer->trace; } -int ds_reset_pebs(struct pebs_tracer *tracer) +const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) { if (!tracer) - return -EINVAL; + return NULL; - ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + tracer->trace.reset_value = + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - return 0; + return &tracer->trace; } -int ds_clear_bts(struct bts_tracer *tracer) +int ds_reset_bts(struct bts_tracer *tracer) { if (!tracer) return -EINVAL; - ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); - - return 0; -} - -int ds_clear_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; + tracer->trace.ds.top = tracer->trace.ds.begin; - ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); return 0; } -int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) +int ds_reset_pebs(struct pebs_tracer *tracer) { if (!tracer) return -EINVAL; - if (!value) - return -EINVAL; + tracer->trace.ds.top = tracer->trace.ds.begin; - *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); return 0; } @@ -746,35 +894,59 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) return 0; } -static const struct ds_configuration ds_cfg_var = { - .sizeof_ds = sizeof(long) * 12, - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, +static const struct ds_configuration ds_cfg_netburst = { + .name = "netburst", + .ctl[dsf_bts] = (1 << 2) | (1 << 3), + .ctl[dsf_bts_kernel] = (1 << 5), + .ctl[dsf_bts_user] = (1 << 6), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = sizeof(long) * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; -static const struct ds_configuration ds_cfg_64 = { - .sizeof_ds = 8 * 12, - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, +static const struct ds_configuration ds_cfg_pentium_m = { + .name = "pentium m", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = 8 * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = 8 * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; +static const struct ds_configuration ds_cfg_core2 = { + .name = "core 2", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 18, +}; -static inline void +static void ds_configure(const struct ds_configuration *cfg) { + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "DS available\n"); + printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + + if (!cpu_has_bts) { + ds_cfg.ctl[dsf_bts] = 0; + printk(KERN_INFO "[ds] bts not available\n"); + } + if (!cpu_has_pebs) + printk(KERN_INFO "[ds] pebs not available\n"); - WARN_ON_ONCE(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -787,10 +959,10 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_pentium_m); break; default: /* Core2, Atom, ... */ - ds_configure(&ds_cfg_64); + ds_configure(&ds_cfg_core2); break; } break; @@ -799,7 +971,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_netburst); break; default: /* sorry, don't know about them */ @@ -812,14 +984,41 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } -void ds_free(struct ds_context *context) +/* + * Change the DS configuration from tracing prev to tracing next. + */ +void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - /* This is called when the task owning the parameter context - * is dying. There should not be any user of that context left - * to disturb us, anymore. */ - unsigned long leftovers = context->count; - while (leftovers--) { - put_tracer(context->task); - ds_put_context(context); + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + + if (prev_ctx) { + update_debugctlmsr(0); + + if (prev_ctx->bts_master && + (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_departs, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = prev->pid + }; + bts_write(prev_ctx->bts_master, &ts); + } + } + + if (next_ctx) { + if (next_ctx->bts_master && + (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_arrives, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = next->pid + }; + bts_write(next_ctx->bts_master, &ts); + } + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } + + update_debugctlmsr(next->thread.debugctlmsr); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 24c2276..605eff9 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -252,11 +252,14 @@ void exit_thread(void) put_cpu(); } #ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(current->thread.ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(current->thread.ds_ctx); + /* Free any BTS tracers that have not been properly released. */ + if (unlikely(current->bts)) { + ds_release_bts(current->bts); + current->bts = NULL; + + kfree(current->bts_buffer); + current->bts_buffer = NULL; + current->bts_size = 0; } #endif /* CONFIG_X86_DS */ } @@ -420,48 +423,19 @@ int set_tsc_mode(unsigned int val) return 0; } -#ifdef CONFIG_X86_DS -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - unsigned long ds_prev = 0; - unsigned long ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, ds_next, 0); - } - return debugctl; -} -#else -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - return debugctl; -} -#endif /* CONFIG_X86_DS */ - static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread; next = &next_p->thread; - debugctl = update_debugctl(prev, next, prev->debugctlmsr); - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -483,15 +457,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ - - if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index fbb321d..1cfd2a4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -237,11 +237,14 @@ void exit_thread(void) put_cpu(); } #ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(t->ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(t->ds_ctx); + /* Free any BTS tracers that have not been properly released. */ + if (unlikely(current->bts)) { + ds_release_bts(current->bts); + current->bts = NULL; + + kfree(current->bts_buffer); + current->bts_buffer = NULL; + current->bts_size = 0; } #endif /* CONFIG_X86_DS */ } @@ -471,35 +474,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; - debugctl = prev->debugctlmsr; - -#ifdef CONFIG_X86_DS - { - unsigned long ds_prev = 0, ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* - * We clear debugctl to make sure DS - * is not in use when we change it: - */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, ds_next); - } - } -#endif /* CONFIG_X86_DS */ - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -534,14 +516,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } - -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b2998fe..45e9855 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -581,153 +581,73 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS -/* - * The configuration for a particular BTS hardware implementation. - */ -struct bts_configuration { - /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ - unsigned char sizeof_bts; - /* the size of a field in the BTS record in bytes */ - unsigned char sizeof_field; - /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ - unsigned long debugctl_mask; -}; -static struct bts_configuration bts_cfg; - -#define BTS_MAX_RECORD_SIZE (8 * 3) - - -/* - * Branch Trace Store (BTS) uses the following format. Different - * architectures vary in the size of those fields. - * - source linear address - * - destination linear address - * - flags - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * We compute the base address for the first 8 fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - * In order to store additional information in the BTS buffer, we use - * a special source address to indicate that the record requires - * special interpretation. - * - * Netburst indicated via a bit in the flags field whether the branch - * was predicted; this is ignored. - */ - -enum bts_field { - bts_from = 0, - bts_to, - bts_flags, - - bts_escape = (unsigned long)-1, - bts_qual = bts_to, - bts_jiffies = bts_flags -}; - -static inline unsigned long bts_get(const char *base, enum bts_field field) -{ - base += (bts_cfg.sizeof_field * field); - return *(unsigned long *)base; -} - -static inline void bts_set(char *base, enum bts_field field, unsigned long val) -{ - base += (bts_cfg.sizeof_field * field);; - (*(unsigned long *)base) = val; -} - -/* - * Translate a BTS record from the raw format into the bts_struct format - * - * out (out): bts_struct interpretation - * raw: raw BTS record - */ -static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) -{ - memset(out, 0, sizeof(*out)); - if (bts_get(raw, bts_from) == bts_escape) { - out->qualifier = bts_get(raw, bts_qual); - out->variant.jiffies = bts_get(raw, bts_jiffies); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = bts_get(raw, bts_from); - out->variant.lbr.to_ip = bts_get(raw, bts_to); - } -} - static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { - struct bts_struct ret; - const void *bts_record; - size_t bts_index, bts_end; + const struct bts_trace *trace; + struct bts_struct bts; + const unsigned char *at; int error; - error = ds_get_bts_end(child->bts, &bts_end); - if (error < 0) - return error; - - if (bts_end <= index) - return -EINVAL; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - error = ds_get_bts_index(child->bts, &bts_index); - if (error < 0) - return error; + at = trace->ds.top - ((index + 1) * trace->ds.size); + if ((void *)at < trace->ds.begin) + at += (trace->ds.n * trace->ds.size); - /* translate the ptrace bts index into the ds bts index */ - bts_index += bts_end - (index + 1); - if (bts_end <= bts_index) - bts_index -= bts_end; + if (!trace->read) + return -EOPNOTSUPP; - error = ds_access_bts(child->bts, bts_index, &bts_record); + error = trace->read(child->bts, at, &bts); if (error < 0) return error; - ptrace_bts_translate_record(&ret, bts_record); - - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; - return sizeof(ret); + return sizeof(bts); } static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - struct bts_struct ret; - const unsigned char *raw; - size_t end, i; - int error; + const struct bts_trace *trace; + const unsigned char *at; + int error, drained = 0; - error = ds_get_bts_index(child->bts, &end); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - if (size < (end * sizeof(struct bts_struct))) + if (!trace->read) + return -EOPNOTSUPP; + + if (size < (trace->ds.top - trace->ds.begin)) return -EIO; - error = ds_access_bts(child->bts, 0, (const void **)&raw); - if (error < 0) - return error; + for (at = trace->ds.begin; (void *)at < trace->ds.top; + out++, drained++, at += trace->ds.size) { + struct bts_struct bts; + int error; - for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { - ptrace_bts_translate_record(&ret, raw); + error = trace->read(child->bts, at, &bts); + if (error < 0) + return error; - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; } - error = ds_clear_bts(child->bts); + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); + + error = ds_reset_bts(child->bts); if (error < 0) return error; - return end; + return drained; } static int ptrace_bts_config(struct task_struct *child, @@ -735,136 +655,89 @@ static int ptrace_bts_config(struct task_struct *child, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int error = 0; - - error = -EOPNOTSUPP; - if (!bts_cfg.sizeof_bts) - goto errout; + unsigned int flags = 0; - error = -EIO; if (cfg_size < sizeof(cfg)) - goto errout; + return -EIO; - error = -EFAULT; if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - goto errout; - - error = -EINVAL; - if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && - !(cfg.flags & PTRACE_BTS_O_ALLOC)) - goto errout; - - if (cfg.flags & PTRACE_BTS_O_ALLOC) { - bts_ovfl_callback_t ovfl = NULL; - unsigned int sig = 0; - - error = -EINVAL; - if (cfg.size < (10 * bts_cfg.sizeof_bts)) - goto errout; + return -EFAULT; - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { - if (!cfg.signal) - goto errout; + if (child->bts) { + ds_release_bts(child->bts); + child->bts = NULL; + } - error = -EOPNOTSUPP; - goto errout; + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + return -EINVAL; - sig = cfg.signal; - } + return -EOPNOTSUPP; - if (child->bts) { - (void)ds_release_bts(child->bts); - kfree(child->bts_buffer); + child->thread.bts_ovfl_signal = cfg.signal; + } - child->bts = NULL; - child->bts_buffer = NULL; - } + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && + (cfg.size != child->bts_size)) { + kfree(child->bts_buffer); - error = -ENOMEM; + child->bts_size = cfg.size; child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); - if (!child->bts_buffer) - goto errout; - - child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, - ovfl, /* th = */ (size_t)-1); - if (IS_ERR(child->bts)) { - error = PTR_ERR(child->bts); - kfree(child->bts_buffer); - child->bts = NULL; - child->bts_buffer = NULL; - goto errout; + if (!child->bts_buffer) { + child->bts_size = 0; + return -ENOMEM; } - - child->thread.bts_ovfl_signal = sig; } - error = -EINVAL; - if (!child->thread.ds_ctx && cfg.flags) - goto errout; - if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= bts_cfg.debugctl_mask; - else - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + flags |= BTS_USER; if (cfg.flags & PTRACE_BTS_O_SCHED) - set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - else - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + flags |= BTS_TIMESTAMPS; - error = sizeof(cfg); + child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, + /* ovfl = */ NULL, /* th = */ (size_t)-1, + flags); + if (IS_ERR(child->bts)) { + int error = PTR_ERR(child->bts); -out: - if (child->thread.debugctlmsr) - set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + kfree(child->bts_buffer); + child->bts = NULL; + child->bts_buffer = NULL; + child->bts_size = 0; - return error; + return error; + } -errout: - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - goto out; + return sizeof(cfg); } static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + const struct bts_trace *trace; struct ptrace_bts_config cfg; - size_t end; - const void *base, *max; - int error; if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child->bts, &end); - if (error < 0) - return error; - - error = ds_access_bts(child->bts, /* index = */ 0, &base); - if (error < 0) - return error; - - error = ds_access_bts(child->bts, /* index = */ end, &max); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; memset(&cfg, 0, sizeof(cfg)); - cfg.size = (max - base); + cfg.size = trace->ds.end - trace->ds.begin; cfg.signal = child->thread.bts_ovfl_signal; cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & bts_cfg.debugctl_mask) + if (trace->ds.flags & BTS_USER) cfg.flags |= PTRACE_BTS_O_TRACE; - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + if (trace->ds.flags & BTS_TIMESTAMPS) cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) @@ -873,105 +746,28 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } -static int ptrace_bts_write_record(struct task_struct *child, - const struct bts_struct *in) +static int ptrace_bts_clear(struct task_struct *child) { - unsigned char bts_record[BTS_MAX_RECORD_SIZE]; + const struct bts_trace *trace; - if (BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts) - return -EOVERFLOW; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - memset(bts_record, 0, bts_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - case BTS_BRANCH: - bts_set(bts_record, bts_from, in->variant.lbr.from_ip); - bts_set(bts_record, bts_to, in->variant.lbr.to_ip); - break; - - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - bts_set(bts_record, bts_from, bts_escape); - bts_set(bts_record, bts_qual, in->qualifier); - bts_set(bts_record, bts_jiffies, in->variant.jiffies); - break; - - default: - return -EINVAL; - } - - return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); + return ds_reset_bts(child->bts); } -void ptrace_bts_take_timestamp(struct task_struct *tsk, - enum bts_qualifier qualifier) +static int ptrace_bts_size(struct task_struct *child) { - struct bts_struct rec = { - .qualifier = qualifier, - .variant.jiffies = jiffies_64 - }; - - ptrace_bts_write_record(tsk, &rec); -} - -static const struct bts_configuration bts_cfg_netburst = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<2)|(1<<3)|(1<<5) -}; + const struct bts_trace *trace; -static const struct bts_configuration bts_cfg_pentium_m = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<6)|(1<<7) -}; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; -static const struct bts_configuration bts_cfg_core2 = { - .sizeof_bts = 8 * 3, - .sizeof_field = 8, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) -}; - -static inline void bts_configure(const struct bts_configuration *cfg) -{ - bts_cfg = *cfg; -} - -void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) -{ - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0 ... 0xC: - /* sorry, don't know about them */ - break; - case 0xD: - case 0xE: /* Pentium M */ - bts_configure(&bts_cfg_pentium_m); - break; - default: /* Core2, Atom, ... */ - bts_configure(&bts_cfg_core2); - break; - } - break; - case 0xF: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - bts_configure(&bts_cfg_netburst); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - default: - /* sorry, don't know about them */ - break; - } + return (trace->ds.top - trace->ds.begin) / trace->ds.size; } #endif /* CONFIG_X86_PTRACE_BTS */ @@ -988,15 +784,12 @@ void ptrace_disable(struct task_struct *child) #endif #ifdef CONFIG_X86_PTRACE_BTS if (child->bts) { - (void)ds_release_bts(child->bts); + ds_release_bts(child->bts); + child->bts = NULL; + kfree(child->bts_buffer); child->bts_buffer = NULL; - - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + child->bts_size = 0; } #endif /* CONFIG_X86_PTRACE_BTS */ } @@ -1129,16 +922,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) (child, data, (struct ptrace_bts_config __user *)addr); break; - case PTRACE_BTS_SIZE: { - size_t size; - - ret = ds_get_bts_index(child->bts, &size); - if (ret == 0) { - WARN_ON_ONCE(size != (int) size); - ret = (int) size; - } + case PTRACE_BTS_SIZE: + ret = ptrace_bts_size(child); break; - } case PTRACE_BTS_GET: ret = ptrace_bts_read_record @@ -1146,7 +932,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child->bts); + ret = ptrace_bts_clear(child); break; case PTRACE_BTS_DRAIN: @@ -1409,6 +1195,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: +#ifdef CONFIG_X86_PTRACE_BTS + case PTRACE_BTS_CONFIG: + case PTRACE_BTS_STATUS: + case PTRACE_BTS_SIZE: + case PTRACE_BTS_GET: + case PTRACE_BTS_CLEAR: + case PTRACE_BTS_DRAIN: +#endif /* CONFIG_X86_PTRACE_BTS */ return arch_ptrace(child, request, addr, data); default: diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b81fc5..dc5ea65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1176,6 +1176,7 @@ struct task_struct { * The buffer to hold the BTS data. */ void *bts_buffer; + size_t bts_size; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ -- cgit v0.10.2 From a93751cab71d63126687551823ed3e70cd85854a Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:53:26 +0100 Subject: x86, bts, ftrace: adapt the hw-branch-tracer to the ds.c interface Impact: restructure code, cleanup Remove BTS bits from the hw-branch-tracer (renamed from bts-tracer) and use the ds interface. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bde6f03..d8bae6f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -251,9 +251,9 @@ config STACK_TRACER Say N if unsure. -config BTS_TRACER +config HW_BRANCH_TRACER depends on HAVE_HW_BRANCH_TRACER - bool "Trace branches" + bool "Trace hw branches" select TRACING help This tracer records all branches on the system in a circular diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 62dc561..349d5a9 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_BTS_TRACER) += trace_bts.o +obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8ebe007..639344a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2425,7 +2425,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) - iter->trace->open(iter); + iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->tr->buffer)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5ac6970..f07c246 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -28,7 +28,7 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_BTS, + TRACE_HW_BRANCHES, TRACE_POWER, __TRACE_LAST_TYPE @@ -159,10 +159,10 @@ struct trace_branch { char correct; }; -struct bts_entry { +struct hw_branch_entry { struct trace_entry ent; - unsigned long from; - unsigned long to; + u64 from; + u64 to; }; struct trace_power { @@ -278,7 +278,7 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ + IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ __ftrace_bad_type(); \ } while (0) @@ -414,9 +414,7 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -void trace_bts(struct trace_array *tr, - unsigned long from, - unsigned long to); +void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c deleted file mode 100644 index 23b76e4..0000000 --- a/kernel/trace/trace_bts.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * BTS tracer - * - * Copyright (C) 2008 Markus Metzger - * - */ - -#include -#include -#include -#include -#include - -#include - -#include "trace.h" - - -#define SIZEOF_BTS (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); - -#define this_tracer per_cpu(tracer, smp_processor_id()) -#define this_buffer per_cpu(buffer, smp_processor_id()) - - -/* - * Information to interpret a BTS record. - * This will go into an in-kernel BTS interface. - */ -static unsigned char sizeof_field; -static unsigned long debugctl_mask; - -#define sizeof_bts (3 * sizeof_field) - -static void bts_trace_cpuinit(struct cpuinfo_x86 *c) -{ - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0x0 ... 0xC: - break; - case 0xD: - case 0xE: /* Pentium M */ - sizeof_field = sizeof(long); - debugctl_mask = (1<<6)|(1<<7); - break; - default: - sizeof_field = 8; - debugctl_mask = (1<<6)|(1<<7); - break; - } - break; - case 0xF: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - sizeof_field = sizeof(long); - debugctl_mask = (1<<2)|(1<<3); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - default: - /* sorry, don't know about them */ - break; - } -} - -static inline void bts_enable(void) -{ - unsigned long debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask); -} - -static inline void bts_disable(void) -{ - unsigned long debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask); -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - -static void bts_trace_start_cpu(void *arg) -{ - this_tracer = - ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, - /* ovfl = */ NULL, /* th = */ (size_t)-1); - if (IS_ERR(this_tracer)) { - this_tracer = NULL; - return; - } - - bts_enable(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - bts_trace_reset(tr); - - for_each_cpu_mask(cpu, cpu_possible_map) - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); -} - -static void bts_trace_stop_cpu(void *arg) -{ - if (this_tracer) { - bts_disable(); - - ds_release_bts(this_tracer); - this_tracer = NULL; - } -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - for_each_cpu_mask(cpu, cpu_possible_map) - smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); -} - -static int bts_trace_init(struct trace_array *tr) -{ - bts_trace_cpuinit(&boot_cpu_data); - bts_trace_reset(tr); - bts_trace_start(tr); - - return 0; -} - -static void bts_trace_print_header(struct seq_file *m) -{ -#ifdef __i386__ - seq_puts(m, "# CPU# FROM TO FUNCTION\n"); - seq_puts(m, "# | | | |\n"); -#else - seq_puts(m, - "# CPU# FROM TO FUNCTION\n"); - seq_puts(m, - "# | | | |\n"); -#endif -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct bts_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_BTS) { - int ret; -#ifdef CONFIG_KALLSYMS - char function[KSYM_SYMBOL_LEN]; - sprint_symbol(function, it->from); -#else - char *function = ""; -#endif - - ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n", - entry->cpu, it->from, it->to, function); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE;; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to) -{ - struct ring_buffer_event *event; - struct bts_entry *entry; - unsigned long irq; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_BTS; - entry->ent.cpu = smp_processor_id(); - entry->from = from; - entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq); -} - -static void trace_bts_at(struct trace_array *tr, size_t index) -{ - const void *raw = NULL; - unsigned long from, to; - int err; - - err = ds_access_bts(this_tracer, index, &raw); - if (err < 0) - return; - - from = *(const unsigned long *)raw; - to = *(const unsigned long *)((const char *)raw + sizeof_field); - - trace_bts(tr, from, to); -} - -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *) arg; - size_t index = 0, end = 0, i; - int err; - - if (!this_tracer) - return; - - bts_disable(); - - err = ds_get_bts_index(this_tracer, &index); - if (err < 0) - goto out; - - err = ds_get_bts_end(this_tracer, &end); - if (err < 0) - goto out; - - for (i = index; i < end; i++) - trace_bts_at(tr, i); - - for (i = 0; i < index; i++) - trace_bts_at(tr, i); - -out: - bts_enable(); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - for_each_cpu_mask(cpu, cpu_possible_map) - smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "bts", - .init = bts_trace_init, - .reset = bts_trace_stop, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare -}; - -__init static int init_bts_trace(void) -{ - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c new file mode 100644 index 0000000..ee29e01 --- /dev/null +++ b/kernel/trace/trace_hw_branches.c @@ -0,0 +1,205 @@ +/* + * h/w branch tracer for x86 based on bts + * + * Copyright (C) 2008 Markus Metzger + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "trace.h" + + +#define SIZEOF_BTS (1 << 13) + +static DEFINE_PER_CPU(struct bts_tracer *, tracer); +static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); + +#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_buffer per_cpu(buffer, smp_processor_id()) + + +static void bts_trace_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + +static void bts_trace_start_cpu(void *arg) +{ + if (this_tracer) + ds_release_bts(this_tracer); + + this_tracer = + ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, + /* ovfl = */ NULL, /* th = */ (size_t)-1, + BTS_KERNEL); + if (IS_ERR(this_tracer)) { + this_tracer = NULL; + return; + } +} + +static void bts_trace_start(struct trace_array *tr) +{ + int cpu; + + bts_trace_reset(tr); + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); +} + +static void bts_trace_stop_cpu(void *arg) +{ + if (this_tracer) { + ds_release_bts(this_tracer); + this_tracer = NULL; + } +} + +static void bts_trace_stop(struct trace_array *tr) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); +} + +static int bts_trace_init(struct trace_array *tr) +{ + bts_trace_reset(tr); + bts_trace_start(tr); + + return 0; +} + +static void bts_trace_print_header(struct seq_file *m) +{ + seq_puts(m, + "# CPU# FROM TO FUNCTION\n"); + seq_puts(m, + "# | | | |\n"); +} + +static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *seq = &iter->seq; + struct hw_branch_entry *it; + + trace_assign_type(it, entry); + + if (entry->type == TRACE_HW_BRANCHES) { + if (trace_seq_printf(seq, "%4d ", entry->cpu) && + trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", + it->from, it->to) && + (!it->from || + seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) && + trace_seq_printf(seq, "\n")) + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE;; + } + return TRACE_TYPE_UNHANDLED; +} + +void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +{ + struct ring_buffer_event *event; + struct hw_branch_entry *entry; + unsigned long irq; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, from); + entry->ent.type = TRACE_HW_BRANCHES; + entry->ent.cpu = smp_processor_id(); + entry->from = from; + entry->to = to; + ring_buffer_unlock_commit(tr->buffer, event, irq); +} + +static void trace_bts_at(struct trace_array *tr, + const struct bts_trace *trace, void *at) +{ + struct bts_struct bts; + int err = 0; + + WARN_ON_ONCE(!trace->read); + if (!trace->read) + return; + + err = trace->read(this_tracer, at, &bts); + if (err < 0) + return; + + switch (bts.qualifier) { + case BTS_BRANCH: + trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); + break; + } +} + +static void trace_bts_cpu(void *arg) +{ + struct trace_array *tr = (struct trace_array *) arg; + const struct bts_trace *trace; + unsigned char *at; + + if (!this_tracer) + return; + + ds_suspend_bts(this_tracer); + trace = ds_read_bts(this_tracer); + if (!trace) + goto out; + + for (at = trace->ds.top; (void *)at < trace->ds.end; + at += trace->ds.size) + trace_bts_at(tr, trace, at); + + for (at = trace->ds.begin; (void *)at < trace->ds.top; + at += trace->ds.size) + trace_bts_at(tr, trace, at); + +out: + ds_resume_bts(this_tracer); +} + +static void trace_bts_prepare(struct trace_iterator *iter) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) + smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); +} + +struct tracer bts_tracer __read_mostly = +{ + .name = "hw-branch-tracer", + .init = bts_trace_init, + .reset = bts_trace_stop, + .print_header = bts_trace_print_header, + .print_line = bts_trace_print_line, + .start = bts_trace_start, + .stop = bts_trace_stop, + .open = trace_bts_prepare +}; + +__init static int init_bts_trace(void) +{ + return register_tracer(&bts_tracer); +} +device_initcall(init_bts_trace); -- cgit v0.10.2 From ffc2238af8431d930d2c15f16feecf1fd6d75642 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 08:21:19 +0100 Subject: x86, bts: fix build error Impact: build fix arch/x86/kernel/ds.c: In function 'ds_request': arch/x86/kernel/ds.c:236: sorry, unimplemented: inlining failed in call to 'ds_get_context': recursive inlining but the recursion here is scary ... Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index f058300..dc1e712 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -232,7 +232,7 @@ static DEFINE_PER_CPU(struct ds_context *, system_context_array); #define system_context per_cpu(system_context_array, smp_processor_id()) -static inline struct ds_context *ds_get_context(struct task_struct *task) +static struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = (task ? &task->thread.ds_ctx : &system_context); -- cgit v0.10.2 From 8001530d5af707eb9a158839c8f651eb6c1cb3c2 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 11 Dec 2008 16:10:08 +0100 Subject: tracing/fastboot: fix len of func buffer Impact: fix possible stack overrun This is a port of a patch included in the mainline (KSYM_SYMBOL_LEN fixes). The current func len is not large enough to contain the max symbol len, the right size must be KSYM_SYMBOL_LEN. Signed-off-by: Stephen Rothwell Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/include/trace/boot.h b/include/trace/boot.h index 6b54537..3ec58b4 100644 --- a/include/trace/boot.h +++ b/include/trace/boot.h @@ -9,7 +9,7 @@ */ struct boot_trace_call { pid_t caller; - char func[KSYM_NAME_LEN]; + char func[KSYM_SYMBOL_LEN]; }; /* @@ -17,7 +17,7 @@ struct boot_trace_call { * while it returns. */ struct boot_trace_ret { - char func[KSYM_NAME_LEN]; + char func[KSYM_SYMBOL_LEN]; int result; unsigned long long duration; /* nsecs */ }; -- cgit v0.10.2 From da485e0cb16726797e99a595a399b9fc721b91bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Dec 2008 16:14:23 +0100 Subject: tracing/fastboot: include missing headers For now include/trace/boot.h doesn't need to include necessary headers for its functions and structures because the files that include it already do it. But boot.h could be needed as well for further uses on other files. So, this patch adds the necessary headers for future purposes... Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/include/trace/boot.h b/include/trace/boot.h index 3ec58b4..088ea08 100644 --- a/include/trace/boot.h +++ b/include/trace/boot.h @@ -1,6 +1,10 @@ #ifndef _LINUX_TRACE_BOOT_H #define _LINUX_TRACE_BOOT_H +#include +#include +#include + /* * Structure which defines the trace of an initcall * while it is called. -- cgit v0.10.2 From a0343e823184070f55364d8359f832dcb33c57c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:53:16 +0100 Subject: tracing/function-graph-tracer: add a new .irqentry.text section Impact: let the function-graph-tracer be aware of the irq entrypoints Add a new .irqentry.text section to store the irq entrypoints functions inside the same section. This way, the tracer will be able to signal an interrupts triggering on output by recognizing these entrypoints. Also, make this section recordable for dynamic tracing. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 46e0544..1a614c0 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -35,6 +35,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index eba835a..c61fab1 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -288,6 +288,16 @@ *(.kprobes.text) \ VMLINUX_SYMBOL(__kprobes_text_end) = .; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#define IRQENTRY_TEXT \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__irqentry_text_start) = .; \ + *(.irqentry.text) \ + VMLINUX_SYMBOL(__irqentry_text_end) = .; +#else +#define IRQENTRY_TEXT +#endif + /* Section used for early init (in .S files) */ #define HEAD_TEXT *(.head.text) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 0b1dc9f..fe83141 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -114,6 +114,7 @@ my %text_sections = ( ".text" => 1, ".sched.text" => 1, ".spinlock.text" => 1, + ".irqentry.text" => 1, ); $objdump = "objdump" if ((length $objdump) == 0); -- cgit v0.10.2 From bcbc4f20b52c2c40c43a4d2337707dcdfe81bc3a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:54:20 +0100 Subject: tracing/function-graph-tracer: annotate do_IRQ and smp_apic_timer_interrupt Impact: move most important x86 irq entry-points to a separate subsection Annotate do_IRQ and smp_apic_timer_interrupt to put them into the .irqentry.text subsection. These function will so be recognized as hardirq entrypoints for the function-graph-tracer. We could also annotate other irq entries but the others are far less important but they can be added on request. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f9487..b946ac1 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -800,7 +801,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs *regs) +void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 60eb84e..11c65e8 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -47,7 +48,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 11cac81..44020f3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -377,6 +377,16 @@ struct ftrace_graph_ret { */ #define __notrace_funcgraph notrace +/* + * We want to which function is an entrypoint of a hardirq. + * That will help us to put a signal on output. + */ +#define __irq_entry __attribute__((__section__(".irqentry.text"))) + +/* Limits of hardirq entrypoints */ +extern char __irqentry_text_start[]; +extern char __irqentry_text_end[]; + #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of the callback handlers for tracing function graph*/ @@ -414,6 +424,7 @@ static inline void unpause_graph_tracing(void) #else #define __notrace_funcgraph +#define __irq_entry static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } -- cgit v0.10.2 From f8b755ac8e0cc3f330269e4c4504514f987167a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Dec 2008 23:55:25 +0100 Subject: tracing/function-graph-tracer: Output arrows signal on hardirq call/return Impact: make more obvious the hardirq calls in the output When a hardirq is triggered inside the codeflow on output, we have now two arrows that indicate the entry and return of the hardirq. 0) | bit_waitqueue() { 0) 0.880 us | __phys_addr(); 0) 2.699 us | } 0) | __wake_up_bit() { 0) ==========> | smp_apic_timer_interrupt() { 0) 0.797 us | native_apic_mem_write(); 0) 0.715 us | exit_idle(); 0) | irq_enter() { 0) 0.722 us | idle_cpu(); 0) 5.519 us | } 0) | hrtimer_interrupt() { 0) | ktime_get() { 0) | ktime_get_ts() { 0) 0.805 us | getnstimeofday(); [...] 0) ! 108.528 us | } 0) | irq_exit() { 0) | do_softirq() { 0) | __do_softirq() { 0) 0.895 us | __local_bh_disable(); 0) | run_timer_softirq() { 0) 0.827 us | hrtimer_run_pending(); 0) 1.226 us | _spin_lock_irq(); 0) | _spin_unlock_irq() { 0) 6.550 us | } 0) 0.924 us | _local_bh_enable(); 0) + 12.129 us | } 0) + 13.911 us | } 0) 0.707 us | idle_cpu(); 0) + 17.009 us | } 0) ! 137.419 us | } 0) <========== | 0) 1.045 us | } 0) ! 148.908 us | } 0) ! 151.022 us | } 0) ! 153.022 us | } 0) 0.963 us | journal_mark_dirty(); 0) 0.925 us | __brelse(); Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index af60eef..4bf39fc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -231,6 +231,49 @@ trace_branch_is_leaf(struct trace_iterator *iter, return true; } +static enum print_line_t +print_graph_irq(struct trace_seq *s, unsigned long addr, + enum trace_type type, int cpu, pid_t pid) +{ + int ret; + + if (addr < (unsigned long)__irqentry_text_start || + addr >= (unsigned long)__irqentry_text_end) + return TRACE_TYPE_UNHANDLED; + + if (type == TRACE_GRAPH_ENT) { + ret = trace_seq_printf(s, "==========> | "); + } else { + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No overhead */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "<========== |\n"); + } + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} static enum print_line_t print_graph_duration(unsigned long long duration, struct trace_seq *s) @@ -344,7 +387,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, static enum print_line_t print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, - struct trace_seq *s) + struct trace_seq *s, pid_t pid, int cpu) { int i; int ret; @@ -357,8 +400,18 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, return TRACE_TYPE_PARTIAL_LINE; } - /* No time */ - ret = trace_seq_printf(s, " | "); + /* Interrupt */ + ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); + if (ret == TRACE_TYPE_UNHANDLED) { + /* No time */ + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -410,7 +463,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, if (trace_branch_is_leaf(iter, field)) return print_graph_entry_leaf(iter, field, s); else - return print_graph_entry_nested(field, s); + return print_graph_entry_nested(field, s, iter->ent->pid, cpu); } @@ -474,6 +527,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + + ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } -- cgit v0.10.2 From cbc34ed1ac36690f75fd272e19e7b4fc29aae5a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Dec 2008 08:08:22 +0100 Subject: sched: fix tracepoints in scheduler The trace point only caught one of many places where a task changes cpu, put it in the right place to we get all of them. Change the signature while we're at it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/include/trace/sched.h b/include/trace/sched.h index 9b2854a..f4549d5 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -30,8 +30,8 @@ DECLARE_TRACE(sched_switch, TPARGS(rq, prev, next)); DECLARE_TRACE(sched_migrate_task, - TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu), - TPARGS(rq, p, dest_cpu)); + TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TPARGS(p, orig_cpu, dest_cpu)); DECLARE_TRACE(sched_process_free, TPPROTO(struct task_struct *p), diff --git a/kernel/sched.c b/kernel/sched.c index 7729c4b..d377097 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1851,6 +1851,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) p->se.wait_start -= clock_offset; @@ -2868,7 +2870,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) || unlikely(!cpu_active(dest_cpu))) goto out; - trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ -- cgit v0.10.2 From 2bed8446819a7c5033aa1da138d9f230ae212edc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 12:13:36 +0100 Subject: tracing/function-graph-tracer: add a new .irqentry.text section, fix Impact: build fix 32-bit x86 needs this section too. Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a9b8560..82c6755 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -44,6 +44,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ -- cgit v0.10.2 From cc1dc6d039ced64c2f8b8457bf1cccf4ecfc5942 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 16 Dec 2008 15:51:03 +0100 Subject: x86, bts: remove recursion from get_context Impact: cleanup Optimistically allocate a DS context. It is extremely unlikely that one already existed. This simplifies the code a lot. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index dc1e712..0dc7959 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -232,53 +232,45 @@ static DEFINE_PER_CPU(struct ds_context *, system_context_array); #define system_context per_cpu(system_context_array, smp_processor_id()) -static struct ds_context *ds_get_context(struct task_struct *task) + +static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = (task ? &task->thread.ds_ctx : &system_context); - struct ds_context *context = *p_context; + struct ds_context *context = NULL; + struct ds_context *new_context = NULL; unsigned long irq; - if (!context) { - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return NULL; - - spin_lock_irqsave(&ds_lock, irq); - - if (*p_context) { - kfree(context); + /* Chances are small that we already have a context. */ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + if (!new_context) + return NULL; - context = *p_context; - } else { - *p_context = context; + spin_lock_irqsave(&ds_lock, irq); - context->this = p_context; - context->task = task; + context = *p_context; + if (!context) { + context = new_context; - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + context->this = p_context; + context->task = task; + context->count = 0; - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, - (unsigned long)context->ds); - } + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - context->count++; + if (!task || (task == current)) + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - spin_unlock_irqrestore(&ds_lock, irq); - } else { - spin_lock_irqsave(&ds_lock, irq); + *p_context = context; + } - context = *p_context; - if (context) - context->count++; + context->count++; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irqrestore(&ds_lock, irq); - if (!context) - context = ds_get_context(task); - } + if (context != new_context) + kfree(new_context); return context; } -- cgit v0.10.2 From d072c25f531c6513994960401d2c7f059434c0d2 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 16 Dec 2008 15:53:11 +0100 Subject: x86, bts: correctly report invalid bts records Impact: change the reporting of empty BTS records Correctly report a cleared BTS record as invalid. Used to be reported as branch from 0 to 0. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 0dc7959..98d271e 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -484,6 +484,9 @@ static int bts_read(struct bts_tracer *tracer, const void *at, out->qualifier = bts_branch; out->variant.lbr.from = bts_get(at, bts_from); out->variant.lbr.to = bts_get(at, bts_to); + + if (!out->variant.lbr.from && !out->variant.lbr.to) + out->qualifier = bts_invalid; } return ds_cfg.sizeof_rec[ds_bts]; -- cgit v0.10.2 From 2c2d7329d8afa9efa3ec24e19a53e7be9d14f242 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 16 Dec 2008 22:08:58 +0100 Subject: tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp() Impact: prevent a trace recursion After some tests with function graph tracer under x86-32, I saw some recursions caused by ring_buffer_time_stamp() that calls preempt_enable_no_notrace() which calls preempt_schedule() which is traced itself. This patch re-enables preemption without rescheduling. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f69cfea..eab81f9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -107,7 +107,7 @@ u64 ring_buffer_time_stamp(int cpu) preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ time = sched_clock() << DEBUG_SHIFT; - preempt_enable_notrace(); + preempt_enable_no_resched_notrace(); return time; } -- cgit v0.10.2 From 66896a85cf2890b6bbbc4c9ccdcd296600ffbf89 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Dec 2008 20:18:13 +0100 Subject: tracing/ftrace: add the printk-msg-only option Impact: display ftrace_printk messages "as is" By default, ftrace_printk() messages find their output with some other informations like pid, caller, ... Sometimes a developer just want to have the ftrace_printk left "as is", without other information. This is done by providing a default-off option called printk-msg-only. To enable it, just do `echo printk-msg-only > /debugfs/tracing/trace_options` Before the patch: <...>-2739 [000] 145.692153: __might_sleep: I'm an ftrace_printk msg in __might_sleep <...>-2739 [000] 145.692155: __might_sleep: I'm another ftrace_printk msg in __might_sleep After the patch and the printk-msg-only option enabled: I'm an ftrace_printk msg in __might_sleep I'm another ftrace_printk msg in __might_sleep Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 639344a..1a3d6b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -287,6 +287,7 @@ static const char *trace_options[] = { "annotate", "userstacktrace", "sym-userobj", + "printk-msg-only", NULL }; @@ -2265,6 +2266,25 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -2345,6 +2365,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_printk_msg_only(iter); + if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f07c246..fc75dce 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -578,7 +578,8 @@ enum trace_iterator_flags { TRACE_ITER_BRANCH = 0x1000, TRACE_ITER_ANNOTATE = 0x2000, TRACE_ITER_USERSTACKTRACE = 0x4000, - TRACE_ITER_SYM_USEROBJ = 0x8000 + TRACE_ITER_SYM_USEROBJ = 0x8000, + TRACE_ITER_PRINTK_MSGONLY = 0x10000 }; /* -- cgit v0.10.2 From d680fe44775ed17a80035462d9898f5e77bfd7dd Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 13 Dec 2008 00:09:08 +0300 Subject: x86: entry_64 - introduce FTRACE_ frame macro v2 Impact: clean up Itroduce MCOUNT_SAVE/RESTORE_FRAME which allow us to save a number of lines on source level. Also fix a comment in ftrace.h. Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 7e61b4c..b55b4a7 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -1,6 +1,33 @@ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H +#ifdef __ASSEMBLY__ + + .macro MCOUNT_SAVE_FRAME + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + .endm + + .macro MCOUNT_RESTORE_FRAME + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + .endm + +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ @@ -46,7 +73,7 @@ struct ftrace_ret_stack { /* * Primary handler of a function return. * It relays on ftrace_return_to_handler. - * Defined in entry32.S + * Defined in entry_32/64.S */ extern void return_to_handler(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 54e0bbd..303dd84 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -71,15 +71,7 @@ ENTRY(ftrace_caller) cmpl $0, function_trace_stop jne ftrace_stub - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -89,14 +81,7 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call @@ -130,15 +115,7 @@ ftrace_stub: retq trace: - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -146,14 +123,7 @@ trace: call *ftrace_trace_function - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME jmp ftrace_stub END(mcount) @@ -165,14 +135,7 @@ ENTRY(ftrace_graph_caller) cmpl $0, function_trace_stop jne ftrace_stub - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi @@ -180,14 +143,8 @@ ENTRY(ftrace_graph_caller) call prepare_ftrace_return - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME + retq END(ftrace_graph_caller) -- cgit v0.10.2 From f38f1d2aa5a3520cf05da7cd6bd12fe2b0c509b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Dec 2008 23:06:40 -0500 Subject: trace: add a way to enable or disable the stack tracer Impact: enhancement to stack tracer The stack tracer currently is either on when configured in or off when it is not. It can not be disabled when it is configured on. (besides disabling the function tracer that it uses) This patch adds a way to enable or disable the stack tracer at run time. It defaults off on bootup, but a kernel parameter 'stacktrace' has been added to enable it on bootup. A new sysctl has been added "kernel.stack_tracer_enabled" to let the user enable or disable the stack tracer at run time. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 2919a2e..edab81c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -89,6 +89,7 @@ parameter is applicable: SPARC Sparc architecture is enabled. SWSUSP Software suspend (hibernation) is enabled. SUSPEND System suspend states are enabled. + FTRACE Function tracing enabled. TS Appropriate touchscreen support is enabled. USB USB support is enabled. USBHID USB Human Interface Device support is enabled. @@ -2173,6 +2174,9 @@ and is between 256 and 4096 characters. It is defined in the file st= [HW,SCSI] SCSI tape parameters (buffers, etc.) See Documentation/scsi/st.txt. + stacktrace [FTRACE] + Enabled the stack tracer on boot up. + sti= [PARISC,HW] Format: Set the STI (builtin display/keyboard on the HP-PARISC diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 44020f3..6b0db53 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -86,6 +86,14 @@ static inline void ftrace_stop(void) { } static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_STACK_TRACER +extern int stack_tracer_enabled; +int +stack_trace_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + #ifdef CONFIG_DYNAMIC_FTRACE /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c83f566..6ac501a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif +#ifdef CONFIG_STACK_TRACER + { + .ctl_name = CTL_UNNUMBERED, + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &stack_trace_sysctl, + }, +#endif #ifdef CONFIG_TRACING { .ctl_name = CTL_UNNUMBERED, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d8bae6f..e2a4ff6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -244,10 +244,15 @@ config STACK_TRACER This tracer works by hooking into every function call that the kernel executes, and keeping a maximum stack depth value and - stack-trace saved. Because this logic has to execute in every - kernel function, all the time, this option can slow down the - kernel measurably and is generally intended for kernel - developers only. + stack-trace saved. If this is configured with DYNAMIC_FTRACE + then it will not have any overhead while the stack tracer + is disabled. + + To enable the stack tracer on bootup, pass in 'stacktrace' + on the kernel command line. + + The stack tracer can also be enabled or disabled via the + sysctl kernel.stack_tracer_enabled Say N if unsure. diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0b863f2..4842c96 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "trace.h" @@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock = static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); +static DEFINE_MUTEX(stack_sysctl_mutex); + +int stack_tracer_enabled; +static int last_stack_tracer_enabled; static inline void check_stack(void) { @@ -174,7 +179,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return count; } -static struct file_operations stack_max_size_fops = { +static const struct file_operations stack_max_size_fops = { .open = tracing_open_generic, .read = stack_max_size_read, .write = stack_max_size_write, @@ -272,7 +277,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations stack_trace_seq_ops = { +static const struct seq_operations stack_trace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -288,12 +293,48 @@ static int stack_trace_open(struct inode *inode, struct file *file) return ret; } -static struct file_operations stack_trace_fops = { +static const struct file_operations stack_trace_fops = { .open = stack_trace_open, .read = seq_read, .llseek = seq_lseek, }; +int +stack_trace_sysctl(struct ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&stack_sysctl_mutex); + + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + + if (ret || !write || + (last_stack_tracer_enabled == stack_tracer_enabled)) + goto out; + + last_stack_tracer_enabled = stack_tracer_enabled; + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + else + unregister_ftrace_function(&trace_ops); + + out: + mutex_unlock(&stack_sysctl_mutex); + return ret; +} + +static int start_stack_trace __initdata; + +static __init int enable_stacktrace(char *str) +{ + start_stack_trace = 1; + return 1; +} +__setup("stacktrace", enable_stacktrace); + static __init int stack_trace_init(void) { struct dentry *d_tracer; @@ -311,7 +352,10 @@ static __init int stack_trace_init(void) if (!entry) pr_warning("Could not create debugfs 'stack_trace' entry\n"); - register_ftrace_function(&trace_ops); + if (start_stack_trace) { + register_ftrace_function(&trace_ops); + stack_tracer_enabled = 1; + } return 0; } -- cgit v0.10.2 From e05a43b744fb9518cbf8539a7ef33164ac60a70f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Dec 2008 09:43:00 -0500 Subject: trace: better use of stack_trace_enabled for boot up code Impact: clean up Andrew Morton suggested to use the stack_tracer_enabled variable to decide whether or not to start stack tracing on bootup. This lets us remove the start_stack_trace variable. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4842c96..d0871bc 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -308,7 +308,7 @@ stack_trace_sysctl(struct ctl_table *table, int write, mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == stack_tracer_enabled)) @@ -326,11 +326,10 @@ stack_trace_sysctl(struct ctl_table *table, int write, return ret; } -static int start_stack_trace __initdata; - static __init int enable_stacktrace(char *str) { - start_stack_trace = 1; + stack_tracer_enabled = 1; + last_stack_tracer_enabled = 1; return 1; } __setup("stacktrace", enable_stacktrace); @@ -352,10 +351,8 @@ static __init int stack_trace_init(void) if (!entry) pr_warning("Could not create debugfs 'stack_trace' entry\n"); - if (start_stack_trace) { + if (stack_tracer_enabled) register_ftrace_function(&trace_ops); - stack_tracer_enabled = 1; - } return 0; } -- cgit v0.10.2 From ea3a6d6d60b2504c573fe3415f6617e8310c0236 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 17 Dec 2008 15:05:36 -0500 Subject: ftrace: add not to regex on filtering functions Impact: enhancement Ingo Molnar has asked about a way to remove items from the filter lists. Currently, you can only add or replace items. The way items are added to the list is through opening one of the list files (set_ftrace_filter or set_ftrace_notrace) via append. If the file is opened for truncate, the list is cleared. echo spin_lock > /debug/tracing/set_ftrace_filter The above will replace the list with only spin_lock echo spin_lock >> /debug/tracing/set_ftrace_filter The above will add spin_lock to the list. Now this patch adds: echo '!spin_lock' >> /debug/tracing/set_ftrace_filter This will remove spin_lock from the list. The limited glob features of these lists also can be notted. echo '!spin_*' >> /debug/tracing/set_ftrace_filter This will remove all functions that start with 'spin_' Note: echo '!spin_*' > /debug/tracing/set_ftrace_filter will simply clear out the list (notice the '>' instead of '>>') Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12f80e..2f32969 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1047,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable) int type = MATCH_FULL; unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i, match = 0, search_len = 0; + int not = 0; + + if (buff[0] == '!') { + not = 1; + buff++; + len--; + } for (i = 0; i < len; i++) { if (buff[i] == '*') { @@ -1100,8 +1107,12 @@ ftrace_match(unsigned char *buff, int len, int enable) matched = 1; break; } - if (matched) - rec->flags |= flag; + if (matched) { + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; + } } pg = pg->next; } -- cgit v0.10.2 From 3d9101e92529e1ff6014f95a69afc82f37b9b13a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Dec 2008 22:34:13 +0100 Subject: trace: fix task state printout Impact: fix occasionally incorrect trace output The tracing code has interesting varieties of printing out task state. Unfortunalely only one of the instances is correct as it copies the code from sched.c:sched_show_task(). The others are plain wrong as they treatthe bitfield as an integer offset into the character array. Also the size check of the character array is wrong as it includes the trailing \0. Use a common state decoder inline which does the Right Thing. Signed-off-by: Thomas Gleixner Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d86e325..8031005 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1301,6 +1301,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + /* * The message is supposed to contain an ending newline. * If the printing stops prematurely, try to add a newline of our own. @@ -1396,12 +1403,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - - state = field->prev_state ? - __ffs(field->prev_state) + 1 : 0; - S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, @@ -1519,10 +1522,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", field->prev_pid, field->prev_prio, @@ -1621,12 +1622,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1712,12 +1710,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); -- cgit v0.10.2 From 6d102bc68f3dd2ae0e305b09170b1751aa67baeb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 17 Dec 2008 17:48:23 +0800 Subject: tracing/ring-buffer: remove unused ring_buffer size Impact: remove dead code struct ring_buffer.size is not set after ring_buffer is initialized or resized. it is always 0. we can use "buffer->pages * PAGE_SIZE" to get ring_buffer's size Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index eab81f9..bb6922a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -258,7 +258,6 @@ struct ring_buffer_per_cpu { }; struct ring_buffer { - unsigned long size; unsigned pages; unsigned flags; int cpus; @@ -2210,8 +2209,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return -EINVAL; /* At least make sure the two buffers are somewhat the same */ - if (buffer_a->size != buffer_b->size || - buffer_a->pages != buffer_b->pages) + if (buffer_a->pages != buffer_b->pages) return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; -- cgit v0.10.2 From 3bddb9a3246f6df5cf3b7655cb541ac10203bb71 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 01:03:29 +0100 Subject: tracing: fix warning in kernel/trace/trace.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/trace/trace.c: In function ‘print_lat_fmt’: kernel/trace/trace.c:1826: warning: unused variable ‘state’ Triggers because 'state' has become unused - remove it. Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a3d6b3..49fc720 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1816,7 +1816,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) char *comm; int S, T; int i; - unsigned state; if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; -- cgit v0.10.2 From c71dd42db2c6f1637b92502a214587431c1a6ad2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 01:09:51 +0100 Subject: tracing: fix warnings in kernel/trace/trace_sched_switch.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit these warnings: kernel/trace/trace_sched_switch.c: In function ‘tracing_sched_register’: kernel/trace/trace_sched_switch.c:96: warning: passing argument 1 of ‘register_trace_sched_wakeup_new’ from incompatible pointer type kernel/trace/trace_sched_switch.c:112: warning: passing argument 1 of ‘unregister_trace_sched_wakeup_new’ from incompatible pointer type kernel/trace/trace_sched_switch.c: In function ‘tracing_sched_unregister’: kernel/trace/trace_sched_switch.c:121: warning: passing argument 1 of ‘unregister_trace_sched_wakeup_new’ from incompatible pointer type Trigger because sched_wakeup_new tracepoints need the same trace signature as sched_wakeup - which was changed recently. Fix it. Signed-off-by: Ingo Molnar diff --git a/include/trace/sched.h b/include/trace/sched.h index f4549d5..bc4c9ea 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -21,8 +21,8 @@ DECLARE_TRACE(sched_wakeup, TPARGS(rq, p)); DECLARE_TRACE(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p)); + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success)); DECLARE_TRACE(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, diff --git a/kernel/sched.c b/kernel/sched.c index d377097..ac5a70a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2457,7 +2457,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_sched_wakeup_new(rq, p); + trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8633905..781d72e 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -247,3 +247,4 @@ __init static int init_sched_switch_trace(void) return register_tracer(&sched_switch_trace); } device_initcall(init_sched_switch_trace); + -- cgit v0.10.2 From 213cc060797378059a28ebc5c539f3e9a80160bd Mon Sep 17 00:00:00 2001 From: Pekka J Enberg Date: Fri, 19 Dec 2008 12:08:39 +0200 Subject: ftrace: introduce tracing_reset_online_cpus() helper Impact: cleanup This patch factors out common code from multiple tracers into a tracing_reset_online_cpus() function and converts the tracers to use it. Signed-off-by: Pekka Enberg Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0eb6d48..79db26e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -679,6 +679,16 @@ void tracing_reset(struct trace_array *tr, int cpu) ftrace_enable_cpu(); } +void tracing_reset_online_cpus(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + #define SAVED_CMDLINES 128 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fc75dce..cc7a4f8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -374,6 +374,7 @@ struct trace_iterator { int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); +void tracing_reset_online_cpus(struct trace_array *tr); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a4fa2c5..3ccebde 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -37,16 +37,6 @@ void disable_boot_trace(void) tracing_stop_sched_switch_record(); } -static void reset_boot_trace(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static int boot_trace_init(struct trace_array *tr) { int cpu; @@ -130,7 +120,7 @@ struct tracer boot_tracer __read_mostly = { .name = "initcall", .init = boot_trace_init, - .reset = reset_boot_trace, + .reset = tracing_reset_online_cpus, .print_line = initcall_print_line, }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e74f6d0..9236d7e 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,20 +16,10 @@ #include "trace.h" -static void function_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void start_function_trace(struct trace_array *tr) { tr->cpu = get_cpu(); - function_reset(tr); + tracing_reset_online_cpus(tr); put_cpu(); tracing_start_cmdline_record(); @@ -55,7 +45,7 @@ static void function_trace_reset(struct trace_array *tr) static void function_trace_start(struct trace_array *tr) { - function_reset(tr); + tracing_reset_online_cpus(tr); } static struct tracer function_trace __read_mostly = diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ee29e01..b6a3e20 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -25,16 +25,6 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_buffer per_cpu(buffer, smp_processor_id()) -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void bts_trace_start_cpu(void *arg) { if (this_tracer) @@ -54,7 +44,7 @@ static void bts_trace_start(struct trace_array *tr) { int cpu; - bts_trace_reset(tr); + tracing_reset_online_cpus(tr); for_each_cpu_mask(cpu, cpu_possible_map) smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); @@ -78,7 +68,7 @@ static void bts_trace_stop(struct trace_array *tr) static int bts_trace_init(struct trace_array *tr) { - bts_trace_reset(tr); + tracing_reset_online_cpus(tr); bts_trace_start(tr); return 0; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2fb6da6..fffcb06 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -22,14 +22,10 @@ static unsigned long prev_overruns; static void mmio_reset_data(struct trace_array *tr) { - int cpu; - overrun_detected = false; prev_overruns = 0; - tr->time_start = ftrace_now(tr->cpu); - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); } static int mmio_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 781d72e..add2c1f 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -72,16 +72,6 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) local_irq_restore(flags); } -static void sched_switch_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static int tracing_sched_register(void) { int ret; @@ -197,7 +187,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) static void start_sched_trace(struct trace_array *tr) { - sched_switch_reset(tr); + tracing_reset_online_cpus(tr); tracing_start_sched_switch_record(); } @@ -221,7 +211,7 @@ static void sched_switch_trace_reset(struct trace_array *tr) static void sched_switch_trace_start(struct trace_array *tr) { - sched_switch_reset(tr); + tracing_reset_online_cpus(tr); tracing_start_sched_switch(); } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 54960ed..01becf1 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -234,20 +234,10 @@ static void stop_stack_timers(void) stop_stack_timer(cpu); } -static void stack_reset(struct trace_array *tr) -{ - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); -} - static void start_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); - stack_reset(tr); + tracing_reset_online_cpus(tr); start_stack_timers(); tracer_enabled = 1; mutex_unlock(&sample_timer_lock); -- cgit v0.10.2 From bf53de907dfdaac178c92d774aae7370d7b97d20 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:10:24 +0100 Subject: x86, bts: add fork and exit handling Impact: introduce new ptrace facility Add arch_ptrace_untrace() function that is called when the tracer detaches (either voluntarily or when the tracing task dies); ptrace_disable() is only called on a voluntary detach. Add ptrace_fork() and arch_ptrace_fork(). They are called when a traced task is forked. Clear DS and BTS related fields on fork. Release DS resources and reclaim memory in ptrace_untrace(). This releases resources already when the tracing task dies. We used to do that when the traced task dies. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index ee0ea3a..a8f672b 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -252,12 +252,21 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); +/* + * Task clone/init and cleanup work + */ +extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); +extern void ds_exit_thread(struct task_struct *tsk); + #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} +static inline void ds_copy_thread(struct task_struct *tsk, + struct task_struct *father) {} +static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index fbf7442..6d34d95 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); +extern void x86_ptrace_untrace(struct task_struct *); +extern void x86_ptrace_fork(struct task_struct *child, + unsigned long clone_flags); + +#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) +#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) + #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 98d271e..da91701 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1017,3 +1017,14 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(next->thread.debugctlmsr); } + +void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +{ + clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); + tsk->thread.ds_ctx = NULL; +} + +void ds_exit_thread(struct task_struct *tsk) +{ + WARN_ON(tsk->thread.ds_ctx); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 605eff9..3ba155d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -60,6 +60,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -251,17 +252,8 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. */ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -343,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1cfd2a4..416fb92 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,6 +53,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -236,17 +237,8 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. */ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -376,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (err) goto out; } + + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45e9855..6ad2bb6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -769,8 +769,47 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } + +static void ptrace_bts_fork(struct task_struct *tsk) +{ + tsk->bts = NULL; + tsk->bts_buffer = NULL; + tsk->bts_size = 0; + tsk->thread.bts_ovfl_signal = 0; +} + +static void ptrace_bts_untrace(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + kfree(child->bts_buffer); + child->bts_buffer = NULL; + child->bts_size = 0; + } +} + +static void ptrace_bts_detach(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} +#else +static inline void ptrace_bts_fork(struct task_struct *tsk) {} +static inline void ptrace_bts_detach(struct task_struct *child) {} +static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ +void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + ptrace_bts_fork(child); +} + +void x86_ptrace_untrace(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} + /* * Called by kernel/ptrace.c when detaching.. * @@ -782,16 +821,7 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif -#ifdef CONFIG_X86_PTRACE_BTS - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; - } -#endif /* CONFIG_X86_PTRACE_BTS */ + ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 22641d5..98b93ca 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ @@ -313,6 +314,27 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_stop(code, info) do { } while (0) #endif +#ifndef arch_ptrace_untrace +/* + * Do machine-specific work before untracing child. + * + * This is called for a normal detach as well as from ptrace_exit() + * when the tracing task dies. + * + * Called with write_lock(&tasklist_lock) held. + */ +#define arch_ptrace_untrace(task) do { } while (0) +#endif + +#ifndef arch_ptrace_fork +/* + * Do machine-specific work to initialize a new task. + * + * This is called from copy_process(). + */ +#define arch_ptrace_fork(child, clone_flags) do { } while (0) +#endif + extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/kernel/fork.c b/kernel/fork.c index 7b93da7..65ce60a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1096,6 +1096,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif + if (unlikely(ptrace_reparented(current))) + ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4c8bcd7..100a71c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -25,6 +25,17 @@ #include #include + +/* + * Initialize a new task whose father had been ptraced. + * + * Called from copy_process(). + */ +void ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + arch_ptrace_fork(child, clone_flags); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. @@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } -- cgit v0.10.2 From c5dee6177f4bd2095aab7d9be9f6ebdddd6deee9 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:17:02 +0100 Subject: x86, bts: memory accounting Impact: move the BTS buffer accounting to the mlock bucket Add alloc_locked_buffer() and free_locked_buffer() functions to mm/mlock.c to kalloc a buffer and account the locked memory to current. Account the memory for the BTS buffer to the tracer. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 6ad2bb6..0a5df5f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -650,6 +650,24 @@ static int ptrace_bts_drain(struct task_struct *child, return drained; } +static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) +{ + child->bts_buffer = alloc_locked_buffer(size); + if (!child->bts_buffer) + return -ENOMEM; + + child->bts_size = size; + + return 0; +} + +static void ptrace_bts_free_buffer(struct task_struct *child) +{ + free_locked_buffer(child->bts_buffer, child->bts_size); + child->bts_buffer = NULL; + child->bts_size = 0; +} + static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -679,14 +697,13 @@ static int ptrace_bts_config(struct task_struct *child, if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != child->bts_size)) { - kfree(child->bts_buffer); + int error; - child->bts_size = cfg.size; - child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); - if (!child->bts_buffer) { - child->bts_size = 0; - return -ENOMEM; - } + ptrace_bts_free_buffer(child); + + error = ptrace_bts_allocate_buffer(child, cfg.size); + if (error < 0) + return error; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -701,10 +718,8 @@ static int ptrace_bts_config(struct task_struct *child, if (IS_ERR(child->bts)) { int error = PTR_ERR(child->bts); - kfree(child->bts_buffer); + ptrace_bts_free_buffer(child); child->bts = NULL; - child->bts_buffer = NULL; - child->bts_size = 0; return error; } @@ -784,6 +799,9 @@ static void ptrace_bts_untrace(struct task_struct *child) ds_release_bts(child->bts); child->bts = NULL; + /* We cannot update total_vm and locked_vm since + child's mm is already gone. But we can reclaim the + memory. */ kfree(child->bts_buffer); child->bts_buffer = NULL; child->bts_size = 0; @@ -792,7 +810,12 @@ static void ptrace_bts_untrace(struct task_struct *child) static void ptrace_bts_detach(struct task_struct *child) { - ptrace_bts_untrace(child); + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + ptrace_bts_free_buffer(child); + } } #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} diff --git a/include/linux/mm.h b/include/linux/mm.h index ffee2f7..9979d3f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1286,5 +1286,7 @@ int vmemmap_populate_basepages(struct page *start_page, int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); +extern void *alloc_locked_buffer(size_t size); +extern void free_locked_buffer(void *buffer, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 1ada366..3035a56 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -667,3 +667,48 @@ void user_shm_unlock(size_t size, struct user_struct *user) spin_unlock(&shmlock_user_lock); free_uid(user); } + +void *alloc_locked_buffer(size_t size) +{ + unsigned long rlim, vm, pgsz; + void *buffer = NULL; + + pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + pgsz; + if (rlim < vm) + goto out; + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + pgsz; + if (rlim < vm) + goto out; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto out; + + current->mm->total_vm += pgsz; + current->mm->locked_vm += pgsz; + + out: + up_write(¤t->mm->mmap_sem); + return buffer; +} + +void free_locked_buffer(void *buffer, size_t size) +{ + unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + current->mm->total_vm -= pgsz; + current->mm->locked_vm -= pgsz; + + up_write(¤t->mm->mmap_sem); + + kfree(buffer); +} -- cgit v0.10.2 From 3ddeb912f41801fd1968c7880d031702a396e4d0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 20 Dec 2008 17:15:14 +0800 Subject: ftrace: enable format arguments checking Impact: broaden gcc printf format checks for ftrace_printk() format arguments checking for ftrace_printk() is __printf(1, 2), not __printf(1, 0). Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 04b52e6..677432b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -303,7 +303,7 @@ extern void ftrace_dump(void); static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int -ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); +ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } -- cgit v0.10.2 From a8ccf1d6f60e3e6ae63122e02378cd4d40dd4aac Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 11:32:24 -0500 Subject: ring-buffer: fix dangling commit race Impact: fix stuck trace-buffers If an interrupt comes in during the rb_set_commit_to_write and pushes the tail page forward just at the right time, the commit updates will miss the adding of the interrupt data. This will cause the commit pointer to cease from moving forward. Thanks to Jiaying Zhang for finding this race. Reported-by: Jiaying Zhang Signed-off-by: Steven Rostedt Cc: Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb6922a..d03f4f4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -838,6 +838,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * back to us). This allows us to do a simple loop to * assign the commit to the tail. */ + again: while (cpu_buffer->commit_page != cpu_buffer->tail_page) { cpu_buffer->commit_page->page->commit = cpu_buffer->commit_page->write; @@ -853,6 +854,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->commit_page->write; barrier(); } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; } static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -- cgit v0.10.2 From 98db8df777438e16ad0f44a0fba05ebbdb73db8d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 11:32:25 -0500 Subject: ring-buffer: prevent false positive warning Impact: eliminate false WARN_ON message If an interrupt goes off after the setting of the local variable tail_page and before incrementing the write index of that page, the interrupt could push the commit forward to the next page. Later a check is made to see if interrupts pushed the buffer around the entire ring buffer by comparing the next page to the last commited page. This can produce a false positive if the interrupt had pushed the commit page forward as stated above. Thanks to Jiaying Zhang for finding this race. Reported-by: Jiaying Zhang Signed-off-by: Steven Rostedt Cc: Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d03f4f4..76f34c0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -962,12 +962,15 @@ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page; + struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; unsigned long tail, write; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); tail = write - length; @@ -993,7 +996,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * it all the way around the buffer, bail, and warn * about it. */ - if (unlikely(next_page == cpu_buffer->commit_page)) { + if (unlikely(next_page == commit_page)) { WARN_ON_ONCE(1); goto out_unlock; } -- cgit v0.10.2 From 67be403d897f818b1a5ecc201967b0ee6a0332f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 24 Dec 2008 21:08:37 +0100 Subject: Revert "x86: disable X86_PTRACE_BTS" This reverts commit 40f15ad8aadff5ebb621b17a6f303ad2cd3f847d. The CONFIG_X86_PTRACE_BTS bugs have been fixed via: c5dee61: x86, bts: memory accounting bf53de9: x86, bts: add fork and exit handling Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b54903ef..85a7857 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -521,7 +521,6 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR - depends on BROKEN help This adds a ptrace interface to the hardware's branch trace store. -- cgit v0.10.2 From 0ca59dd948a51c95d5a366d35f897bc5ef9df55d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Dec 2008 23:30:02 +0100 Subject: tracing/ftrace: don't trace on early stage of a secondary cpu boot, v3 Impact: fix a crash/hard-reboot on certain configs while enabling cpu runtime On some archs, the boot of a secondary cpu can have an early fragile state. On x86-64, the pda is not initialized on the first stage of a cpu boot but it is needed to get the cpu number and the current task pointer. This data is needed during tracing. As they were dereferenced at this stage, we got a crash while tracing a cpu being enabled at runtime. Some other archs like ia64 can have such kind of issue too. Changes on v2: We dropped the previous solution of a per-arch called function to guess the current state of a cpu. That could slow down the tracing. This patch removes the -pg flag on arch/x86/kernel/cpu/common.c where the low level cpu boot functions exist, on start_secondary() and a helper function used at this stage. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c2a812e..b8a1799 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -85,7 +85,8 @@ static inline void native_write_msr(unsigned int msr, asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); } -static inline int native_write_msr_safe(unsigned int msr, +/* Can be uninlined because referenced by paravirt */ +notrace static inline int native_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int err; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec607..4ae495a 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -2,6 +2,11 @@ # Makefile for x86-compatible CPU details and quirks # +# Don't trace early stages of a secondary CPU boot +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_common.o = -pg +endif + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f71f96f..f6174d2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -287,7 +287,7 @@ static int __cpuinitdata unsafe_smp; /* * Activate a secondary processor. */ -static void __cpuinit start_secondary(void *unused) +notrace static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too -- cgit v0.10.2 From 468a15bb4cc61694495cc5ed7ffca29e87c79b69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Dec 2008 08:07:03 +0100 Subject: sched, trace: update trace_sched_wakeup() Impact: extend the wakeup tracepoint with the info whether the wakeup was real Add the information needed to distinguish 'real' wakeups from 'false' wakeups. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar diff --git a/include/trace/sched.h b/include/trace/sched.h index bc4c9ea..0d81098 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -17,8 +17,8 @@ DECLARE_TRACE(sched_wait_task, TPARGS(rq, p)); DECLARE_TRACE(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p)); + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success)); DECLARE_TRACE(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), diff --git a/kernel/sched.c b/kernel/sched.c index ceda579..dcb39bc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2324,7 +2324,7 @@ out_activate: success = 1; out_running: - trace_sched_wakeup(rq, p); + trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index add2c1f..df175cb 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -49,7 +49,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, } static void -probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) +probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) { struct trace_array_cpu *data; unsigned long flags; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0067b49..43586b6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -211,7 +211,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(struct rq *rq, struct task_struct *p) +probe_wakeup(struct rq *rq, struct task_struct *p, int success) { int cpu = smp_processor_id(); unsigned long flags; -- cgit v0.10.2