From bd3bff9e20f454b242d979ec2f9a4dca0d5fa06f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:41 +0200 Subject: sched: add latency tracer callbacks to the scheduler add 3 lightweight callbacks to the tracer backend. zero impact if tracing is turned off. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 5395a61..717cab8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2117,6 +2117,32 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern void +ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next); +#else +static inline void +ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +{ +} +#endif + +#ifdef CONFIG_SCHED_TRACER +extern void +ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr); +extern void +ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr); +#else +static inline void +ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +{ +} +static inline void +ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) +{ +} +#endif + extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a..463dcdb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2467,6 +2467,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ + ftrace_wake_up_task(p, rq->curr); schedstat_inc(p, se.nr_wakeups); if (sync) schedstat_inc(p, se.nr_wakeups_sync); @@ -2611,6 +2612,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } + ftrace_wake_up_new_task(p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2783,6 +2785,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); + ftrace_ctx_switch(prev, next); mm = next->mm; oldmm = prev->active_mm; /* -- cgit v0.10.2 From 7c731e0a495e25e79dc1e9e68772a67a55721a65 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:41 +0200 Subject: ftrace: make the task state char-string visible to all The tracer wants to be able to convert the state number into a user-visible character. This patch pulls that conversion string out of the scheduler into the header. This way, if it were ever to change, other parts of the kernel will know.
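For illustration only (not part of the patch): with the string exported, a user of the header can decode a task state the same way sched_show_task() does; the helper name below is hypothetical.

	static const char state_chars[] = TASK_STATE_TO_CHAR_STR;

	/* hypothetical helper: decode a task state into its display character */
	static char task_state_char(unsigned long state)
	{
		/* state is a bitmask: 0 means running ('R'); otherwise the
		 * lowest set bit selects the next character in the string */
		unsigned long bit = state ? __ffs(state) + 1 : 0;

		return bit < sizeof(state_chars) - 1 ? state_chars[bit] : '?';
	}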
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 717cab8..6e26f1f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2237,6 +2237,8 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) } #endif /* CONFIG_MM_OWNER */ +#define TASK_STATE_TO_CHAR_STR "RSDTtZX" + #endif /* __KERNEL__ */ #endif diff --git a/kernel/sched.c b/kernel/sched.c index 463dcdb..73e6008 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5729,7 +5729,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { -- cgit v0.10.2 From 502825282e6f79c975a644afc124432ec1744de4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:41 +0200 Subject: ftrace: add preempt_enable/disable notrace macros The tracer may need to call preempt_enable and disable functions for time keeping and such. The trace gets ugly when we see these functions show up for all traces. To make the output cleaner this patch adds preempt_enable_notrace and preempt_disable_notrace to be used by tracer (and debugging) functions. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 23f0c54..36b03d5 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -52,6 +52,34 @@ do { \ preempt_check_resched(); \ } while (0) +/* For debugging and tracer internals only! */ +#define add_preempt_count_notrace(val) \ + do { preempt_count() += (val); } while (0) +#define sub_preempt_count_notrace(val) \ + do { preempt_count() -= (val); } while (0) +#define inc_preempt_count_notrace() add_preempt_count_notrace(1) +#define dec_preempt_count_notrace() sub_preempt_count_notrace(1) + +#define preempt_disable_notrace() \ +do { \ + inc_preempt_count_notrace(); \ + barrier(); \ +} while (0) + +#define preempt_enable_no_resched_notrace() \ +do { \ + barrier(); \ + dec_preempt_count_notrace(); \ +} while (0) + +/* preempt_check_resched is OK to trace */ +#define preempt_enable_notrace() \ +do { \ + preempt_enable_no_resched_notrace(); \ + barrier(); \ + preempt_check_resched(); \ +} while (0) + #else #define preempt_disable() do { } while (0) @@ -59,6 +87,10 @@ do { \ #define preempt_enable() do { } while (0) #define preempt_check_resched() do { } while (0) +#define preempt_disable_notrace() do { } while (0) +#define preempt_enable_no_resched_notrace() do { } while (0) +#define preempt_enable_notrace() do { } while (0) + #endif #ifdef CONFIG_PREEMPT_NOTIFIERS -- cgit v0.10.2 From ffdc1a09ae7e2cbd714a446ee38a27f625b5f1c8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:41 +0200 Subject: tracing: add notrace to linkage.h notrace signals that a function should not be traced. Most of the time this is used by tracers to annotate code that cannot be traced - it's in a volatile state (such as in user vdso context or NMI context) or it's in the tracer internals. 
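For illustration only (not part of the patch), the annotation is used like any other function attribute. Per the series' own description, gcc then emits no profiling (mcount) call into the function even when the file is built with -pg; the helper name here is hypothetical:

	#include <linux/linkage.h>

	/* hypothetical tracer-internal helper; gcc will not instrument it */
	static notrace void tracer_internal_helper(void)
	{
		/* safe to run from contexts where the tracer must not recurse */
	}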
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 2119610..14f329c 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -3,6 +3,8 @@ #include +#define notrace __attribute__((no_instrument_function)) + #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else -- cgit v0.10.2 From 23adec554a7648f99c8acc0caf49c66320cd2b84 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:41 +0200 Subject: x86: add notrace annotations to vsyscall. Add the notrace annotations to the vsyscall functions - there we are not in kernel context yet, so the tracer function cannot (and must not) be called. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 61efa2f..4063dfa 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -42,7 +42,8 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) \ + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","cx","memory" /* diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 23476c2..5cb8f75 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -23,7 +23,7 @@ #define gtod vdso_vsyscall_gtod_data -static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; asm("syscall" : "=a" (ret) : @@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts) return ret; } -static inline long vgetns(void) +notrace static inline long vgetns(void) { long v; cycles_t (*vread)(void); @@ -40,7 +40,7 @@ static inline long vgetns(void) return (v * gtod->clock.mult) >> gtod->clock.shift; } -static noinline int do_realtime(struct timespec *ts) +notrace static noinline int do_realtime(struct timespec *ts) { unsigned long seq, ns; do { @@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts) } /* Copy of the version in kernel/time.c which we cannot directly access */ -static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +notrace static void +vset_normalized_timespec(struct timespec *ts, long sec, long nsec) { while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; @@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) ts->tv_nsec = nsec; } -static noinline int do_monotonic(struct timespec *ts) +notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; do { @@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts) return 0; } -int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled && gtod->clock.vread)) switch (clock) { @@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index c8097f1..9fbc6b2 100644 --- 
a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -13,7 +13,8 @@ #include #include "vextern.h" -long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; diff --git a/include/asm-x86/vsyscall.h b/include/asm-x86/vsyscall.h index 17b3700..6b66ff9 100644 --- a/include/asm-x86/vsyscall.h +++ b/include/asm-x86/vsyscall.h @@ -24,7 +24,8 @@ enum vsyscall_num { ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) #define __section_vsyscall_clock __attribute__ \ ((unused, __section__ (".vsyscall_clock"),aligned(16))) -#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) +#define __vsyscall_fn \ + __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 -- cgit v0.10.2 From 6e766410c4babd37bc7cd5e25009c179781742c8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 May 2008 21:20:41 +0200 Subject: ftrace: annotate core code that should not be traced Mark functions in core code that should not be traced with the "notrace" attribute. The "notrace" attribute will prevent gcc from adding a call to ftrace on the annotated functions. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 6c90fb9..e555ab6 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -7,7 +7,7 @@ #include #include -unsigned int debug_smp_processor_id(void) +notrace unsigned int debug_smp_processor_id(void) { unsigned long preempt_count = preempt_count(); int this_cpu = raw_smp_processor_id(); -- cgit v0.10.2 From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: add basic support for gcc profiler instrumentation If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is set to a non-zero value, the ftrace routine will be called every time we enter a kernel function that is not marked with the "notrace" attribute. The ftrace routine will then call a registered function, if one happens to be registered. [ This code has been highly hacked by Steven Rostedt and Ingo Molnar, so don't blame Arnaldo for all of this ;-) ] Update: It is now possible to register more than one ftrace function. If only one ftrace function is registered, that will be the function that ftrace calls directly. If more than one function is registered, then ftrace will call a function that will loop through the functions to call.
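To make the registration interface concrete, here is a minimal sketch of a caller (the callback and ops names are illustrative, not part of the patch); note that the callback itself must be marked notrace, or it would recurse into the tracer:

	#include <linux/ftrace.h>

	/* called on entry to every traced kernel function */
	static void notrace my_trace_func(unsigned long ip, unsigned long parent_ip)
	{
		/* ip: address of the traced function; parent_ip: its call site */
	}

	static struct ftrace_ops my_ops __read_mostly = {
		.func = my_trace_func,
	};

	/* in some init path: */
	register_ftrace_function(&my_ops);
	/* ... and when done: */
	unregister_ftrace_function(&my_ops);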
Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/Makefile b/Makefile index 20b3235..b4a273f 100644 --- a/Makefile +++ b/Makefile @@ -528,6 +528,10 @@ KBUILD_CFLAGS += -g KBUILD_AFLAGS += -gdwarf-2 endif +ifdef CONFIG_FTRACE +KBUILD_CFLAGS += -pg +endif + # We trigger additional mismatches with less inlining ifdef CONFIG_DEBUG_SECTION_MISMATCH KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe361ae..c742dfe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2a609dc..f47b9b5 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1109,6 +1109,33 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ +#ifdef CONFIG_FTRACE +ENTRY(mcount) + cmpl $ftrace_stub, ftrace_trace_function + jnz trace + +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + + jmp ftrace_stub +END(mcount) +#endif + .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df..f046e0c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -54,6 +54,43 @@ .code64 +#ifdef CONFIG_FTRACE +ENTRY(mcount) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + retq + +trace: + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + + call *ftrace_trace_function + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + jmp ftrace_stub +END(mcount) +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h new file mode 100644 index 0000000..b96ef14 --- /dev/null +++ b/include/linux/ftrace.h @@ -0,0 +1,38 @@ +#ifndef _LINUX_FTRACE_H +#define _LINUX_FTRACE_H + +#ifdef CONFIG_FTRACE + +#include + +#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) + +typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); + +struct ftrace_ops { + ftrace_func_t func; + struct ftrace_ops *next; +}; + +/* + * The ftrace_ops must be a static and should also + * be read_mostly. These functions do modify read_mostly variables + * so use them sparely. Never free an ftrace_op or modify the + * next pointer after it has been registered. Even after unregistering + * it, the next pointer may still be used internally. 
+ */ +int register_ftrace_function(struct ftrace_ops *ops); +int unregister_ftrace_function(struct ftrace_ops *ops); +void clear_ftrace_function(void); + +extern void ftrace_stub(unsigned long a0, unsigned long a1); +extern void mcount(void); + +#else /* !CONFIG_FTRACE */ +# define register_ftrace_function(ops) do { } while (0) +# define unregister_ftrace_function(ops) do { } while (0) +# define clear_ftrace_function(ops) do { } while (0) +#endif /* CONFIG_FTRACE */ +#endif /* _LINUX_FTRACE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938a..fa05f6d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_FTRACE) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 0000000..8185c91 --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,5 @@ +# +# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# +config HAVE_FTRACE + bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 0000000..bf4fd21 --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FTRACE) += libftrace.o + +libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 0000000..b6a80b9 --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,138 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2004-2008 Ingo Molnar + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include + +static DEFINE_SPINLOCK(ftrace_func_lock); +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; + +/* mcount is defined per arch in assembly */ +EXPORT_SYMBOL(mcount); + +notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op = ftrace_list; + + /* in case someone actually ports this to alpha! */ + read_barrier_depends(); + + while (op != &ftrace_list_end) { + /* silly alpha */ + read_barrier_depends(); + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + + spin_lock_irqsave(&ftrace_func_lock, flags); + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the ftrace_list. + */ + smp_wmb(); + ftrace_list = ops; + /* + * For one func, simply call it directly. 
+ * For more than one func, call the chain. + */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return 0; +} + +/** + * unregister_ftrace_function - unregister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + struct ftrace_ops **p; + int ret = 0; + + spin_lock_irqsave(&ftrace_func_lock, flags); + + /* + * If we are the only function, then the ftrace pointer is + * pointing directly to that function. + */ + if (ftrace_list == ops && ops->next == &ftrace_list_end) { + ftrace_trace_function = ftrace_stub; + ftrace_list = &ftrace_list_end; + goto out; + } + + for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) { + ret = -1; + goto out; + } + + *p = (*p)->next; + + /* If we only have one func left, then call that directly */ + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + + out: + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return ret; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This NULLs the ftrace function and in essence stops + * tracing. There may be a lag before other CPUs stop + * calling the old function. + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; +} diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d2099f4..d8b6279 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -634,6 +634,8 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. +source kernel/trace/Kconfig + config PROVIDE_OHCI1394_DMA_INIT bool "Remote debugging over FireWire early on boot" depends on PCI && X86 -- cgit v0.10.2 From bc0c38d139ec7fcd5c030aea16b008f3732e42ac Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: latency tracer infrastructure This patch adds the latency tracer infrastructure. This patch does not add anything that will select and turn it on, but will be used by later patches. If it were to be compiled, it would add the following files to the debugfs: The root tracing directory: /debugfs/tracing/ This patch also adds the following files: available_tracers list of available tracers. Currently no tracers are available. Looking into this file only shows "none", which is used to unregister all tracers. current_tracer The tracer that is currently active. Empty on start up. To switch to a tracer, simply echo one of the tracers that are listed in available_tracers: example: (used with later patches) echo function > /debugfs/tracing/current_tracer To disable the tracer: echo disable > /debugfs/tracing/current_tracer tracing_enabled echoing "1" into this file starts the ftrace function tracing (if sysctl kernel.ftrace_enabled=1) echoing "0" turns it off. latency_trace This file is read-only and holds the result of the trace. trace This file outputs an easier-to-read version of the trace. iter_ctrl Controls the way the trace output looks. So far there are two controls: echoing in "symonly" will only show the kallsyms variables without the addresses (if kallsyms was configured) echoing in "verbose" will change the output to show a lot more data, though it is not very easy for humans to understand.
echoing in "nosymonly" turns off symonly. echoing in "noverbose" turns off verbose. Signed-off-by: Steven Rostedt Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/Makefile b/kernel/Makefile index fa05f6d..7e344e7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_TRACING) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8185c91..ce70677 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -3,3 +3,8 @@ # config HAVE_FTRACE bool + +config TRACING + bool + select DEBUG_FS + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bf4fd21..7af4031 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,3 +1,5 @@ obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_TRACING) += trace.o + libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c new file mode 100644 index 0000000..1b8eca7 --- /dev/null +++ b/kernel/trace/trace.c @@ -0,0 +1,1547 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Originally taken from the RT patch by: + * Arnaldo Carvalho de Melo + * + * Based on code from the latency_tracer, that is: + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_thresh; + +static long notrace +ns2usecs(cycle_t nsec) +{ + nsec += 500; + do_div(nsec, 1000); + return nsec; +} + +static atomic_t tracer_counter; +static struct trace_array global_trace; + +static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); + +static struct trace_array max_tr; + +static DEFINE_PER_CPU(struct trace_array_cpu, max_data); + +static int tracer_enabled; +static unsigned long trace_nr_entries = 4096UL; + +static struct tracer *trace_types __read_mostly; +static struct tracer *current_trace __read_mostly; +static int max_tracer_type_len; + +static DEFINE_MUTEX(trace_types_lock); + +static int __init set_nr_entries(char *str) +{ + if (!str) + return 0; + trace_nr_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("trace_entries=", set_nr_entries); + +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + + __TRACE_LAST_TYPE +}; + +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, +}; + +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, +}; + +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + +/* These must match the bit postions above */ +static const char *trace_options[] = { + "print-parent", + "sym-offset", + "sym-addr", + "verbose", + NULL +}; + +static unsigned trace_flags; + + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. 
(this way the maximum trace is permanently saved, + * for later retrieval via /debugfs/tracing/latency_trace) + */ +static void notrace +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + data = max_tr.data[cpu]; + data->saved_latency = tracing_max_latency; + + memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + data->pid = tsk->pid; + data->uid = tsk->uid; + data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + data->policy = tsk->policy; + data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(current); +} + +notrace void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data; + void *save_trace; + int i; + + /* clear out all the previous traces */ + for_each_possible_cpu(i) { + data = tr->data[i]; + save_trace = max_tr.data[i]->trace; + memcpy(max_tr.data[i], data, sizeof(*data)); + data->trace = save_trace; + } + + __update_max_tr(tr, tsk, cpu); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr - tracer + * @tsk - task with the latency + * @cpu - the cpu of the buffer to copy. + */ +notrace void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + void *save_trace; + int i; + + for_each_possible_cpu(i) + tracing_reset(max_tr.data[i]); + + save_trace = max_tr.data[cpu]->trace; + memcpy(max_tr.data[cpu], data, sizeof(*data)); + data->trace = save_trace; + + __update_max_tr(tr, tsk, cpu); +} + +int register_tracer(struct tracer *type) +{ + struct tracer *t; + int len; + int ret = 0; + + if (!type->name) { + pr_info("Tracer must have a name\n"); + return -1; + } + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(type->name, t->name) == 0) { + /* already found */ + pr_info("Trace %s already registered\n", + type->name); + ret = -1; + goto out; + } + } + + type->next = trace_types; + trace_types = type; + len = strlen(type->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +void unregister_tracer(struct tracer *type) +{ + struct tracer **t; + int len; + + mutex_lock(&trace_types_lock); + for (t = &trace_types; *t; t = &(*t)->next) { + if (*t == type) + goto found; + } + pr_info("Trace %s not registered\n", type->name); + goto out; + + found: + *t = (*t)->next; + if (strlen(type->name) != max_tracer_type_len) + goto out; + + max_tracer_type_len = 0; + for (t = &trace_types; *t; t = &(*t)->next) { + len = strlen((*t)->name); + if (len > max_tracer_type_len) + max_tracer_type_len = len; + } + out: + mutex_unlock(&trace_types_lock); +} + +void notrace tracing_reset(struct trace_array_cpu *data) +{ + data->trace_idx = 0; + atomic_set(&data->underrun, 0); +} + +#ifdef CONFIG_FTRACE +static void notrace +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + +static struct 
ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; +#endif + +notrace void tracing_start_function_trace(void) +{ + register_ftrace_function(&trace_ops); +} + +notrace void tracing_stop_function_trace(void) +{ + unregister_ftrace_function(&trace_ops); +} + +#define SAVED_CMDLINES 128 +static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; +static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; +static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; +static int cmdline_idx; +static DEFINE_SPINLOCK(trace_cmdline_lock); +atomic_t trace_record_cmdline_disabled; + +static void trace_init_cmdlines(void) +{ + memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); + cmdline_idx = 0; +} + +notrace void trace_stop_cmdline_recording(void); + +static void notrace trace_save_cmdline(struct task_struct *tsk) +{ + unsigned map; + unsigned idx; + + if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + return; + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + */ + if (!spin_trylock(&trace_cmdline_lock)) + return; + + idx = map_pid_to_cmdline[tsk->pid]; + if (idx >= SAVED_CMDLINES) { + idx = (cmdline_idx + 1) % SAVED_CMDLINES; + + map = map_cmdline_to_pid[idx]; + if (map <= PID_MAX_DEFAULT) + map_pid_to_cmdline[map] = (unsigned)-1; + + map_pid_to_cmdline[tsk->pid] = idx; + + cmdline_idx = idx; + } + + memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + + spin_unlock(&trace_cmdline_lock); +} + +static notrace char *trace_find_cmdline(int pid) +{ + char *cmdline = "<...>"; + unsigned map; + + if (!pid) + return ""; + + if (pid > PID_MAX_DEFAULT) + goto out; + + map = map_pid_to_cmdline[pid]; + if (map >= SAVED_CMDLINES) + goto out; + + cmdline = saved_cmdlines[map]; + + out: + return cmdline; +} + +notrace void tracing_record_cmdline(struct task_struct *tsk) +{ + if (atomic_read(&trace_record_cmdline_disabled)) + return; + + trace_save_cmdline(tsk); +} + +static inline notrace struct trace_entry * +tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data) +{ + unsigned long idx, idx_next; + struct trace_entry *entry; + + idx = data->trace_idx; + idx_next = idx + 1; + + if (unlikely(idx_next >= tr->entries)) { + atomic_inc(&data->underrun); + idx_next = 0; + } + + data->trace_idx = idx_next; + + if (unlikely(idx_next != 0 && atomic_read(&data->underrun))) + atomic_inc(&data->underrun); + + entry = data->trace + idx * TRACE_ENTRY_SIZE; + + return entry; +} + +static inline notrace void +tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags) +{ + struct task_struct *tsk = current; + unsigned long pc; + + pc = preempt_count(); + + entry->idx = atomic_inc_return(&tracer_counter); + entry->preempt_count = pc & 0xff; + entry->pid = tsk->pid; + entry->t = now(raw_smp_processor_id()); + entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); +} + +notrace void +ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, + unsigned long flags) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; +} + +notrace void +tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, struct task_struct *next, + unsigned long flags) +{ + struct trace_entry *entry; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_CTX; + entry->ctx.prev_pid = prev->pid; + entry->ctx.prev_prio = prev->prio; + entry->ctx.prev_state = prev->state; + entry->ctx.next_pid = next->pid; + entry->ctx.next_prio = next->prio; +} + +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, +}; + +static struct trace_entry * +trace_entry_idx(struct trace_array *tr, unsigned long idx, int cpu) +{ + struct trace_entry *array = tr->data[cpu]->trace; + unsigned long underrun; + + if (idx >= tr->entries) + return NULL; + + underrun = atomic_read(&tr->data[cpu]->underrun); + if (underrun) + idx = ((underrun - 1) + idx) % tr->entries; + else if (idx >= tr->data[cpu]->trace_idx) + return NULL; + + return &array[idx]; +} + +static struct notrace trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu) +{ + struct trace_array *tr = iter->tr; + struct trace_entry *ent, *next = NULL; + int next_cpu = -1; + int cpu; + + for_each_possible_cpu(cpu) { + if (!tr->data[cpu]->trace) + continue; + ent = trace_entry_idx(tr, iter->next_idx[cpu], cpu); + if (ent && + (!next || (long)(next->idx - ent->idx) > 0)) { + next = ent; + next_cpu = cpu; + } + } + + if (ent_cpu) + *ent_cpu = next_cpu; + + return next; +} + +static void *find_next_entry_inc(struct trace_iterator *iter) +{ + struct trace_entry *next; + int next_cpu = -1; + + next = find_next_entry(iter, &next_cpu); + + if (next) { + iter->next_idx[next_cpu]++; + iter->idx++; + } + iter->ent = next; + iter->cpu = next_cpu; + + return next ? 
iter : NULL; +} + +static void notrace * +s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *ent; + void *last_ent = iter->ent; + int i = (int)*pos; + + (*pos)++; + + /* can't go backwards */ + if (iter->idx > i) + return NULL; + + if (iter->idx < 0) + ent = find_next_entry_inc(iter); + else + ent = iter; + + while (ent && iter->idx < i) + ent = find_next_entry_inc(iter); + + iter->pos = *pos; + + if (last_ent && !ent) + seq_puts(m, "\n\nvim:ft=help\n"); + + return ent; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + void *p = NULL; + loff_t l = 0; + int i; + + mutex_lock(&trace_types_lock); + + if (!current_trace || current_trace != iter->trace) + return NULL; + + atomic_inc(&trace_record_cmdline_disabled); + + /* let the tracer grab locks here if needed */ + if (current_trace->start) + current_trace->start(iter); + + if (*pos != iter->pos) { + iter->ent = NULL; + iter->cpu = 0; + iter->idx = -1; + + for (i = 0; i < NR_CPUS; i++) + iter->next_idx[i] = 0; + + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) + ; + + } else { + l = *pos; + p = s_next(m, p, &l); + } + + return p; +} + +static void s_stop(struct seq_file *m, void *p) +{ + struct trace_iterator *iter = m->private; + + atomic_dec(&trace_record_cmdline_disabled); + + /* let the tracer release locks here if needed */ + if (current_trace && current_trace == iter->trace && iter->trace->stop) + iter->trace->stop(iter); + + mutex_unlock(&trace_types_lock); +} + +static void +seq_print_sym_short(struct seq_file *m, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + seq_printf(m, fmt, str); +#endif +} + +static void +seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + sprint_symbol(str, address); + seq_printf(m, fmt, str); +#endif +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +static void notrace +seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags) +{ + if (!ip) { + seq_printf(m, "0"); + return; + } + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + seq_print_sym_offset(m, "%s", ip); + else + seq_print_sym_short(m, "%s", ip); + + if (sym_flags & TRACE_ITER_SYM_ADDR) + seq_printf(m, " <" IP_FMT ">", ip); +} + +static void notrace print_lat_help_header(struct seq_file *m) +{ + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / \n"); + seq_puts(m, "# ||||| delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); +} + +static void notrace print_func_help_header(struct seq_file *m) +{ + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); +} + + +static void notrace +print_trace_header(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + struct trace_array_cpu *data = tr->data[tr->cpu]; + struct tracer *type = current_trace; + unsigned long underruns = 0; + unsigned long underrun; + unsigned long entries = 0; + int cpu; + const char *name = "preemption"; + + if (type) + name = type->name; + + 
for_each_possible_cpu(cpu) { + if (tr->data[cpu]->trace) { + underrun = atomic_read(&tr->data[cpu]->underrun); + if (underrun) { + underruns += underrun; + entries += tr->entries; + } else + entries += tr->data[cpu]->trace_idx; + } + } + + seq_printf(m, "%s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); + seq_puts(m, "-----------------------------------" + "---------------------------------\n"); + seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + data->saved_latency, + entries, + (entries + underruns), + tr->cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "unknown", +#endif + /* These are reserved for later use */ + 0, 0, 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, data->uid, data->nice, + data->policy, data->rt_priority); + seq_puts(m, " -----------------\n"); + + if (data->critical_start) { + seq_puts(m, " => started at: "); + seq_print_ip_sym(m, data->critical_start, sym_flags); + seq_puts(m, "\n => ended at: "); + seq_print_ip_sym(m, data->critical_end, sym_flags); + seq_puts(m, "\n"); + } + + seq_puts(m, "\n"); +} + +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + +static void notrace +lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) +{ + int hardirq, softirq; + char *comm; + + comm = trace_find_cmdline(entry->pid); + + seq_printf(m, "%8.8s-%-5d ", comm, entry->pid); + seq_printf(m, "%d", cpu); + seq_printf(m, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + seq_putc(m, 'H'); + else { + if (hardirq) + seq_putc(m, 'h'); + else { + if (softirq) + seq_putc(m, 's'); + else + seq_putc(m, '.'); + } + } + + if (entry->preempt_count) + seq_printf(m, "%x", entry->preempt_count); + else + seq_puts(m, "."); +} + +unsigned long preempt_mark_thresh = 100; + +static void notrace +lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4lldus", abs_usecs); + if (rel_usecs > preempt_mark_thresh) + seq_puts(m, "!: "); + else if (rel_usecs > 1) + seq_puts(m, "+: "); + else + seq_puts(m, " : "); +} + +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static void notrace +print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, + unsigned int trace_idx, int cpu) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *next_entry = find_next_entry(iter, NULL); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + struct trace_entry *entry = iter->ent; + unsigned long abs_usecs; + unsigned long rel_usecs; + char *comm; + int S; + + if (!next_entry) + next_entry = entry; + rel_usecs = ns2usecs(next_entry->t - entry->t); + abs_usecs = ns2usecs(entry->t - iter->tr->time_start); + + if (verbose) { + comm = trace_find_cmdline(entry->pid); + seq_printf(m, "%16s %5d %d %d %08x %08x [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, cpu, entry->flags, + entry->preempt_count, trace_idx, + ns2usecs(entry->t), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + } else { + lat_print_generic(m, entry, cpu); + lat_print_timestamp(m, abs_usecs, rel_usecs); + } + switch (entry->type) { + case TRACE_FN: + seq_print_ip_sym(m, entry->fn.ip, sym_flags); + seq_puts(m, " ("); + seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); + seq_puts(m, ")\n"); + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + comm = trace_find_cmdline(entry->ctx.next_pid); + seq_printf(m, " %d:%d:%c --> %d:%d %s\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio, + comm); + break; + } +} + +static void notrace +print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *entry = iter->ent; + unsigned long usec_rem; + unsigned long long t; + unsigned long secs; + char *comm; + int S; + + comm = trace_find_cmdline(iter->ent->pid); + + t = ns2usecs(entry->t); + usec_rem = do_div(t, 1000000ULL); + secs = (unsigned long)t; + + seq_printf(m, "%16s-%-5d ", comm, entry->pid); + seq_printf(m, "[%02d] ", iter->cpu); + seq_printf(m, "%5lu.%06lu: ", secs, usec_rem); + + switch (entry->type) { + case TRACE_FN: + seq_print_ip_sym(m, entry->fn.ip, sym_flags); + if ((sym_flags & TRACE_ITER_PRINT_PARENT) && + entry->fn.parent_ip) { + seq_printf(m, " <-"); + seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); + } + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? 
+ state_to_char[entry->ctx.prev_state] : 'X'; + seq_printf(m, " %d:%d:%c ==> %d:%d\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio); + break; + } + seq_printf(m, "\n"); +} + +static int trace_empty(struct trace_iterator *iter) +{ + struct trace_array_cpu *data; + int cpu; + + for_each_possible_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (data->trace && + (data->trace_idx || + atomic_read(&data->underrun))) + return 0; + } + return 1; +} + +static int s_show(struct seq_file *m, void *v) +{ + struct trace_iterator *iter = v; + + if (iter->ent == NULL) { + if (iter->tr) { + seq_printf(m, "# tracer: %s\n", iter->trace->name); + seq_puts(m, "#\n"); + } + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return 0; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } + } else { + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_lat_fmt(m, iter, iter->idx, iter->cpu); + else + print_trace_fmt(m, iter); + } + + return 0; +} + +static struct seq_operations tracer_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static struct trace_iterator notrace * +__tracing_open(struct inode *inode, struct file *file, int *ret) +{ + struct trace_iterator *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + *ret = -ENOMEM; + goto out; + } + + mutex_lock(&trace_types_lock); + if (current_trace && current_trace->print_max) + iter->tr = &max_tr; + else + iter->tr = inode->i_private; + iter->trace = current_trace; + iter->pos = -1; + + /* TODO stop tracer */ + *ret = seq_open(file, &tracer_seq_ops); + if (!*ret) { + struct seq_file *m = file->private_data; + m->private = iter; + + /* stop the trace while dumping */ + if (iter->tr->ctrl) + tracer_enabled = 0; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + } else { + kfree(iter); + iter = NULL; + } + mutex_unlock(&trace_types_lock); + + out: + return iter; +} + +int tracing_open_generic(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + return 0; +} + +int tracing_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct trace_iterator *iter = m->private; + + mutex_lock(&trace_types_lock); + if (iter->trace && iter->trace->close) + iter->trace->close(iter); + + /* reenable tracing if it was previously enabled */ + if (iter->tr->ctrl) + tracer_enabled = 1; + mutex_unlock(&trace_types_lock); + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static int tracing_open(struct inode *inode, struct file *file) +{ + int ret; + + __tracing_open(inode, file, &ret); + + return ret; +} + +static int tracing_lt_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret; + + iter = __tracing_open(inode, file, &ret); + + if (!ret) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + + return ret; +} + + +static void notrace * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct tracer *t = m->private; + + (*pos)++; + + if (t) + t = t->next; + + m->private = t; + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct tracer *t = m->private; + loff_t l = 0; + + mutex_lock(&trace_types_lock); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void 
t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&trace_types_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct tracer *t = v; + + if (!t) + return 0; + + seq_printf(m, "%s", t->name); + if (t->next) + seq_putc(m, ' '); + else + seq_putc(m, '\n'); + + return 0; +} + +static struct seq_operations show_traces_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int show_traces_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &show_traces_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = trace_types; + } + + return ret; +} + +static struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations tracing_lt_fops = { + .open = tracing_lt_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, +}; + +static struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .release = seq_release, +}; + +static ssize_t +tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + /* calulate max size */ + for (i = 0; trace_options[i]; i++) { + len += strlen(trace_options[i]); + len += 3; /* "no" and space */ + } + + /* +2 for \n and \0 */ + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; trace_options[i]; i++) { + if (trace_flags & (1 << i)) + r += sprintf(buf + r, "%s ", trace_options[i]); + else + r += sprintf(buf + r, "no%s ", trace_options[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "no", 2) == 0) { + neg = 1; + cmp += 2; + } + + for (i = 0; trace_options[i]; i++) { + int len = strlen(trace_options[i]); + + if (strncmp(cmp, trace_options[i], len) == 0) { + if (neg) + trace_flags &= ~(1 << i); + else + trace_flags |= (1 << i); + break; + } + } + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations tracing_iter_fops = { + .open = tracing_open_generic, + .read = tracing_iter_ctrl_read, + .write = tracing_iter_ctrl_write, +}; + +static ssize_t +tracing_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", tr->ctrl); + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static ssize_t +tracing_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + long val; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + + val = !!val; + + mutex_lock(&trace_types_lock); + if (tr->ctrl ^ val) { + if (val) + tracer_enabled = 1; + else + tracer_enabled = 0; + + tr->ctrl = val; + + if (current_trace && current_trace->ctrl_update) + current_trace->ctrl_update(tr); + } + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; 
+} + +static ssize_t +tracing_set_trace_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+2]; + int r; + + mutex_lock(&trace_types_lock); + if (current_trace) + r = sprintf(buf, "%s\n", current_trace->name); + else + r = sprintf(buf, "\n"); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = &global_trace; + struct tracer *t; + char buf[max_tracer_type_len+1]; + int i; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + mutex_lock(&trace_types_lock); + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; + } + if (!t || t == current_trace) + goto out; + + if (current_trace && current_trace->reset) + current_trace->reset(tr); + + current_trace = t; + if (t->init) + t->init(tr); + + out: + mutex_unlock(&trace_types_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, 64, "%ld\n", + *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr)); + if (r > 64) + r = 64; + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long *ptr = filp->private_data; + long val; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + + *ptr = val * 1000; + + return cnt; +} + +static struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, +}; + +static struct file_operations tracing_ctrl_fops = { + .open = tracing_open_generic, + .read = tracing_ctrl_read, + .write = tracing_ctrl_write, +}; + +static struct file_operations set_tracer_fops = { + .open = tracing_open_generic, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +static ssize_t +tracing_read_long(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *p = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%ld\n", *p); + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, r); +} + +static struct file_operations tracing_read_long_fops = { + .open = tracing_open_generic, + .read = tracing_read_long, +}; +#endif + +static struct dentry *d_tracer; + +struct dentry *tracing_init_dentry(void) +{ + static int once; + + if (d_tracer) + return d_tracer; + + d_tracer = debugfs_create_dir("tracing", NULL); + + if (!d_tracer && !once) { + once = 1; + pr_warning("Could not create debugfs directory 'tracing'\n"); + return NULL; + } + + return d_tracer; +} + +static __init void tracer_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + + entry = 
debugfs_create_file("iter_ctrl", 0644, d_tracer, + NULL, &tracing_iter_fops); + if (!entry) + pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + + entry = debugfs_create_file("latency_trace", 0444, d_tracer, + &global_trace, &tracing_lt_fops); + if (!entry) + pr_warning("Could not create debugfs 'latency_trace' entry\n"); + + entry = debugfs_create_file("trace", 0444, d_tracer, + &global_trace, &tracing_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("current_tracer", 0444, d_tracer, + &global_trace, &set_tracer_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, + &tracing_max_lat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_max_latency' entry\n"); + + entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); + +#ifdef CONFIG_DYNAMIC_FTRACE + entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, + &tracing_read_long_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'dyn_ftrace_total_info' entry\n"); +#endif +} + +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = +{ + .name = "none", +}; + +static inline notrace int page_order(const unsigned long size) +{ + const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE); + return ilog2(roundup_pow_of_two(nr_pages)); +} + +__init static int tracer_alloc_buffers(void) +{ + const int order = page_order(trace_nr_entries * TRACE_ENTRY_SIZE); + const unsigned long size = (1UL << order) << PAGE_SHIFT; + struct trace_entry *array; + int i; + + for_each_possible_cpu(i) { + global_trace.data[i] = &per_cpu(global_trace_cpu, i); + max_tr.data[i] = &per_cpu(max_data, i); + + array = (struct trace_entry *) + __get_free_pages(GFP_KERNEL, order); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate" + " %ld bytes for trace buffer!\n", size); + goto free_buffers; + } + global_trace.data[i]->trace = array; + +/* Only allocate if we are actually using the max trace */ +#ifdef CONFIG_TRACER_MAX_TRACE + array = (struct trace_entry *) + __get_free_pages(GFP_KERNEL, order); + if (array == NULL) { + printk(KERN_ERR "wakeup tracer: failed to allocate" + " %ld bytes for trace buffer!\n", size); + goto free_buffers; + } + max_tr.data[i]->trace = array; +#endif + } + + /* + * Since we allocate by orders of pages, we may be able to + * round up a bit. 
+ */ + global_trace.entries = size / TRACE_ENTRY_SIZE; + max_tr.entries = global_trace.entries; + + pr_info("tracer: %ld bytes allocated for %ld", + size, trace_nr_entries); + pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE); + pr_info(" actual entries %ld\n", global_trace.entries); + + tracer_init_debugfs(); + + trace_init_cmdlines(); + + register_tracer(&no_tracer); + current_trace = &no_tracer; + + return 0; + + free_buffers: + for (i-- ; i >= 0; i--) { + struct trace_array_cpu *data = global_trace.data[i]; + + if (data && data->trace) { + free_pages((unsigned long)data->trace, order); + data->trace = NULL; + } + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + if (data && data->trace) { + free_pages((unsigned long)data->trace, order); + data->trace = NULL; + } +#endif + } + return -ENOMEM; +} + +device_initcall(tracer_alloc_buffers); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h new file mode 100644 index 0000000..3173a93 --- /dev/null +++ b/kernel/trace/trace.h @@ -0,0 +1,184 @@ +#ifndef _LINUX_KERNEL_TRACE_H +#define _LINUX_KERNEL_TRACE_H + +#include +#include +#include +#include + +/* + * Function trace entry - function address and parent function address: + */ +struct ftrace_entry { + unsigned long ip; + unsigned long parent_ip; +}; + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + */ +struct ctx_switch_entry { + unsigned int prev_pid; + unsigned char prev_prio; + unsigned char prev_state; + unsigned int next_pid; + unsigned char next_prio; +}; + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; + int pid; + cycle_t t; + unsigned long idx; + union { + struct ftrace_entry fn; + struct ctx_switch_entry ctx; + }; +}; + +#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) + +/* + * The CPU trace array - it consists of thousands of trace entries + * plus some other descriptor data: (for example which task started + * the trace, etc.) + */ +struct trace_array_cpu { + void *trace; + unsigned long trace_idx; + atomic_t disabled; + atomic_t underrun; + unsigned long saved_latency; + unsigned long critical_start; + unsigned long critical_end; + unsigned long critical_sequence; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + cycle_t preempt_timestamp; + pid_t pid; + uid_t uid; + char comm[TASK_COMM_LEN]; +}; + +struct trace_iterator; + +/* + * The trace array - an array of per-CPU trace arrays. This is the + * highest level data structure that individual tracers deal with. 
+ * They have on/off state as well: + */ +struct trace_array { + unsigned long entries; + long ctrl; + int cpu; + cycle_t time_start; + struct trace_array_cpu *data[NR_CPUS]; +}; + +/* + * A specific tracer, represented by methods that operate on a trace array: + */ +struct tracer { + const char *name; + void (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + void (*start)(struct trace_iterator *iter); + void (*stop)(struct trace_iterator *iter); + void (*ctrl_update)(struct trace_array *tr); + struct tracer *next; + int print_max; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + struct trace_entry *ent; + unsigned long iter_flags; + loff_t pos; + unsigned long next_idx[NR_CPUS]; + int cpu; + int idx; +}; + +void notrace tracing_reset(struct trace_array_cpu *data); +int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *tracing_init_dentry(void); +void ftrace(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); +void tracing_sched_switch_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags); +void tracing_record_cmdline(struct task_struct *tsk); + +void tracing_start_function_trace(void); +void tracing_stop_function_trace(void); +int register_tracer(struct tracer *type); +void unregister_tracer(struct tracer *type); + +extern unsigned long nsecs_to_usecs(unsigned long nsecs); + +extern unsigned long tracing_max_latency; +extern unsigned long tracing_thresh; + +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); + +static inline notrace cycle_t now(int cpu) +{ + return cpu_clock(cpu); +} + +#ifdef CONFIG_SCHED_TRACER +extern void notrace +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); +#else +static inline void +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +{ +} +#endif + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +typedef void +(*tracer_switch_func_t)(void *private, + struct task_struct *prev, + struct task_struct *next); + +struct tracer_switch_ops { + tracer_switch_func_t func; + void *private; + struct tracer_switch_ops *next; +}; + +extern int register_tracer_switch(struct tracer_switch_ops *ops); +extern int unregister_tracer_switch(struct tracer_switch_ops *ops); + +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_update_tot_cnt; +#endif + +#endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v0.10.2 From 1b29b01887e6032dcaf818c14999c7a39593b4e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: function tracer This is a simple trace that uses the ftrace infrastructure. It is designed to be fast and small, and easy to use. It is useful to record things that happen over a very short period of time, and not to analyze the system in general. Updates: available_tracers "function" is added to this file. 
current_tracer To enable the function tracer: echo function > /debugfs/tracing/current_tracer To disable the tracer: echo disable > /debugfs/tracing/current_tracer The output of the function_trace file is as follows "echo noverbose > /debugfs/tracing/iter_ctrl" preemption latency trace v1.1.5 on 2.6.24-rc7-tst Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 0 us, #419428/4361791, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) ----------------- | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 0d.h. 1595128us+: set_normalized_timespec+0x8/0x2d (ktime_get_ts+0x4a/0x4e ) swapper-0 0d.h. 1595131us+: _spin_lock+0x8/0x18 (hrtimer_interrupt+0x6e/0x1b0 ) Or with verbose turned on: "echo verbose > /debugfs/tracing/iter_ctrl" preemption latency trace v1.1.5 on 2.6.24-rc7-tst -------------------------------------------------------------------- latency: 0 us, #419428/4361791, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) ----------------- | task: -0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- swapper 0 0 9 00000000 00000000 [f3675f41] 1595.128ms (+0.003ms): set_normalized_timespec+0x8/0x2d (ktime_get_ts+0x4a/0x4e ) swapper 0 0 9 00000000 00000001 [f3675f45] 1595.131ms (+0.003ms): _spin_lock+0x8/0x18 (hrtimer_interrupt+0x6e/0x1b0 ) swapper 0 0 9 00000000 00000002 [f3675f48] 1595.135ms (+0.003ms): _spin_lock+0x8/0x18 (hrtimer_interrupt+0x6e/0x1b0 ) The "trace" file is not affected by the verbose mode, but is by the symonly. echo "nosymonly" > /debugfs/tracing/iter_ctrl tracer: [ 81.479967] CPU 0: bash:3154 register_ftrace_function+0x5f/0x66 <-- _spin_unlock_irqrestore+0xe/0x5a [ 81.479967] CPU 0: bash:3154 _spin_unlock_irqrestore+0x3e/0x5a <-- sub_preempt_count+0xc/0x7a [ 81.479968] CPU 0: bash:3154 sub_preempt_count+0x30/0x7a <-- in_lock_functions+0x9/0x24 [ 81.479968] CPU 0: bash:3154 vfs_write+0x11d/0x155 <-- dnotify_parent+0x12/0x78 [ 81.479968] CPU 0: bash:3154 dnotify_parent+0x2d/0x78 <-- _spin_lock+0xe/0x70 [ 81.479969] CPU 0: bash:3154 _spin_lock+0x1b/0x70 <-- add_preempt_count+0xe/0x77 [ 81.479969] CPU 0: bash:3154 add_preempt_count+0x3e/0x77 <-- in_lock_functions+0x9/0x24 echo "symonly" > /debugfs/tracing/iter_ctrl tracer: [ 81.479913] CPU 0: bash:3154 register_ftrace_function+0x5f/0x66 <-- _spin_unlock_irqrestore+0xe/0x5a [ 81.479913] CPU 0: bash:3154 _spin_unlock_irqrestore+0x3e/0x5a <-- sub_preempt_count+0xc/0x7a [ 81.479913] CPU 0: bash:3154 sub_preempt_count+0x30/0x7a <-- in_lock_functions+0x9/0x24 [ 81.479914] CPU 0: bash:3154 vfs_write+0x11d/0x155 <-- dnotify_parent+0x12/0x78 [ 81.479914] CPU 0: bash:3154 dnotify_parent+0x2d/0x78 <-- _spin_lock+0xe/0x70 [ 81.479914] CPU 0: bash:3154 _spin_lock+0x1b/0x70 <-- add_preempt_count+0xe/0x77 [ 81.479914] CPU 0: bash:3154 add_preempt_count+0x3e/0x77 <-- in_lock_functions+0x9/0x24 Signed-off-by: Steven Rostedt Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ce70677..1399f37 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -8,3 +8,16 @@ config TRACING bool select DEBUG_FS +config FTRACE + bool "Kernel Function Tracer" + depends on DEBUG_KERNEL && HAVE_FTRACE + select FRAME_POINTER + select TRACING + help + Enable the kernel to trace every kernel function. 
This is done + by using a compiler feature to insert a small, 5-byte No-Operation + instruction to the beginning of every kernel function, which NOP + sequence is then dynamically patched into a tracer call when + tracing is enabled by the administrator. If it's runtime disabled + (the bootup default), then the overhead of the instructions is very + small and not measurable even in micro-benchmarks. diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 7af4031..6bb5e50 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_FTRACE) += trace_functions.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c new file mode 100644 index 0000000..82988c5 --- /dev/null +++ b/kernel/trace/trace_functions.c @@ -0,0 +1,73 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include + +#include "trace.h" + +static notrace void function_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static notrace void start_function_trace(struct trace_array *tr) +{ + function_reset(tr); + tracing_start_function_trace(); +} + +static notrace void stop_function_trace(struct trace_array *tr) +{ + tracing_stop_function_trace(); +} + +static notrace void function_trace_init(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); +} + +static notrace void function_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_function_trace(tr); +} + +static notrace void function_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_function_trace(tr); + else + stop_function_trace(tr); +} + +static struct tracer function_trace __read_mostly = +{ + .name = "ftrace", + .init = function_trace_init, + .reset = function_trace_reset, + .ctrl_update = function_trace_ctrl_update, +}; + +static __init int init_function_trace(void) +{ + return register_tracer(&function_trace); +} + +device_initcall(init_function_trace); -- cgit v0.10.2 From 35e8e302e5d6e32675df2fc1dd3a53dfa6630dc1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: add tracing of context switches This patch adds context switch tracing, of the format of: _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | pid:prio:state \ / ||||| \ | / swapper-0 1d..3 137us+: 0:140:R --> 2912:120 sshd-2912 1d..3 216us+: 2912:120:S --> 0:140 swapper-0 1d..3 261us+: 0:140:R --> 2912:120 bash-2920 0d..3 267us+: 2920:120:S --> 0:140 sshd-2912 1d..3 330us!: 2912:120:S --> 0:140 swapper-0 1d..3 2389us+: 0:140:R --> 2847:120 yum-upda-2847 1d..3 2411us!: 2847:120:S --> 0:140 swapper-0 0d..3 11089us+: 0:140:R --> 3139:120 gdm-bina-3139 0d..3 11113us!: 3139:120:S --> 0:140 swapper-0 1d..3 102328us+: 0:140:R --> 2847:120 yum-upda-2847 1d..3 102348us!: 2847:120:S --> 0:140 "sched_switch" is added to /debugfs/tracing/available_tracers [ Eugene Teo Cc: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1399f37..5d6aa92 100644 
--- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -13,6 +13,7 @@ config FTRACE depends on DEBUG_KERNEL && HAVE_FTRACE select FRAME_POINTER select TRACING + select CONTEXT_SWITCH_TRACER help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation @@ -21,3 +22,13 @@ config FTRACE tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. + +config CONTEXT_SWITCH_TRACER + bool "Trace process context switches" + depends on DEBUG_KERNEL + select TRACING + select MARKERS + help + This tracer gets called from the context switch and records + all switching of tasks. + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6bb5e50..6b54ceb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c new file mode 100644 index 0000000..3e4771d --- /dev/null +++ b/kernel/trace/trace_sched_switch.c @@ -0,0 +1,116 @@ +/* + * trace context switch + * + * Copyright (C) 2007 Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *ctx_trace; +static int __read_mostly tracer_enabled; + +static void notrace +ctx_switch_func(struct task_struct *prev, struct task_struct *next) +{ + struct trace_array *tr = ctx_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (!tracer_enabled) + return; + + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + tracing_sched_switch_trace(tr, data, prev, next, flags); + + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + +void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +{ + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. 
+ */ + ctx_switch_func(prev, next); + + /* + * Chain to the wakeup tracer (this is a NOP if disabled): + */ + wakeup_sched_switch(prev, next); +} + +static notrace void sched_switch_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static notrace void start_sched_trace(struct trace_array *tr) +{ + sched_switch_reset(tr); + tracer_enabled = 1; +} + +static notrace void stop_sched_trace(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static notrace void sched_switch_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + + if (tr->ctrl) + start_sched_trace(tr); +} + +static notrace void sched_switch_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_sched_trace(tr); +} + +static void sched_switch_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_sched_trace(tr); + else + stop_sched_trace(tr); +} + +static struct tracer sched_switch_trace __read_mostly = +{ + .name = "sched_switch", + .init = sched_switch_trace_init, + .reset = sched_switch_trace_reset, + .ctrl_update = sched_switch_trace_ctrl_update, +}; + +__init static int init_sched_switch_trace(void) +{ + return register_tracer(&sched_switch_trace); +} +device_initcall(init_sched_switch_trace); -- cgit v0.10.2 From 352ad25aa4a189c667cb2af333948d34692a2d27 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: tracer for scheduler wakeup latency This patch adds the tracer that tracks the wakeup latency of the highest priority waking task. "wakeup" is added to /debugfs/tracing/available_tracers Also added to /debugfs/tracing tracing_max_latency holds the current max latency for the wakeup wakeup_thresh if set to other than zero, a log will be recorded for every wakeup that takes longer than the number entered in here (usecs for all counters) (deletes previous trace) Examples: (with ftrace_enabled = 0) ============ preemption latency trace v1.1.5 on 2.6.24-rc8 Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 26 us, #2/2, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: migration/0-3 (uid:0 nice:-5 policy:1 rt_prio:99) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / quilt-8551 0d..3 0us+: wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 ) quilt-8551 0d..4 26us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 ) vim:ft=help ============ (with ftrace_enabled = 1) ============ preemption latency trace v1.1.5 on 2.6.24-rc8 -------------------------------------------------------------------- latency: 36 us, #45/45, CPU#0 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: migration/1-5 (uid:0 nice:-5 policy:1 rt_prio:99) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / bash-10653 1d..3 0us : wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 ) bash-10653 1d..3 1us : try_to_wake_up+0x271/0x2e7 (sub_preempt_count+0xc/0x7a ) bash-10653 1d..2 2us : try_to_wake_up+0x296/0x2e7 (update_rq_clock+0x9/0x20 ) bash-10653 1d..2 2us : update_rq_clock+0x1e/0x20 (__update_rq_clock+0xc/0x90 ) bash-10653 1d..2 3us : __update_rq_clock+0x1b/0x90 
(sched_clock+0x9/0x29 ) bash-10653 1d..2 4us : try_to_wake_up+0x2a6/0x2e7 (activate_task+0xc/0x3f ) bash-10653 1d..2 4us : activate_task+0x2d/0x3f (enqueue_task+0xe/0x66 ) bash-10653 1d..2 5us : enqueue_task+0x5b/0x66 (enqueue_task_rt+0x9/0x3c ) bash-10653 1d..2 6us : try_to_wake_up+0x2ba/0x2e7 (check_preempt_wakeup+0x12/0x99 ) [...] bash-10653 1d..5 33us : tracing_record_cmdline+0xcf/0xd4 (_spin_unlock+0x9/0x33 ) bash-10653 1d..5 34us : _spin_unlock+0x19/0x33 (sub_preempt_count+0xc/0x7a ) bash-10653 1d..4 35us : wakeup_sched_switch+0x65/0x2ff (_spin_lock_irqsave+0xc/0xa9 ) bash-10653 1d..4 35us : _spin_lock_irqsave+0x19/0xa9 (add_preempt_count+0xe/0x77 ) bash-10653 1d..4 36us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 ) vim:ft=help ============ The [...] was added here to not waste your email box space. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b96ef14..db8a5e7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -5,10 +5,6 @@ #include -#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) -#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) - typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); struct ftrace_ops { @@ -35,4 +31,23 @@ extern void mcount(void); # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) #endif /* CONFIG_FTRACE */ + + +#ifdef CONFIG_FRAME_POINTER +/* TODO: need to fix this for ARM */ +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +#else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d6aa92..892ecc9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -4,6 +4,9 @@ config HAVE_FTRACE bool +config TRACER_MAX_TRACE + bool + config TRACING bool select DEBUG_FS @@ -23,6 +26,16 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config SCHED_TRACER + bool "Scheduling Latency Tracer" + depends on DEBUG_KERNEL + select TRACING + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. 
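The CALLER_ADDR0..CALLER_ADDR5 macros moved into ftrace.h above are thin wrappers around GCC's __builtin_return_address(); levels deeper than 0 are only trustworthy with frame pointers, which is why the FTRACE Kconfig entry selects FRAME_POINTER. A minimal userspace sketch of the same idea (illustrative only, not kernel code):

#include <stdio.h>

/* same idea as CALLER_ADDR0: record the caller's return address;
 * noinline keeps the compiler from folding callee() into main() */
static void __attribute__((noinline)) callee(void)
{
	printf("called from %p\n", __builtin_return_address(0));
}

int main(void)
{
	callee();
	return 0;
}
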
+ config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6b54ceb..5508cdb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,5 +3,6 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c new file mode 100644 index 0000000..7c3ccef --- /dev/null +++ b/kernel/trace/trace_sched_wakeup.c @@ -0,0 +1,310 @@ +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static unsigned wakeup_prio = -1; + +static DEFINE_SPINLOCK(wakeup_lock); + +static void notrace __wakeup_reset(struct trace_array *tr); + +/* + * Should this new latency be reported/recorded? + */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +void notrace +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +{ + unsigned long latency = 0, t0 = 0, t1 = 0; + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + cycle_t T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1. We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. 
+ */ + smp_rmb(); + + if (next != wakeup_task) + return; + + /* The task we are waiting for is waking up */ + data = tr->data[wakeup_cpu]; + + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (likely(disabled != 1)) + goto out; + + spin_lock_irqsave(&wakeup_lock, flags); + + /* We could race with grabbing wakeup_lock */ + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + + ftrace(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = now(cpu); + delta = T1-T0; + + if (!report_latency(delta)) + goto out_unlock; + + latency = nsecs_to_usecs(delta); + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + update_max_tr(tr, wakeup_task, wakeup_cpu); + + if (tracing_thresh) { + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + wakeup_task->comm, wakeup_task->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh), t0); + } else { + printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum " + "wakeup latency.\n => started at timestamp %lu: ", + wakeup_task->comm, wakeup_task->pid, + cpu, latency, t0); + } + + printk(KERN_CONT " ended at timestamp %lu: ", t1); + dump_stack(); + t1 = nsecs_to_usecs(now(cpu)); + printk(KERN_CONT " dump-end timestamp %lu\n\n", t1); + +out_unlock: + __wakeup_reset(tr); + spin_unlock_irqrestore(&wakeup_lock, flags); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +static void notrace __wakeup_reset(struct trace_array *tr) +{ + struct trace_array_cpu *data; + int cpu; + + assert_spin_locked(&wakeup_lock); + + for_each_possible_cpu(cpu) { + data = tr->data[cpu]; + tracing_reset(data); + } + + wakeup_cpu = -1; + wakeup_prio = -1; + + if (wakeup_task) + put_task_struct(wakeup_task); + + wakeup_task = NULL; +} + +static void notrace wakeup_reset(struct trace_array *tr) +{ + unsigned long flags; + + spin_lock_irqsave(&wakeup_lock, flags); + __wakeup_reset(tr); + spin_unlock_irqrestore(&wakeup_lock, flags); +} + +static notrace void +wakeup_check_start(struct trace_array *tr, struct task_struct *p, + struct task_struct *curr) +{ + int cpu = smp_processor_id(); + unsigned long flags; + long disabled; + + if (likely(!rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= curr->prio) + return; + + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (unlikely(disabled != 1)) + goto out; + + /* interrupts should be off from try_to_wake_up */ + spin_lock(&wakeup_lock); + + /* check for races. 
*/ + if (!tracer_enabled || p->prio >= wakeup_prio) + goto out_locked; + + /* reset the trace */ + __wakeup_reset(tr); + + wakeup_cpu = task_cpu(p); + wakeup_prio = p->prio; + + wakeup_task = p; + get_task_struct(wakeup_task); + + local_save_flags(flags); + + tr->data[wakeup_cpu]->preempt_timestamp = now(cpu); + ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags); + +out_locked: + spin_unlock(&wakeup_lock); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +notrace void +ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +{ + if (likely(!tracer_enabled)) + return; + + wakeup_check_start(wakeup_trace, wakee, curr); +} + +notrace void +ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) +{ + if (likely(!tracer_enabled)) + return; + + wakeup_check_start(wakeup_trace, wakee, curr); +} + +static notrace void start_wakeup_tracer(struct trace_array *tr) +{ + wakeup_reset(tr); + + /* + * Don't let the tracer_enabled = 1 show up before + * the wakeup_task is reset. This may be overkill since + * wakeup_reset does a spin_unlock after setting the + * wakeup_task to NULL, but I want to be safe. + * This is a slow path anyway. + */ + smp_wmb(); + + tracer_enabled = 1; + + return; +} + +static notrace void stop_wakeup_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static notrace void wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_trace = tr; + + if (tr->ctrl) + start_wakeup_tracer(tr); +} + +static notrace void wakeup_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) { + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); + } +} + +static void wakeup_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_wakeup_tracer(tr); + else + stop_wakeup_tracer(tr); +} + +static void notrace wakeup_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_wakeup_tracer(iter->tr); +} + +static void notrace wakeup_tracer_close(struct trace_iterator *iter) +{ + /* forget about any processes we were recording */ + if (iter->tr->ctrl) + start_wakeup_tracer(iter->tr); +} + +static struct tracer wakeup_tracer __read_mostly = +{ + .name = "wakeup", + .init = wakeup_tracer_init, + .reset = wakeup_tracer_reset, + .open = wakeup_tracer_open, + .close = wakeup_tracer_close, + .ctrl_update = wakeup_tracer_ctrl_update, + .print_max = 1, +}; + +__init static int init_wakeup_tracer(void) +{ + int ret; + + ret = register_tracer(&wakeup_tracer); + if (ret) + return ret; + + return 0; +} +device_initcall(init_wakeup_tracer); -- cgit v0.10.2 From 81d68a96a39844853b37f20cc8282d9b65b78ef3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace irq disabled critical timings This patch adds latency tracing for critical timings (how long interrupts are disabled for). "irqsoff" is added to /debugfs/tracing/available_tracers Note: tracing_max_latency also holds the max latency for irqsoff (in usecs). (default to large number so one must start latency tracing) tracing_thresh threshold (in usecs) to always print out if irqs off is detected to be longer than stated here. If irq_thresh is non-zero, then max_irq_latency is ignored. 
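The interplay of tracing_max_latency and tracing_thresh described above boils down to a single check, which the wakeup and irqsoff tracers share as report_latency(): a nonzero threshold takes precedence over the running-maximum search. A self-contained userspace rendering of that policy (the values in main() are illustrative):

#include <stdio.h>

typedef unsigned long long cycle_t;

static cycle_t tracing_thresh;			/* 0 = threshold mode off */
static cycle_t tracing_max_latency = ~0ULL;	/* defaults very high */

static int report_latency(cycle_t delta)
{
	if (tracing_thresh) {
		if (delta < tracing_thresh)
			return 0;
	} else {
		if (delta <= tracing_max_latency)
			return 0;
	}
	return 1;
}

int main(void)
{
	tracing_max_latency = 26000;		/* 26 us, in ns */
	printf("%d\n", report_latency(30000));	/* 1: new maximum */
	tracing_thresh = 100000;		/* 100 us threshold */
	printf("%d\n", report_latency(30000));	/* 0: below threshold */
	return 0;
}
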
Here's an example of a trace with ftrace_enabled = 0 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 100 us, #3/3, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1d.s3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1d.s3 100us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1d.s3 100us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= And this is a trace with ftrace_enabled == 1 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 -------------------------------------------------------------------- latency: 102 us, #12/12, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1dNs3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_read_phy_reg+0x16/0x225 [e1000] (e1000_update_stats+0x5e2/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_swfw_sync_acquire+0x10/0x99 [e1000] (e1000_read_phy_reg+0x49/0x225 [e1000]) swapper-0 1dNs3 46us : e1000_get_hw_eeprom_semaphore+0x12/0xa6 [e1000] (e1000_swfw_sync_acquire+0x36/0x99 [e1000]) swapper-0 1dNs3 47us : __const_udelay+0x9/0x47 (e1000_read_phy_reg+0x116/0x225 [e1000]) swapper-0 1dNs3 47us+: __delay+0x9/0x50 (__const_udelay+0x45/0x47) swapper-0 1dNs3 97us : preempt_schedule+0xc/0x84 (__delay+0x4e/0x50) swapper-0 1dNs3 98us : e1000_swfw_sync_release+0xc/0x55 [e1000] (e1000_read_phy_reg+0x211/0x225 [e1000]) swapper-0 1dNs3 99us+: e1000_put_hw_eeprom_semaphore+0x9/0x35 [e1000] (e1000_swfw_sync_release+0x50/0x55 [e1000]) swapper-0 1dNs3 101us : _spin_unlock_irqrestore+0xe/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f3..dd349c9 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -165,7 +165,10 @@ void cpu_idle(void) */ local_irq_disable(); enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. 
*/ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 76f60f5..84aa288 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SMP) := msr-on-cpu.o lib-y := delay_$(BITS).o +lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o lib-y += memcpy_$(BITS).o diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S new file mode 100644 index 0000000..650b11e --- /dev/null +++ b/arch/x86/lib/thunk_32.S @@ -0,0 +1,47 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + + #include + +#define ARCH_TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#ifdef CONFIG_TRACE_IRQFLAGS + /* put return address in eax (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + call \func + popl %edx + popl %ecx + popl %eax + ret + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#endif diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index e009251..bf9a7d5 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -2,6 +2,7 @@ * Save registers before calling assembly functions. This avoids * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. * Subject to the GNU public license, v.2. No warranty of any kind. */ @@ -42,8 +43,22 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + /* put return address in rdi (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + /* SAVE_ARGS pushes 9 elements */ + /* the next element would be the rip */ + movq 9*8(%rsp), %rdi + call \func + jmp restore + CFI_ENDPROC + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index c242527..24d71b1 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -179,8 +179,6 @@ static inline void trace_hardirqs_fixup(void) * have a reliable stack. x86_64 only. 
*/ #define SWAPGS_UNSAFE_STACK swapgs -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ @@ -192,24 +190,6 @@ static inline void trace_hardirqs_fixup(void) TRACE_IRQS_OFF; #else -#define ARCH_TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - #define ARCH_LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ @@ -223,8 +203,8 @@ static inline void trace_hardirqs_fixup(void) #endif #ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else # define TRACE_IRQS_ON # define TRACE_IRQS_OFF diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db8a5e7..0a20445 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -50,4 +50,12 @@ extern void mcount(void); # define CALLER_ADDR5 0UL #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); +#else +# define time_hardirqs_on(a0, a1) do { } while (0) +# define time_hardirqs_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index e600c4e..5b711d4 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -12,10 +12,10 @@ #define _LINUX_TRACE_IRQFLAGS_H #ifdef CONFIG_TRACE_IRQFLAGS - extern void trace_hardirqs_on(void); - extern void trace_hardirqs_off(void); extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); + extern void trace_hardirqs_on(void); + extern void trace_hardirqs_off(void); # define trace_hardirq_context(p) ((p)->hardirq_context) # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) @@ -41,6 +41,14 @@ # define INIT_TRACE_IRQFLAGS #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void stop_critical_timings(void); + extern void start_critical_timings(void); +#else +# define stop_critical_timings() do { } while (0) +# define start_critical_timings() do { } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #include diff --git a/kernel/fork.c b/kernel/fork.c index 19908b2..d66d676 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a..e219243 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -982,7 +983,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) return 1; } -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new 
dependency @@ -1680,7 +1681,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2013,11 +2014,13 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void notrace trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; + time_hardirqs_on(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2055,16 +2058,23 @@ void trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } +EXPORT_SYMBOL(trace_hardirqs_on_caller); +void notrace trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void notrace trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; + time_hardirqs_off(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2082,7 +2092,12 @@ void trace_hardirqs_off(void) } else debug_atomic_inc(&redundant_hardirqs_off); } +EXPORT_SYMBOL(trace_hardirqs_off_caller); +void notrace trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_off); /* diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c3..ae7d5b9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1041,7 +1041,9 @@ void release_console_sem(void) _log_end = log_end; con_start = log_end; /* Flush */ spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); + start_critical_timings(); local_irq_restore(flags); } console_locked = 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 892ecc9..896df1c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -26,6 +26,24 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. 
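Callers that must not be charged for long irqs-off regions simply bracket them, as the printk.c hunk above and the idle-loop hunks in this series do. A stub-based userspace sketch of the call pattern (all functions here are stand-ins; only the bracketing mirrors the patch):

#include <stdio.h>

static void stop_critical_timings(void)  { printf("timing paused\n"); }
static void start_critical_timings(void) { printf("timing resumed\n"); }
static void call_console_drivers(void)   { printf("slow console write\n"); }

int main(void)
{
	/* same shape as release_console_sem(): the slow console
	 * write should not show up as an irqs-off latency */
	stop_critical_timings();
	call_console_drivers();
	start_critical_timings();
	return 0;
}
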
+ + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5508cdb..46be864 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 0000000..a9131b0 --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,402 @@ +/* + * trace irqs off critical timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesn't + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void notrace +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (likely(!tracer_enabled)) + return; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = irqsoff_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? 
+ */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void notrace +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, T2, delta; + unsigned long flags; + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = now(cpu); + + delta = T2-T0; + + latency = nsecs_to_usecs(delta); + + if (data->critical_sequence != max_sequence) + goto out; + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + data->critical_end = parent_ip; + + update_max_tr_single(tr, current, cpu); + + if (tracing_thresh) + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh), t0); + else + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol(KERN_CONT "<%s>\n", data->critical_start); + printk(KERN_CONT " => ended at timestamp %lu: ", t1); + print_symbol(KERN_CONT "<%s>\n", data->critical_end); + dump_stack(); + t1 = nsecs_to_usecs(now(cpu)); + printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + + max_sequence++; + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + tracing_reset(data); + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); +} + +static inline void notrace +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + data->critical_start = parent_ip; + tracing_reset(data); + + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static inline void notrace +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + check_critical_timing(tr, data, parent_ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +void notrace start_critical_timings(void) +{ + unsigned long flags; + + 
local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +void notrace stop_critical_timings(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +#ifdef CONFIG_PROVE_LOCKING +void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(a0, a1); +} + +void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(a0, a1); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void notrace trace_hardirqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void notrace trace_hardirqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ + +static void start_irqsoff_tracer(struct trace_array *tr) +{ + tracer_enabled = 1; + register_ftrace_function(&trace_ops); +} + +static void stop_irqsoff_tracer(struct trace_array *tr) +{ + unregister_ftrace_function(&trace_ops); + tracer_enabled = 0; +} + +static void irqsoff_tracer_init(struct trace_array *tr) +{ + irqsoff_trace = tr; + /* make sure that the tracer is visible */ + smp_wmb(); + + if (tr->ctrl) + start_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_irqsoff_tracer(tr); + else + stop_irqsoff_tracer(tr); +} + +static void notrace irqsoff_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_irqsoff_tracer(iter->tr); +} + +static void notrace irqsoff_tracer_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + start_irqsoff_tracer(iter->tr); +} + +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +__init static int init_irqsoff_tracer(void) +{ + register_tracer(&irqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); -- cgit v0.10.2 From 
6cd8a4bb2f97527a9ceb30bc77ea4e959c6a95e3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace preempt off critical timings Add preempt off timings. A lot of kernel core code is taken from the RT patch latency trace that was written by Ingo Molnar. This adds "preemptoff" and "preemptirqsoff" to /debugfs/tracing/available_tracers Now instead of just tracing irqs off, preemption off can be selected to be recorded. When this is selected, it shares the same files as irqs off timings. One can trace preemption off, irqs off, or both combined. By echoing "preemptoff" into /debugfs/tracing/current_tracer, recording of preempt off only is performed. "irqsoff" will only record the time irqs are disabled, but "preemptirqsoff" will take the total time irqs or preemption are disabled. Runtime switching of these options is now supported by simply echoing the appropriate tracer name into /debugfs/tracing/current_tracer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476df..a30aa1f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -185,7 +185,10 @@ void cpu_idle(void) local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0a20445..740c97d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,4 +58,12 @@ extern void mcount(void); # define time_hardirqs_off(a0, a1) do { } while (0) #endif +#ifdef CONFIG_PREEMPT_TRACER + extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1); + extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1); +#else +# define trace_preempt_on(a0, a1) do { } while (0) +# define trace_preempt_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5b711d4..2b1c2e5 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -41,7 +41,8 @@ # define INIT_TRACE_IRQFLAGS #endif -#ifdef CONFIG_IRQSOFF_TRACER +#if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); extern void start_critical_timings(void); #else diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 36b03d5..72b1a10 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,7 +10,7 @@ #include #include -#ifdef CONFIG_DEBUG_PREEMPT +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else diff --git a/kernel/sched.c b/kernel/sched.c index 73e6008..328494e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4365,26 +4366,44 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { 
+#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ @@ -4396,7 +4415,10 @@ void __kprobes sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 896df1c..6430016 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,6 +44,31 @@ config IRQSOFF_TRACER echo 0 > /debugfs/tracing/tracing_max_latency + (Note that kernel size and overhead increase with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on GENERIC_TIME + depends on PREEMPT + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in preemption off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increase with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) 
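The sched.c hooks above fire only on the outermost transition: add_preempt_count() calls trace_preempt_off() only when the count was previously zero, and sub_preempt_count() calls trace_preempt_on() only when the count is about to return to zero, so nested disables stay silent. A standalone, single-threaded sketch of that counting logic (the names mirror the patch; the trace functions are stubs):

#include <stdio.h>

static int preempt_count;

static void trace_preempt_off(void) { printf("preemption disabled\n"); }
static void trace_preempt_on(void)  { printf("preemption enabled\n"); }

static void add_preempt_count(int val)
{
	preempt_count += val;
	/* only the transition from 0 to val is traced */
	if (preempt_count == val)
		trace_preempt_off();
}

static void sub_preempt_count(int val)
{
	/* only the transition back down to 0 is traced */
	if (preempt_count == val)
		trace_preempt_on();
	preempt_count -= val;
}

int main(void)
{
	add_preempt_count(1);	/* traces "disabled" */
	add_preempt_count(1);	/* nested: silent */
	sub_preempt_count(1);	/* silent */
	sub_preempt_count(1);	/* traces "enabled" */
	return 0;
}
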
+ config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46be864..3fec653 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a9131b0..8b12316 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -21,6 +21,36 @@ static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; +static DEFINE_PER_CPU(int, tracing_cpu); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +#ifdef CONFIG_PREEMPT_TRACER +static inline int notrace +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int notrace +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -44,14 +74,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (likely(!tracer_enabled)) + if (likely(!__get_cpu_var(tracing_cpu))) return; local_save_flags(flags); - if (!irqs_disabled_flags(flags)) - return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -171,23 +198,29 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (likely(!tracer_enabled)) return; + if (__get_cpu_var(tracing_cpu)) + return; + cpu = raw_smp_processor_id(); data = tr->data[cpu]; if (unlikely(!data) || unlikely(!data->trace) || - data->critical_start || atomic_read(&data->disabled)) + atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); data->critical_sequence = max_sequence; data->preempt_timestamp = now(cpu); - data->critical_start = parent_ip; + data->critical_start = parent_ip ? : ip; tracing_reset(data); local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + __get_cpu_var(tracing_cpu) = 1; + atomic_dec(&data->disabled); } @@ -199,7 +232,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(__get_cpu_var(tracing_cpu))) + __get_cpu_var(tracing_cpu) = 0; + else + return; + + if (!tracer_enabled) return; cpu = raw_smp_processor_id(); @@ -212,49 +251,35 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); ftrace(tr, data, ip, parent_ip, flags); - check_critical_timing(tr, data, parent_ip, cpu); + check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); } +/* start and stop critical timings used for stoppage (in idle) */ void notrace start_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } void notrace stop_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +#ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); } @@ -289,49 +314,46 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void notrace trace_hardirqs_on(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_on); void notrace trace_hardirqs_off(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_off); void notrace trace_hardirqs_on_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); void notrace trace_hardirqs_off_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void notrace trace_preempt_on(unsigned long a0, unsigned long a1) +{ + stop_critical_timing(a0, a1); +} + +void notrace trace_preempt_off(unsigned long a0, unsigned long a1) +{ + start_critical_timing(a0, a1); +} +#endif /* CONFIG_PREEMPT_TRACER */ static void start_irqsoff_tracer(struct trace_array *tr) { @@ -345,7 +367,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) tracer_enabled = 0; } -static void irqsoff_tracer_init(struct trace_array *tr) +static void __irqsoff_tracer_init(struct trace_array *tr) { irqsoff_trace = tr; /* make sure that the tracer is visible */ @@ -382,6 +404,13 @@ static void notrace irqsoff_tracer_close(struct trace_iterator *iter) start_irqsoff_tracer(iter->tr); } +#ifdef CONFIG_IRQSOFF_TRACER +static void irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + __irqsoff_tracer_init(tr); +} static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", @@ -392,10 +421,65 @@ static struct tracer irqsoff_tracer __read_mostly = .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, }; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } 
while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static void preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static void preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif __init static int init_irqsoff_tracer(void) { - register_tracer(&irqsoff_tracer); + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); return 0; } -- cgit v0.10.2 From 3d0833953e1b98b79ddf491dd49229eef9baeac1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: dynamic enabling/disabling of function calls This patch adds a feature to dynamically replace the ftrace code with the jmps to allow a kernel with ftrace configured to run as fast as it can without it configured. The way this works, is on bootup (if ftrace is enabled), a ftrace function is registered to record the instruction pointer of all places that call the function. Later, if there's still any code to patch, a kthread is awoken (rate limited to at most once a second) that performs a stop_machine, and replaces all the code that was called with a jmp over the call to ftrace. It only replaces what was found the previous time. Typically the system reaches equilibrium quickly after bootup and there's no code patching needed at all. e.g. call ftrace /* 5 bytes */ is replaced with jmp 3f /* jmp is 2 bytes and we jump 3 forward */ 3: When we want to enable ftrace for function tracing, the IP recording is removed, and stop_machine is called again to replace all the locations of that were recorded back to the call of ftrace. When it is disabled, we replace the code back to the jmp. Allocation is done by the kthread. If the ftrace recording function is called, and we don't have any record slots available, then we simply skip that call. Once a second a new page (if needed) is allocated for recording new ftrace function calls. A large batch is allocated at boot up to get most of the calls there. Because we do this via stop_machine, we don't have to worry about another CPU executing a ftrace call as we modify it. But we do need to worry about NMI's so all functions that might be called via nmi must be annotated with notrace_nmi. When this code is configured in, the NMI code will not call notrace. 
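To make the byte patching concrete: the two bytes eb 03 encode "jmp +3", which together with the 3 skipped bytes covers the whole 5-byte call; this is the JMPFWD constant the x86 code below compares against to detect already-converted sites. A minimal user-space sketch of that check (illustrative only, not the kernel implementation; assumes a little-endian x86 target):

	#include <stdint.h>
	#include <string.h>

	#define JMPFWD 0x03eb	/* little-endian: bytes eb 03, i.e. jmp 3 bytes forward */

	/* Has the 5-byte call site at ip already been patched to the short jmp? */
	static int site_is_converted(const unsigned char *ip)
	{
		uint16_t insn;

		memcpy(&insn, ip, sizeof(insn));	/* avoids an unaligned access */
		return insn == JMPFWD;
	}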
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3..e142091 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 0000000..5dd5813 --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,237 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#include +#include +#include +#include +#include +#include + +#define CALL_BACK 5 + +#define JMPFWD 0x03eb + +static unsigned short ftrace_jmp = JMPFWD; + +struct ftrace_record { + struct dyn_ftrace rec; + int failed; +} __attribute__((packed)); + +struct ftrace_page { + struct ftrace_page *next; + int index; + struct ftrace_record records[]; +} __attribute__((packed)); + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +#define MCOUNT_ADDR ((long)(&mcount)) + +union ftrace_code_union { + char code[5]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + struct ftrace_record *rec; + unsigned short save; + + ip -= CALL_BACK; + save = *(short *)ip; + + /* If this was already converted, skip it */ + if (save == JMPFWD) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + rec = &ftrace_pages->records[ftrace_pages->index++]; + + return &rec->rec; +} + +static int notrace +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned short old = *(unsigned short *)old_code; + unsigned short new = *(unsigned short *)new_code; + unsigned short replaced; + int faulted = 0; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. 
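+ * + * The cmpxchg below doubles as the check: if the bytes at ip + * no longer match the expected old code, nothing is written and + * the bytes actually found come back in 'replaced', compared + * against 'old' after the asm; a fault while touching ip is + * caught by the exception fixup and reported through 'faulted'.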
+ */ + asm volatile ( + "1: lock\n" + " cmpxchg %w3, (%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + " movl $1, %0\n" + "3: jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r"(faulted), "=a"(replaced) + : "r"(ip), "r"(new), "0"(faulted), "a"(old) + : "memory"); + sync_core(); + + if (replaced != old) + faulted = 2; + + return faulted; +} + +static int notrace ftrace_calc_offset(long ip) +{ + return (int)(MCOUNT_ADDR - ip); +} + +notrace void ftrace_code_disable(struct dyn_ftrace *rec) +{ + unsigned long ip; + union ftrace_code_union save; + struct ftrace_record *r = + container_of(rec, struct ftrace_record, rec); + + ip = rec->ip; + + save.e8 = 0xe8; + save.offset = ftrace_calc_offset(ip); + + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + + r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp); +} + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct ftrace_record *rec; + struct ftrace_page *pg; + unsigned long ip; + int i; + + if (saved) + old = (char *)&ftrace_jmp; + else + new = (char *)&ftrace_jmp; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + union ftrace_code_union calc; + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->failed) + continue; + + ip = rec->rec.ip; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip); + + if (saved) + new = calc.code; + else + old = calc.code; + + ip -= CALL_BACK; + + rec->failed = ftrace_modify_code(ip, old, new); + } + } + +} + +notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} + +notrace int ftrace_shutdown_arch_init(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. 
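+ * + * (Module text is allocated only when a module is loaded, so + * its mcount call sites cannot be known at build time; they + * have to be discovered and patched at run time.)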
+ */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 740c97d..90dbc0e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -32,6 +32,24 @@ extern void mcount(void); # define clear_ftrace_function(ops) do { } while (0) #endif /* CONFIG_FTRACE */ +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_HASHBITS 10 +# define FTRACE_HASHSIZE (1< +#include +#include +#include +#include +#include #include +#include +#include +#include + +#include "trace.h" -static DEFINE_SPINLOCK(ftrace_func_lock); +static DEFINE_SPINLOCK(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -44,21 +53,21 @@ notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) } /** - * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. - * - * Register a function to be called by all functions in the - * kernel. + * clear_ftrace_function - reset the ftrace function * - * Note: @ops->func and all the functions it calls must be labeled - * with "notrace", otherwise it will go into a - * recursive loop. + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag */ -int register_ftrace_function(struct ftrace_ops *ops) +void clear_ftrace_function(void) { - unsigned long flags; + ftrace_trace_function = ftrace_stub; +} + +static int notrace __register_ftrace_function(struct ftrace_ops *ops) +{ + /* Should never be called by interrupts */ + spin_lock(&ftrace_lock); - spin_lock_irqsave(&ftrace_func_lock, flags); ops->next = ftrace_list; /* * We are entering ops into the ftrace_list but another @@ -68,6 +77,7 @@ int register_ftrace_function(struct ftrace_ops *ops) */ smp_wmb(); ftrace_list = ops; + /* * For one func, simply call it directly. * For more than one func, call the chain. @@ -76,28 +86,22 @@ int register_ftrace_function(struct ftrace_ops *ops) ftrace_trace_function = ops->func; else ftrace_trace_function = ftrace_list_func; - spin_unlock_irqrestore(&ftrace_func_lock, flags); + + spin_unlock(&ftrace_lock); return 0; } -/** - * unregister_ftrace_function - unresgister a function for profiling. - * @ops - ops structure that holds the function to unregister - * - * Unregister a function that was added to be called by ftrace profiling. - */ -int unregister_ftrace_function(struct ftrace_ops *ops) +static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) { - unsigned long flags; struct ftrace_ops **p; int ret = 0; - spin_lock_irqsave(&ftrace_func_lock, flags); + spin_lock(&ftrace_lock); /* - * If we are the only function, then the ftrace pointer is - * pointing directly to that function. + * If we are removing the last function, then simply point + * to the ftrace_stub. 
*/ if (ftrace_list == ops && ops->next == &ftrace_list_end) { ftrace_trace_function = ftrace_stub; @@ -117,22 +121,310 @@ int unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) ftrace_trace_function = ftrace_list->func; out: - spin_unlock_irqrestore(&ftrace_func_lock, flags); + spin_unlock(&ftrace_lock); + + return ret; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; + +static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); + +static DEFINE_SPINLOCK(ftrace_shutdown_lock); +static DEFINE_MUTEX(ftraced_lock); + +static int ftraced_trigger; +static int ftraced_suspend; + +static int ftrace_record_suspend; + +static inline int +notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key) +{ + struct dyn_ftrace *p; + struct hlist_node *t; + int found = 0; + + hlist_for_each_entry(p, t, &ftrace_hash[key], node) { + if (p->ip == ip) { + found = 1; + break; + } + } + + return found; +} + +static inline void notrace +ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) +{ + hlist_add_head(&node->node, &ftrace_hash[key]); +} + +static void notrace +ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *node; + unsigned long flags; + unsigned long key; + int resched; + int atomic; + + resched = need_resched(); + preempt_disable_notrace(); + + /* We simply need to protect against recursion */ + __get_cpu_var(ftrace_shutdown_disable_cpu)++; + if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1) + goto out; + + if (unlikely(ftrace_record_suspend)) + goto out; + + key = hash_long(ip, FTRACE_HASHBITS); + + WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + + if (ftrace_ip_in_hash(ip, key)) + goto out; + + atomic = irqs_disabled(); + + spin_lock_irqsave(&ftrace_shutdown_lock, flags); + + /* This ip may have hit the hash before the lock */ + if (ftrace_ip_in_hash(ip, key)) + goto out_unlock; + + /* + * There's a slight race that the ftraced will update the + * hash and reset here. The arch alloc is responsible + * for seeing if the IP has already changed, and if + * it has, the alloc will fail. 
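+ * Losing that race is harmless: an already-converted site never + * needs to be recorded again, and a failed allocation just means + * this ip is picked up the next time the function is hit.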
+ */ + node = ftrace_alloc_shutdown_node(ip); + if (!node) + goto out_unlock; + + node->ip = ip; + + ftrace_add_hash(node, key); + + ftraced_trigger = 1; + + out_unlock: + spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + out: + __get_cpu_var(ftrace_shutdown_disable_cpu)--; + + /* prevent recursion with scheduler */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops ftrace_shutdown_ops __read_mostly = +{ + .func = ftrace_record_ip, +}; + + +static int notrace __ftrace_modify_code(void *data) +{ + void (*func)(void) = data; + + func(); + return 0; +} + +static void notrace ftrace_run_startup_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); +} + +static void notrace ftrace_run_shutdown_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); +} + +static void notrace ftrace_startup(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend++; + if (ftraced_suspend != 1) + goto out; + __unregister_ftrace_function(&ftrace_shutdown_ops); + + ftrace_run_startup_code(); + out: + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend--; + if (ftraced_suspend) + goto out; + + ftrace_run_shutdown_code(); + + __register_ftrace_function(&ftrace_shutdown_ops); + out: + mutex_unlock(&ftraced_lock); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int notrace __ftrace_update_code(void *ignore) +{ + struct dyn_ftrace *p; + struct hlist_head head; + struct hlist_node *t; + cycle_t start, stop; + int i; + + /* Don't be calling ftrace ops now */ + __unregister_ftrace_function(&ftrace_shutdown_ops); + + start = now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + /* No locks needed, the machine is stopped! */ + for (i = 0; i < FTRACE_HASHSIZE; i++) { + if (hlist_empty(&ftrace_hash[i])) + continue; + + head = ftrace_hash[i]; + INIT_HLIST_HEAD(&ftrace_hash[i]); + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry(p, t, &head, node) { + ftrace_code_disable(p); + ftrace_update_cnt++; + } + + } + + stop = now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + + __register_ftrace_function(&ftrace_shutdown_ops); return 0; } +static void notrace ftrace_update_code(void) +{ + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); +} + +static int notrace ftraced(void *ignore) +{ + unsigned long usecs; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + /* check once a second */ + schedule_timeout(HZ); + + mutex_lock(&ftraced_lock); + if (ftraced_trigger && !ftraced_suspend) { + ftrace_record_suspend++; + ftrace_update_code(); + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? 
"s" : ""); + WARN_ON_ONCE(1); + } + ftraced_trigger = 0; + ftrace_record_suspend--; + } + mutex_unlock(&ftraced_lock); + + ftrace_shutdown_replenish(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __init notrace ftrace_shutdown_init(void) +{ + struct task_struct *p; + int ret; + + ret = ftrace_shutdown_arch_init(); + if (ret) + return ret; + + p = kthread_run(ftraced, NULL, "ftraced"); + if (IS_ERR(p)) + return -1; + + __register_ftrace_function(&ftrace_shutdown_ops); + + return 0; +} + +core_initcall(ftrace_shutdown_init); +#else +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + /** - * clear_ftrace_function - reset the ftrace function + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. */ -void clear_ftrace_function(void) +int register_ftrace_function(struct ftrace_ops *ops) { - ftrace_trace_function = ftrace_stub; + ftrace_startup(); + + return __register_ftrace_function(ops); +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + ret = __unregister_ftrace_function(ops); + + if (ftrace_list == &ftrace_list_end) + ftrace_shutdown(); + + return ret; } -- cgit v0.10.2 From b0fc494fae96a7089f3651cb451f461c7291244c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: add ftrace_enabled sysctl to disable mcount function This patch adds back the sysctl ftrace_enabled. This time it is defaulted to on, if DYNAMIC_FTRACE is configured. When ftrace_enabled is disabled, the ftrace function is set to the stub return. If DYNAMIC_FTRACE is also configured, on ftrace_enabled = 0, the registered ftrace functions will all be set to jmps, but no more new calls to ftrace recording (used to find the ftrace calling sites) will be called. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 90dbc0e..ccd8537 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -5,6 +5,12 @@ #include +extern int ftrace_enabled; +extern int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); struct ftrace_ops { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2911665..efaf7c5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -455,6 +456,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_FTRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ftrace_enable_sysctl, + }, +#endif #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d1ae2ba..d3de372 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -20,12 +20,24 @@ #include #include #include +#include #include #include #include "trace.h" +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_ENABLED_INIT 1 +#else +# define FTRACE_ENABLED_INIT 0 +#endif + +int ftrace_enabled = FTRACE_ENABLED_INIT; +static int last_ftrace_enabled = FTRACE_ENABLED_INIT; + static DEFINE_SPINLOCK(ftrace_lock); +static DEFINE_MUTEX(ftrace_sysctl_lock); + static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -78,14 +90,16 @@ static int notrace __register_ftrace_function(struct ftrace_ops *ops) smp_wmb(); ftrace_list = ops; - /* - * For one func, simply call it directly. - * For more than one func, call the chain. - */ - if (ops->next == &ftrace_list_end) - ftrace_trace_function = ops->func; - else - ftrace_trace_function = ftrace_list_func; + if (ftrace_enabled) { + /* + * For one func, simply call it directly. + * For more than one func, call the chain. 
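+ * + * If ftrace_enabled is off, ftrace_trace_function is left at + * ftrace_stub; the sysctl handler installs the real function + * when tracing is re-enabled.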
+ */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + } spin_unlock(&ftrace_lock); @@ -120,10 +134,12 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; - /* If we only have one func left, then call that directly */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_enabled) { + /* If we only have one func left, then call that directly */ + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + } out: spin_unlock(&ftrace_lock); @@ -263,7 +279,8 @@ static void notrace ftrace_startup(void) goto out; __unregister_ftrace_function(&ftrace_shutdown_ops); - ftrace_run_startup_code(); + if (ftrace_enabled) + ftrace_run_startup_code(); out: mutex_unlock(&ftraced_lock); } @@ -275,13 +292,32 @@ static void notrace ftrace_shutdown(void) if (ftraced_suspend) goto out; - ftrace_run_shutdown_code(); + if (ftrace_enabled) + ftrace_run_shutdown_code(); __register_ftrace_function(&ftrace_shutdown_ops); out: mutex_unlock(&ftraced_lock); } +static void notrace ftrace_startup_sysctl(void) +{ + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if we want ftrace running */ + if (ftraced_suspend) + ftrace_run_startup_code(); + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown_sysctl(void) +{ + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if ftrace is running */ + if (ftraced_suspend) + ftrace_run_shutdown_code(); + mutex_unlock(&ftraced_lock); +} + static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; @@ -341,8 +377,9 @@ static int notrace ftraced(void *ignore) /* check once a second */ schedule_timeout(HZ); + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - if (ftraced_trigger && !ftraced_suspend) { + if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { ftrace_record_suspend++; ftrace_update_code(); usecs = nsecs_to_usecs(ftrace_update_time); @@ -360,6 +397,7 @@ static int notrace ftraced(void *ignore) ftrace_record_suspend--; } mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); ftrace_shutdown_replenish(); @@ -389,8 +427,10 @@ static int __init notrace ftrace_shutdown_init(void) core_initcall(ftrace_shutdown_init); #else -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** @@ -406,9 +446,15 @@ core_initcall(ftrace_shutdown_init); */ int register_ftrace_function(struct ftrace_ops *ops) { + int ret; + + mutex_lock(&ftrace_sysctl_lock); ftrace_startup(); - return __register_ftrace_function(ops); + ret = __register_ftrace_function(ops); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; } /** @@ -421,10 +467,53 @@ int unregister_ftrace_function(struct ftrace_ops *ops) { int ret; + mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); if (ftrace_list == &ftrace_list_end) ftrace_shutdown(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +notrace int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + 
mutex_lock(&ftrace_sysctl_lock); + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + goto out; + + last_ftrace_enabled = ftrace_enabled; + + if (ftrace_enabled) { + + ftrace_startup_sysctl(); + + /* we are starting ftrace again */ + if (ftrace_list != &ftrace_list_end) { + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + else + ftrace_trace_function = ftrace_list_func; + } + + } else { + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + out: + mutex_unlock(&ftrace_sysctl_lock); return ret; } -- cgit v0.10.2 From dfa60aba04dae7833d75b2e2be124bb7cfb8239f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: use nops instead of jmp This patch patches the call to mcount with nops instead of a jmp over the mcount call. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 65c7857..de240ba 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { #ifdef CONFIG_X86_64 extern char __vsyscall_0; -static inline const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; @@ -162,7 +162,7 @@ static const struct nop { { -1, NULL } }; -static const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { const unsigned char *const *noptable = intel_nops; int i; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5dd5813..2e060c5 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -16,11 +16,12 @@ #include #include -#define CALL_BACK 5 +#include -#define JMPFWD 0x03eb +#define CALL_BACK 5 -static unsigned short ftrace_jmp = JMPFWD; +/* Long is fine, even if it is only 4 bytes ;-) */ +static long *ftrace_nop; struct ftrace_record { struct dyn_ftrace rec; @@ -55,13 +56,13 @@ static struct ftrace_page *ftrace_pages; notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) { struct ftrace_record *rec; - unsigned short save; + unsigned long save; ip -= CALL_BACK; - save = *(short *)ip; + save = *(long *)ip; /* If this was already converted, skip it */ - if (save == JMPFWD) + if (save == *ftrace_nop) return NULL; if (ftrace_pages->index == ENTRIES_PER_PAGE) { @@ -79,9 +80,10 @@ static int notrace ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { - unsigned short old = *(unsigned short *)old_code; - unsigned short new = *(unsigned short *)new_code; - unsigned short replaced; + unsigned replaced; + unsigned old = *(unsigned *)old_code; /* 4 bytes */ + unsigned new = *(unsigned *)new_code; /* 4 bytes */ + unsigned char newch = new_code[4]; int faulted = 0; /* @@ -94,7 +96,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, */ asm volatile ( "1: lock\n" - " cmpxchg %w3, (%2)\n" + " cmpxchg %3, (%2)\n" + " jnz 2f\n" + " movb %b4, 4(%2)\n" "2:\n" ".section .fixup, \"ax\"\n" " movl $1, %0\n" @@ -102,11 +106,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, ".previous\n" _ASM_EXTABLE(1b, 3b) : "=r"(faulted), "=a"(replaced) - : "r"(ip), "r"(new), "0"(faulted), 
"a"(old) + : "r"(ip), "r"(new), "r"(newch), + "0"(faulted), "a"(old) : "memory"); sync_core(); - if (replaced != old) + if (replaced != old && replaced != new) faulted = 2; return faulted; @@ -132,7 +137,7 @@ notrace void ftrace_code_disable(struct dyn_ftrace *rec) /* move the IP back to the start of the call */ ip -= CALL_BACK; - r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp); + r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop); } static void notrace ftrace_replace_code(int saved) @@ -144,9 +149,9 @@ static void notrace ftrace_replace_code(int saved) int i; if (saved) - old = (char *)&ftrace_jmp; + old = (char *)ftrace_nop; else - new = (char *)&ftrace_jmp; + new = (char *)ftrace_nop; for (pg = ftrace_pages_start; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { @@ -194,12 +199,15 @@ notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -notrace int ftrace_shutdown_arch_init(void) +notrace int __init ftrace_shutdown_arch_init(void) { + const unsigned char *const *noptable = find_nop_table(); struct ftrace_page *pg; int cnt; int i; + ftrace_nop = (unsigned long *)noptable[CALL_BACK]; + /* allocate a few pages */ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); if (!ftrace_pages_start) diff --git a/include/asm-x86/alternative.h b/include/asm-x86/alternative.h index 1f6a9ca..f6aa18e 100644 --- a/include/asm-x86/alternative.h +++ b/include/asm-x86/alternative.h @@ -72,6 +72,8 @@ static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} #endif /* CONFIG_SMP */ +const unsigned char *const *find_nop_table(void); + /* * Alternative instructions for different CPU types or capabilities. * -- cgit v0.10.2 From 3c1720f00bb619302ba19d55986ab565e74d06db Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: move memory management out of arch code This patch moves the memory management of the ftrace records out of the arch code and into the generic code making the arch code simpler. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2e060c5..b69795e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -23,25 +23,6 @@ /* Long is fine, even if it is only 4 bytes ;-) */ static long *ftrace_nop; -struct ftrace_record { - struct dyn_ftrace rec; - int failed; -} __attribute__((packed)); - -struct ftrace_page { - struct ftrace_page *next; - int index; - struct ftrace_record records[]; -} __attribute__((packed)); - -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -#define MCOUNT_ADDR ((long)(&mcount)) - union ftrace_code_union { char code[5]; struct { @@ -50,33 +31,41 @@ union ftrace_code_union { } __attribute__((packed)); }; -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +notrace int ftrace_ip_converted(unsigned long ip) { - struct ftrace_record *rec; unsigned long save; ip -= CALL_BACK; save = *(long *)ip; - /* If this was already converted, skip it */ - if (save == *ftrace_nop) - return NULL; + return save == *ftrace_nop; +} - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) - return NULL; - ftrace_pages = ftrace_pages->next; - } +static int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} - rec = &ftrace_pages->records[ftrace_pages->index++]; +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; - return &rec->rec; + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. 
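+ * The static 'calc' buffer above relies on this: a concurrent + * caller would overwrite it before the returned pointer had + * been consumed.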
+ */ + return calc.code; } -static int notrace +notrace int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -86,6 +75,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char newch = new_code[4]; int faulted = 0; + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -117,129 +109,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -static int notrace ftrace_calc_offset(long ip) -{ - return (int)(MCOUNT_ADDR - ip); -} - -notrace void ftrace_code_disable(struct dyn_ftrace *rec) -{ - unsigned long ip; - union ftrace_code_union save; - struct ftrace_record *r = - container_of(rec, struct ftrace_record, rec); - - ip = rec->ip; - - save.e8 = 0xe8; - save.offset = ftrace_calc_offset(ip); - - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - - r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop); -} - -static void notrace ftrace_replace_code(int saved) -{ - unsigned char *new = NULL, *old = NULL; - struct ftrace_record *rec; - struct ftrace_page *pg; - unsigned long ip; - int i; - - if (saved) - old = (char *)ftrace_nop; - else - new = (char *)ftrace_nop; - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - union ftrace_code_union calc; - rec = &pg->records[i]; - - /* don't modify code that has already faulted */ - if (rec->failed) - continue; - - ip = rec->rec.ip; - - calc.e8 = 0xe8; - calc.offset = ftrace_calc_offset(ip); - - if (saved) - new = calc.code; - else - old = calc.code; - - ip -= CALL_BACK; - - rec->failed = ftrace_modify_code(ip, old, new); - } - } - -} - -notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - -notrace void ftrace_shutdown_replenish(void) -{ - if (ftrace_pages->next) - return; - - /* allocate another page */ - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); -} - -notrace int __init ftrace_shutdown_arch_init(void) +int __init ftrace_dyn_arch_init(void) { const unsigned char *const *noptable = find_nop_table(); - struct ftrace_page *pg; - int cnt; - int i; ftrace_nop = (unsigned long *)noptable[CALL_BACK]; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; - - /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. 
- */ - - pg = ftrace_pages = ftrace_pages_start; - - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; - - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); - - /* If we fail, we'll try later anyway */ - if (!pg->next) - break; - - pg = pg->next; - } - return 0; } + diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ccd8537..d509ad6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -42,19 +42,23 @@ extern void mcount(void); # define FTRACE_HASHBITS 10 # define FTRACE_HASHSIZE (1<node, &ftrace_hash[key]); } +static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + /* If this was already converted, skip it */ + if (ftrace_ip_converted(ip)) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + static void notrace ftrace_record_ip(unsigned long ip, unsigned long parent_ip) { @@ -252,6 +282,62 @@ static struct ftrace_ops ftrace_shutdown_ops __read_mostly = .func = ftrace_record_ip, }; +#define MCOUNT_ADDR ((long)(&mcount)) + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long ip; + int failed; + int i; + + if (saved) + old = ftrace_nop_replace(); + else + new = ftrace_nop_replace(); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->flags & FTRACE_FL_FAILED) + continue; + + ip = rec->ip; + + if (saved) + new = ftrace_call_replace(ip, MCOUNT_ADDR); + else + old = ftrace_call_replace(ip, MCOUNT_ADDR); + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; + } + } +} + +static notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +static notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +static notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} static int notrace __ftrace_modify_code(void *data) { @@ -261,6 +347,23 @@ static int notrace __ftrace_modify_code(void *data) return 0; } +static notrace void +ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip; + unsigned char *nop, *call; + int failed; + + ip = rec->ip; + + nop = ftrace_nop_replace(); + call = ftrace_call_replace(ip, addr); + + failed = ftrace_modify_code(ip, call, nop); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + static void notrace ftrace_run_startup_code(void) { stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); @@ -346,7 +449,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p); + ftrace_code_disable(p, MCOUNT_ADDR); ftrace_update_cnt++; } @@ -407,12 +510,59 @@ static int notrace ftraced(void *ignore) return 0; } +static int __init ftrace_dyn_table_alloc(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + int ret; + + ret = ftrace_dyn_arch_init(); + if (ret) + return ret; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. 
+ * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + static int __init notrace ftrace_shutdown_init(void) { struct task_struct *p; int ret; - ret = ftrace_shutdown_arch_init(); + ret = ftrace_dyn_table_alloc(); if (ret) return ret; -- cgit v0.10.2 From d61f82d06672f57fca410da6f7fffd15867db622 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: use dynamic patching for updating mcount calls This patch replaces the indirect call to the mcount function pointer with a direct call that will be patched by the dynamic ftrace routines. On boot up, the mcount function calls the ftrace_stub function. When the dynamic ftrace code is initialized, the ftrace_stub is replaced with a call to the ftrace_record_ip, which records the instruction pointers of the locations that call it. Later, the ftraced daemon will call kstop_machine and patch all the locations to nops. When ftrace is enabled, the original calls to mcount will now be set to call ftrace_caller, which will do a direct call to the registered ftrace function. This direct call is also patched when the function that should be called is updated. All patching is performed by a kstop_machine routine to prevent the race conditions that are associated with modifying code on the fly. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f47b9b5..e6517ce 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1110,10 +1110,50 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + +.globl mcount_call +mcount_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! 
CONFIG_DYNAMIC_FTRACE */ + ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace - .globl ftrace_stub ftrace_stub: ret @@ -1126,7 +1166,7 @@ trace: movl 0xc(%esp), %eax movl 0x4(%ebp), %edx - call *ftrace_trace_function + call *ftrace_trace_function popl %edx popl %ecx @@ -1134,7 +1174,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f046e0c..fe25e5f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,70 @@ .code64 #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + +.globl mcount_call +mcount_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + retq +END(mcount) + +ENTRY(ftrace_caller) + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + +.globl ftrace_stub +ftrace_stub: + retq +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -89,7 +153,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b69795e..9f44623 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -109,10 +109,49 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -int __init ftrace_dyn_arch_init(void) +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[5], *new; + int ret; + + ip += CALL_BACK; + + memcpy(old, &ftrace_call, 5); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[5], *new; + + /* ip is at the location, but modify code will subtract this */ + ip += CALL_BACK; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. 
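+ * This also runs under kstop_machine (via ftrace_dyn_arch_init + * or __ftrace_modify_code), so the text patch cannot race with + * other CPUs executing the stub.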
+ */ + memcpy(old, &mcount_call, 5); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) { const unsigned char *const *noptable = find_nop_table(); + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + ftrace_nop = (unsigned long *)noptable[CALL_BACK]; return 0; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d509ad6..b0dd009 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -56,9 +56,14 @@ struct dyn_ftrace { extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr); -extern int ftrace_dyn_arch_init(void); +extern int ftrace_dyn_arch_init(void *data); +extern int ftrace_mcount_set(unsigned long *data); extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code); +extern int ftrace_update_ftrace_func(ftrace_func_t func); +extern void ftrace_caller(void); +extern void ftrace_call(void); +extern void mcount_call(void); #endif #ifdef CONFIG_FRAME_POINTER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6d9af3..88544f9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -26,14 +26,8 @@ #include "trace.h" -#ifdef CONFIG_DYNAMIC_FTRACE -# define FTRACE_ENABLED_INIT 1 -#else -# define FTRACE_ENABLED_INIT 0 -#endif - -int ftrace_enabled = FTRACE_ENABLED_INIT; -static int last_ftrace_enabled = FTRACE_ENABLED_INIT; +int ftrace_enabled; +static int last_ftrace_enabled; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -149,6 +143,14 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), +}; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); @@ -199,12 +201,8 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head(&node->node, &ftrace_hash[key]); } -static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - /* If this was already converted, skip it */ - if (ftrace_ip_converted(ip)) - return NULL; - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) return NULL; @@ -215,7 +213,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) } static void notrace -ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; unsigned long flags; @@ -223,6 +221,9 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) int resched; int atomic; + if (!ftrace_enabled) + return; + resched = need_resched(); preempt_disable_notrace(); @@ -251,11 +252,12 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) /* * There's a slight race that the ftraced will update the - * hash and reset here. The arch alloc is responsible - * for seeing if the IP has already changed, and if - * it has, the alloc will fail. + * hash and reset here. If it is already converted, skip it. 
*/ - node = ftrace_alloc_shutdown_node(ip); + if (ftrace_ip_converted(ip)) + goto out_unlock; + + node = ftrace_alloc_dyn_node(ip); if (!node) goto out_unlock; @@ -277,11 +279,7 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } -static struct ftrace_ops ftrace_shutdown_ops __read_mostly = -{ - .func = ftrace_record_ip, -}; - +#define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) static void notrace ftrace_replace_code(int saved) @@ -309,9 +307,9 @@ static void notrace ftrace_replace_code(int saved) ip = rec->ip; if (saved) - new = ftrace_call_replace(ip, MCOUNT_ADDR); + new = ftrace_call_replace(ip, FTRACE_ADDR); else - old = ftrace_call_replace(ip, MCOUNT_ADDR); + old = ftrace_call_replace(ip, FTRACE_ADDR); failed = ftrace_modify_code(ip, old, new); if (failed) @@ -320,16 +318,6 @@ static void notrace ftrace_replace_code(int saved) } } -static notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -static notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - static notrace void ftrace_shutdown_replenish(void) { if (ftrace_pages->next) @@ -339,16 +327,8 @@ static notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static int notrace __ftrace_modify_code(void *data) -{ - void (*func)(void) = data; - - func(); - return 0; -} - static notrace void -ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; unsigned char *nop, *call; @@ -357,67 +337,113 @@ ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, addr); + call = ftrace_call_replace(ip, MCOUNT_ADDR); failed = ftrace_modify_code(ip, call, nop); if (failed) rec->flags |= FTRACE_FL_FAILED; } -static void notrace ftrace_run_startup_code(void) +static int notrace __ftrace_modify_code(void *data) { - stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); + unsigned long addr; + int *command = data; + + if (*command & FTRACE_ENABLE_CALLS) + ftrace_replace_code(1); + else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_ENABLE_MCOUNT) { + addr = (unsigned long)ftrace_record_ip; + ftrace_mcount_set(&addr); + } else if (*command & FTRACE_DISABLE_MCOUNT) { + addr = (unsigned long)ftrace_stub; + ftrace_mcount_set(&addr); + } + + return 0; } -static void notrace ftrace_run_shutdown_code(void) +static void notrace ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); + stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } +static ftrace_func_t saved_ftrace_func; + static void notrace ftrace_startup(void) { + int command = 0; + mutex_lock(&ftraced_lock); ftraced_suspend++; - if (ftraced_suspend != 1) + if (ftraced_suspend == 1) + command |= FTRACE_ENABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) goto out; - __unregister_ftrace_function(&ftrace_shutdown_ops); - if (ftrace_enabled) - ftrace_run_startup_code(); + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown(void) { + int command = 0; + mutex_lock(&ftraced_lock); 
ftraced_suspend--; - if (ftraced_suspend) - goto out; + if (!ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; - if (ftrace_enabled) - ftrace_run_shutdown_code(); + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } - __register_ftrace_function(&ftrace_shutdown_ops); + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_startup_sysctl(void) { + int command = FTRACE_ENABLE_MCOUNT; + mutex_lock(&ftraced_lock); + /* Force update next time */ + saved_ftrace_func = NULL; /* ftraced_suspend is true if we want ftrace running */ if (ftraced_suspend) - ftrace_run_startup_code(); + command |= FTRACE_ENABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown_sysctl(void) { + int command = FTRACE_DISABLE_MCOUNT; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) - ftrace_run_shutdown_code(); + command |= FTRACE_DISABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } @@ -430,11 +456,13 @@ static int notrace __ftrace_update_code(void *ignore) struct dyn_ftrace *p; struct hlist_head head; struct hlist_node *t; + int save_ftrace_enabled; cycle_t start, stop; int i; - /* Don't be calling ftrace ops now */ - __unregister_ftrace_function(&ftrace_shutdown_ops); + /* Don't be recording funcs now */ + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; start = now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -449,7 +477,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p, MCOUNT_ADDR); + ftrace_code_disable(p); ftrace_update_cnt++; } @@ -459,7 +487,7 @@ static int notrace __ftrace_update_code(void *ignore) ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - __register_ftrace_function(&ftrace_shutdown_ops); + ftrace_enabled = save_ftrace_enabled; return 0; } @@ -515,11 +543,6 @@ static int __init ftrace_dyn_table_alloc(void) struct ftrace_page *pg; int cnt; int i; - int ret; - - ret = ftrace_dyn_arch_init(); - if (ret) - return ret; /* allocate a few pages */ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); @@ -557,11 +580,19 @@ static int __init ftrace_dyn_table_alloc(void) return 0; } -static int __init notrace ftrace_shutdown_init(void) +static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; + unsigned long addr; int ret; + addr = (unsigned long)ftrace_record_ip; + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + return addr; + ret = ftrace_dyn_table_alloc(); if (ret) return ret; @@ -570,12 +601,12 @@ static int __init notrace ftrace_shutdown_init(void) if (IS_ERR(p)) return -1; - __register_ftrace_function(&ftrace_shutdown_ops); + last_ftrace_enabled = ftrace_enabled = 1; return 0; } -core_initcall(ftrace_shutdown_init); +core_initcall(ftrace_dynamic_init); #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) @@ -599,9 +630,8 @@ int register_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_sysctl_lock); - ftrace_startup(); - ret = __register_ftrace_function(ops); + ftrace_startup(); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -619,10 +649,7 @@ int 
unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - - if (ftrace_list == &ftrace_list_end) - ftrace_shutdown(); - + ftrace_shutdown(); mutex_unlock(&ftrace_sysctl_lock); return ret; -- cgit v0.10.2 From 5072c59fd45e9976d02ee6f18c7336ef97623cbc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: add filter select functions to trace This patch adds two files to the debugfs system: /debugfs/tracing/available_filter_functions and /debugfs/tracing/set_ftrace_filter The available_filter_functions file lists all functions that have been recorded by ftraced via calls to the ftrace_record_ip function. This lets users see which functions have been converted to nops and can be enabled for tracing. To enable functions, simply echo the names (whitespace delimited) into set_ftrace_filter. Simple wildcards are also allowed. echo 'scheduler' > /debugfs/tracing/set_ftrace_filter Will have only the function named 'scheduler' activated when tracing is enabled. echo 'sched_*' > /debugfs/tracing/set_ftrace_filter Will have only the functions starting with 'sched_' activated. echo '*lock' > /debugfs/tracing/set_ftrace_filter Will have only functions ending with 'lock' activated. echo '*lock*' > /debugfs/tracing/set_ftrace_filter Will have only functions with 'lock' in their names activated. Note: 'sched*lock' will not work. The only wildcard allowed is an asterisk at the beginning and/or end of the string passed in. Multiple names can be passed in, whitespace delimited: echo 'scheduler *lock *acpi*' > /debugfs/tracing/set_ftrace_filter is the same as: echo 'scheduler' > /debugfs/tracing/set_ftrace_filter echo '*lock' >> /debugfs/tracing/set_ftrace_filter echo '*acpi*' >> /debugfs/tracing/set_ftrace_filter Appending does just that: it appends to the list. To disable all filters, simply echo an empty line: echo > /debugfs/tracing/set_ftrace_filter Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b0dd009..f5911d2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -43,7 +43,9 @@ extern void mcount(void); # define FTRACE_HASHSIZE (1< #include #include +#include +#include #include #include #include -#include +#include #include #include +#include #include #include "trace.h" @@ -151,12 +154,15 @@ enum { FTRACE_DISABLE_MCOUNT = (1 << 4), }; +static int ftrace_filtered; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); +static DEFINE_MUTEX(ftrace_filter_lock); struct ftrace_page { struct ftrace_page *next; @@ -282,16 +288,82 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) -static void notrace ftrace_replace_code(int saved) +static void notrace +__ftrace_replace_code(struct dyn_ftrace *rec, + unsigned char *old, unsigned char *new, int enable) +{ + unsigned long ip; + int failed; + + ip = rec->ip; + + if (ftrace_filtered && enable) { + unsigned long fl; + /* + * If filtering is on: + * + * If this record is set to be filtered and + * is enabled then do nothing. + * + * If this record is set to be filtered and + * it is not enabled, enable it. + * + * If this record is not set to be filtered + * and it is not enabled do nothing.
+ * + * If this record is not set to be filtered and + * it is enabled, disable it. + */ + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); + + if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || + (fl == 0)) + return; + + /* + * If it is enabled disable it, + * otherwise enable it! + */ + if (fl == FTRACE_FL_ENABLED) { + /* swap new and old */ + new = old; + old = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags &= ~FTRACE_FL_ENABLED; + } else { + new = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags |= FTRACE_FL_ENABLED; + } + } else { + + if (enable) + new = ftrace_call_replace(ip, FTRACE_ADDR); + else + old = ftrace_call_replace(ip, FTRACE_ADDR); + + if (enable) { + if (rec->flags & FTRACE_FL_ENABLED) + return; + rec->flags |= FTRACE_FL_ENABLED; + } else { + if (!(rec->flags & FTRACE_FL_ENABLED)) + return; + rec->flags &= ~FTRACE_FL_ENABLED; + } + } + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + +static void notrace ftrace_replace_code(int enable) { unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long ip; - int failed; int i; - if (saved) + if (enable) old = ftrace_nop_replace(); else new = ftrace_nop_replace(); @@ -304,16 +376,7 @@ static void notrace ftrace_replace_code(int saved) if (rec->flags & FTRACE_FL_FAILED) continue; - ip = rec->ip; - - if (saved) - new = ftrace_call_replace(ip, FTRACE_ADDR); - else - old = ftrace_call_replace(ip, FTRACE_ADDR); - - failed = ftrace_modify_code(ip, old, new); - if (failed) - rec->flags |= FTRACE_FL_FAILED; + __ftrace_replace_code(rec, old, new, enable); } } } @@ -580,6 +643,436 @@ static int __init ftrace_dyn_table_alloc(void) return 0; } +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_CONT = (1 << 1), +}; + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + struct ftrace_page *pg; + unsigned idx; + unsigned flags; + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned buffer_idx; + unsigned filtered; +}; + +static void notrace * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec = NULL; + + (*pos)++; + + retry: + if (iter->idx >= iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if ((rec->flags & FTRACE_FL_FAILED) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER))) { + rec = NULL; + goto retry; + } + } + + iter->pos = *pos; + + return rec; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l = -1; + + if (*pos != iter->pos) { + for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) + ; + } else { + l = *pos; + p = t_next(m, p, &l); + } + + return p; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + if (!rec) + return 0; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int notrace +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) 
+ return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->pos = -1; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + + return ret; +} + +int ftrace_avail_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter = m->private; + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static void notrace ftrace_filter_reset(void) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned i; + + /* keep kstop machine from running */ + preempt_disable(); + ftrace_filtered = 0; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + rec->flags &= ~FTRACE_FL_FILTER; + } + pg = pg->next; + } + preempt_enable(); +} + +static int notrace +ftrace_filter_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret = 0; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + mutex_lock(&ftrace_filter_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_filter_reset(); + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + iter->pos = -1; + iter->flags = FTRACE_ITER_FILTER; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + } else + file->private_data = iter; + mutex_unlock(&ftrace_filter_lock); + + return ret; +} + +static ssize_t notrace +ftrace_filter_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static loff_t notrace +ftrace_filter_lseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, origin); + else + file->f_pos = ret = 1; + + return ret; +} + +enum { + MATCH_FULL, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +static void notrace +ftrace_match(unsigned char *buff, int len) +{ + char str[KSYM_SYMBOL_LEN]; + char *search = NULL; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + unsigned i, match = 0, search_len = 0; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + search = buff + i + 1; + type = MATCH_END_ONLY; + search_len = len - (i + 1); + } else { + if (type == MATCH_END_ONLY) { + type = MATCH_MIDDLE_ONLY; + } else { + match = i; + type = MATCH_FRONT_ONLY; + } + buff[i] = 0; + break; + } + } + } + + /* keep kstop machine from running */ + preempt_disable(); + ftrace_filtered = 1; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + int matched = 0; + char *ptr; + + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, buff) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (memcmp(str, buff, match) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, search)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, search); + if (ptr && (ptr[search_len] == 0)) + matched = 1; + break; + } + if (matched) + rec->flags |= FTRACE_FL_FILTER; + } + pg = pg->next; + } + preempt_enable(); +} + +static ssize_t notrace 
+ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ftrace_iterator *iter; + char ch; + size_t read = 0; + ssize_t ret; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&ftrace_filter_lock); + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + iter = m->private; + } else + iter = file->private_data; + + if (!*ppos) { + iter->flags &= ~FTRACE_ITER_CONT; + iter->buffer_idx = 0; + } + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + if (!(iter->flags & ~FTRACE_ITER_CONT)) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + + if (isspace(ch)) { + file->f_pos += read; + ret = read; + goto out; + } + + iter->buffer_idx = 0; + } + + while (cnt && !isspace(ch)) { + if (iter->buffer_idx < FTRACE_BUFF_MAX) + iter->buffer[iter->buffer_idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx); + iter->buffer_idx = 0; + } else + iter->flags |= FTRACE_ITER_CONT; + + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&ftrace_filter_lock); + + return ret; +} + +static int notrace +ftrace_filter_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter; + + mutex_lock(&ftrace_filter_lock); + if (file->f_mode & FMODE_READ) { + iter = m->private; + + seq_release(inode, file); + } else + iter = file->private_data; + + if (iter->buffer_idx) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx); + } + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (iter->filtered && ftraced_suspend && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + kfree(iter); + mutex_unlock(&ftrace_filter_lock); + return 0; +} + +static struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, +}; + +static struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = ftrace_filter_read, + .write = ftrace_filter_write, + .llseek = ftrace_filter_lseek, + .release = ftrace_filter_release, +}; + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_filter_functions' entry\n"); + + entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_filter' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; @@ -657,14 +1150,14 @@ int unregister_ftrace_function(struct ftrace_ops *ops) notrace int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&ftrace_sysctl_lock); - ret = proc_dointvec(table, 
write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) goto out; -- cgit v0.10.2 From 4c11d7aed389375253b59e2b1865eec96663c65d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: convert single large buffer into single pages. Allocating large buffers for the tracer may fail easily. This patch converts the buffer from a large ordered allocation to single pages. It uses the struct page LRU field to link the pages together. Later patches may also implement dynamic increasing and decreasing of the trace buffers. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b8eca7..d7ad030 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,7 @@ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_data); static int tracer_enabled; -static unsigned long trace_nr_entries = 4096UL; +static unsigned long trace_nr_entries = 16384UL; static struct tracer *trace_types __read_mostly; static struct tracer *current_trace __read_mostly; @@ -57,6 +58,8 @@ static int max_tracer_type_len; static DEFINE_MUTEX(trace_types_lock); +#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) + static int __init set_nr_entries(char *str) { if (!str) @@ -103,6 +106,7 @@ static const char *trace_options[] = { static unsigned trace_flags; +static DEFINE_SPINLOCK(ftrace_max_lock); /* * Copy the new maximum trace into the separate maximum-trace @@ -136,17 +140,23 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data; void *save_trace; + struct list_head save_pages; int i; + WARN_ON_ONCE(!irqs_disabled()); + spin_lock(&ftrace_max_lock); /* clear out all the previous traces */ for_each_possible_cpu(i) { data = tr->data[i]; save_trace = max_tr.data[i]->trace; + save_pages = max_tr.data[i]->trace_pages; memcpy(max_tr.data[i], data, sizeof(*data)); data->trace = save_trace; + data->trace_pages = save_pages; } __update_max_tr(tr, tsk, cpu); + spin_unlock(&ftrace_max_lock); } /** @@ -160,16 +170,22 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; void *save_trace; + struct list_head save_pages; int i; + WARN_ON_ONCE(!irqs_disabled()); + spin_lock(&ftrace_max_lock); for_each_possible_cpu(i) tracing_reset(max_tr.data[i]); save_trace = max_tr.data[cpu]->trace; + save_pages = max_tr.data[cpu]->trace_pages; memcpy(max_tr.data[cpu], data, sizeof(*data)); data->trace = save_trace; + data->trace_pages = save_pages; __update_max_tr(tr, tsk, cpu); + spin_unlock(&ftrace_max_lock); } int register_tracer(struct tracer *type) @@ -236,7 +252,8 @@ void unregister_tracer(struct tracer *type) void notrace tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; - atomic_set(&data->underrun, 0); + data->trace_current = data->trace; + data->trace_current_idx = 0; } #ifdef CONFIG_FTRACE @@ -367,21 +384,27 @@ tracing_get_trace_entry(struct trace_array *tr, { unsigned long idx, idx_next; struct trace_entry *entry; + struct page *page; + struct list_head *next; - idx = data->trace_idx; + data->trace_idx++; + idx = data->trace_current_idx; idx_next = idx + 1; - if (unlikely(idx_next >= tr->entries)) { - atomic_inc(&data->underrun); + 
entry = data->trace_current + idx * TRACE_ENTRY_SIZE; + + if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { + page = virt_to_page(data->trace_current); + if (unlikely(&page->lru == data->trace_pages.prev)) + next = data->trace_pages.next; + else + next = page->lru.next; + page = list_entry(next, struct page, lru); + data->trace_current = page_address(page); idx_next = 0; } - data->trace_idx = idx_next; - - if (unlikely(idx_next != 0 && atomic_read(&data->underrun))) - atomic_inc(&data->underrun); - - entry = data->trace + idx * TRACE_ENTRY_SIZE; + data->trace_current_idx = idx_next; return entry; } @@ -442,21 +465,38 @@ enum trace_file_type { }; static struct trace_entry * -trace_entry_idx(struct trace_array *tr, unsigned long idx, int cpu) +trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, + struct trace_iterator *iter, int cpu) { - struct trace_entry *array = tr->data[cpu]->trace; - unsigned long underrun; + struct page *page; + struct trace_entry *array; - if (idx >= tr->entries) + if (iter->next_idx[cpu] >= tr->entries || + iter->next_idx[cpu] >= data->trace_idx) return NULL; - underrun = atomic_read(&tr->data[cpu]->underrun); - if (underrun) - idx = ((underrun - 1) + idx) % tr->entries; - else if (idx >= tr->data[cpu]->trace_idx) - return NULL; + if (!iter->next_page[cpu]) { + /* + * Initialize. If the count of elements in + * this buffer is greater than the max entries + * we had an underrun. Which means we looped around. + * We can simply use the current pointer as our + * starting point. + */ + if (data->trace_idx >= tr->entries) { + page = virt_to_page(data->trace_current); + iter->next_page[cpu] = &page->lru; + iter->next_page_idx[cpu] = data->trace_current_idx; + } else { + iter->next_page[cpu] = data->trace_pages.next; + iter->next_page_idx[cpu] = 0; + } + } - return &array[idx]; + page = list_entry(iter->next_page[cpu], struct page, lru); + array = page_address(page); + + return &array[iter->next_page_idx[cpu]]; } static struct notrace trace_entry * @@ -470,7 +510,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) for_each_possible_cpu(cpu) { if (!tr->data[cpu]->trace) continue; - ent = trace_entry_idx(tr, iter->next_idx[cpu], cpu); + ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); if (ent && (!next || (long)(next->idx - ent->idx) > 0)) { next = ent; @@ -492,8 +532,19 @@ static void *find_next_entry_inc(struct trace_iterator *iter) next = find_next_entry(iter, &next_cpu); if (next) { - iter->next_idx[next_cpu]++; iter->idx++; + iter->next_idx[next_cpu]++; + iter->next_page_idx[next_cpu]++; + if (iter->next_page_idx[next_cpu] >= ENTRIES_PER_PAGE) { + struct trace_array_cpu *data = iter->tr->data[next_cpu]; + + iter->next_page_idx[next_cpu] = 0; + iter->next_page[next_cpu] = + iter->next_page[next_cpu]->next; + if (iter->next_page[next_cpu] == &data->trace_pages) + iter->next_page[next_cpu] = + data->trace_pages.next; + } } iter->ent = next; iter->cpu = next_cpu; @@ -554,14 +605,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->cpu = 0; iter->idx = -1; - for (i = 0; i < NR_CPUS; i++) + for_each_possible_cpu(i) { iter->next_idx[i] = 0; + iter->next_page[i] = NULL; + } for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; } else { - l = *pos; + l = *pos - 1; p = s_next(m, p, &l); } @@ -654,9 +707,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long underruns = 0; - unsigned 
long underrun; - unsigned long entries = 0; + unsigned long total = 0; + unsigned long entries = 0; int cpu; const char *name = "preemption"; @@ -665,11 +717,10 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) for_each_possible_cpu(cpu) { if (tr->data[cpu]->trace) { - underrun = atomic_read(&tr->data[cpu]->underrun); - if (underrun) { - underruns += underrun; + total += tr->data[cpu]->trace_idx; + if (tr->data[cpu]->trace_idx > tr->entries) entries += tr->entries; - } else + else entries += tr->data[cpu]->trace_idx; } } @@ -682,7 +733,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) " (M:%s VP:%d, KP:%d, SP:%d HP:%d", data->saved_latency, entries, - (entries + underruns), + total, tr->cpu, #if defined(CONFIG_PREEMPT_NONE) "server", @@ -882,8 +933,7 @@ static int trace_empty(struct trace_iterator *iter) data = iter->tr->data[cpu]; if (data->trace && - (data->trace_idx || - atomic_read(&data->underrun))) + data->trace_idx) return 0; } return 1; @@ -1464,42 +1514,109 @@ static struct tracer no_tracer __read_mostly = .name = "none", }; -static inline notrace int page_order(const unsigned long size) +static int trace_alloc_page(void) { - const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE); - return ilog2(roundup_pow_of_two(nr_pages)); + struct trace_array_cpu *data; + void *array; + struct page *page, *tmp; + LIST_HEAD(pages); + int i; + + /* first allocate a page for each CPU */ + for_each_possible_cpu(i) { + array = (void *)__get_free_page(GFP_KERNEL); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); + goto free_pages; + } + + page = virt_to_page(array); + list_add(&page->lru, &pages); + +/* Only allocate if we are actually using the max trace */ +#ifdef CONFIG_TRACER_MAX_TRACE + array = (void *)__get_free_page(GFP_KERNEL); + if (array == NULL) { + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); + goto free_pages; + } + page = virt_to_page(array); + list_add(&page->lru, &pages); +#endif + } + + /* Now that we successfully allocate a page per CPU, add them */ + for_each_possible_cpu(i) { + data = global_trace.data[i]; + page = list_entry(pages.next, struct page, lru); + list_del(&page->lru); + list_add_tail(&page->lru, &data->trace_pages); + ClearPageLRU(page); + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + page = list_entry(pages.next, struct page, lru); + list_del(&page->lru); + list_add_tail(&page->lru, &data->trace_pages); + SetPageLRU(page); +#endif + } + global_trace.entries += ENTRIES_PER_PAGE; + + return 0; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, lru) { + list_del(&page->lru); + __free_page(page); + } + return -ENOMEM; } __init static int tracer_alloc_buffers(void) { - const int order = page_order(trace_nr_entries * TRACE_ENTRY_SIZE); - const unsigned long size = (1UL << order) << PAGE_SHIFT; - struct trace_entry *array; + struct trace_array_cpu *data; + void *array; + struct page *page; + int pages = 0; int i; + /* Allocate the first page for all buffers */ for_each_possible_cpu(i) { - global_trace.data[i] = &per_cpu(global_trace_cpu, i); + data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); max_tr.data[i] = &per_cpu(max_data, i); - array = (struct trace_entry *) - __get_free_pages(GFP_KERNEL, order); + array = (void *)__get_free_page(GFP_KERNEL); if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate" - " %ld bytes for trace buffer!\n", size); + printk(KERN_ERR "tracer: failed to allocate page" 
+ "for trace buffer!\n"); goto free_buffers; } - global_trace.data[i]->trace = array; + data->trace = array; + + /* set the array to the list */ + INIT_LIST_HEAD(&data->trace_pages); + page = virt_to_page(array); + list_add(&page->lru, &data->trace_pages); + /* use the LRU flag to differentiate the two buffers */ + ClearPageLRU(page); /* Only allocate if we are actually using the max trace */ #ifdef CONFIG_TRACER_MAX_TRACE - array = (struct trace_entry *) - __get_free_pages(GFP_KERNEL, order); + array = (void *)__get_free_page(GFP_KERNEL); if (array == NULL) { - printk(KERN_ERR "wakeup tracer: failed to allocate" - " %ld bytes for trace buffer!\n", size); + printk(KERN_ERR "tracer: failed to allocate page" + "for trace buffer!\n"); goto free_buffers; } max_tr.data[i]->trace = array; + + INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); + page = virt_to_page(array); + list_add(&page->lru, &max_tr.data[i]->trace_pages); + SetPageLRU(page); #endif } @@ -1507,11 +1624,18 @@ __init static int tracer_alloc_buffers(void) * Since we allocate by orders of pages, we may be able to * round up a bit. */ - global_trace.entries = size / TRACE_ENTRY_SIZE; + global_trace.entries = ENTRIES_PER_PAGE; max_tr.entries = global_trace.entries; + pages++; + + while (global_trace.entries < trace_nr_entries) { + if (trace_alloc_page()) + break; + pages++; + } - pr_info("tracer: %ld bytes allocated for %ld", - size, trace_nr_entries); + pr_info("tracer: %d pages allocated for %ld", + pages, trace_nr_entries); pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE); pr_info(" actual entries %ld\n", global_trace.entries); @@ -1526,17 +1650,26 @@ __init static int tracer_alloc_buffers(void) free_buffers: for (i-- ; i >= 0; i--) { + struct page *page, *tmp; struct trace_array_cpu *data = global_trace.data[i]; if (data && data->trace) { - free_pages((unsigned long)data->trace, order); + list_for_each_entry_safe(page, tmp, + &data->trace_pages, lru) { + list_del(&page->lru); + __free_page(page); + } data->trace = NULL; } #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; if (data && data->trace) { - free_pages((unsigned long)data->trace, order); + list_for_each_entry_safe(page, tmp, + &data->trace_pages, lru) { + list_del(&page->lru); + __free_page(page); + } data->trace = NULL; } #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3173a93..83e257e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -54,9 +54,11 @@ struct trace_entry { */ struct trace_array_cpu { void *trace; + void *trace_current; + unsigned trace_current_idx; + struct list_head trace_pages; unsigned long trace_idx; atomic_t disabled; - atomic_t underrun; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -112,8 +114,10 @@ struct trace_iterator { unsigned long iter_flags; loff_t pos; unsigned long next_idx[NR_CPUS]; + struct list_head *next_page[NR_CPUS]; + unsigned next_page_idx[NR_CPUS]; + long idx; int cpu; - int idx; }; void notrace tracing_reset(struct trace_array_cpu *data); -- cgit v0.10.2 From f43fdad8627fec2d21df92799b254dceb66c9c3c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: fix kexec disable the tracer while kexec pulls the rug from under the old kernel. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d0b234c..88923fd 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 576a03d..1558fdc 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f5911d2..a42390c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -68,6 +68,13 @@ extern void ftrace_call(void); extern void mcount_call(void); #endif +static inline void tracer_disable(void) +{ +#ifdef CONFIG_FTRACE + ftrace_enabled = 0; +#endif +} + #ifdef CONFIG_FRAME_POINTER /* TODO: need to fix this for ARM */ # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -- cgit v0.10.2 From 5568b139f4d196273958ae2947a736fdf1ffeece Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: debug smp_processor_id, use notrace preempt disable The debug smp_processor_id caused a recursive fault in debugging the irqsoff tracer. The tracer used smp_processor_id in the ftrace callback, and that function called preempt_disable, which is also traced. This caused a recursive fault (stack overflow). Since using smp_processor_id with debugging off does not cause faults with the tracer (even when the tracer is wrong), the debug version should not cause a system reboot either. This changes debug_smp_processor_id to use the notrace versions of preempt_disable and preempt_enable. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index e555ab6..3b4dc09 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -37,7 +37,7 @@ notrace unsigned int debug_smp_processor_id(void) /* * Avoid recursion: */ - preempt_disable(); + preempt_disable_notrace(); if (!printk_ratelimit()) goto out_enable; @@ -49,7 +49,7 @@ notrace unsigned int debug_smp_processor_id(void) dump_stack(); out_enable: - preempt_enable_no_resched(); + preempt_enable_no_resched_notrace(); out: return this_cpu; } -- cgit v0.10.2 From 361943ad0ba3f16e66859e30a408915e008ba91e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: irqs off smp_processor_id() fix The irqsoff function tracer used __get_cpu_var to determine whether it should trace the function or not. The problem is that we can be preempted inside __get_cpu_var, between getting the CPU and reading the cpu variable. This means that the cpu variable being read may not belong to the cpu we are currently running on. At worst, this can give a false positive, where we trace the function when we should not. It will never give a false negative, since we only want to trace when interrupts are disabled and we never preempt when they are. This fix adds a check after reading the irq flags to only trace if interrupts are actually disabled. It also changes the reading of the cpu variable to use raw_smp_processor_id, since we no longer care if we preempt; the flags test still catches that case. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8b12316..bd3f881 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -74,12 +74,21 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (likely(!__get_cpu_var(tracing_cpu))) + /* + * Does not matter if we preempt. We test the flags + * afterward, to see if irqs are disabled or not. + * If we preempt and get a false positive, the flags + * test will fail. + */ + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) return; local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); -- cgit v0.10.2 From 0764d23cf066c52de42b653144605b481d3fbdbc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: lockdep notrace annotations Add notrace annotations to lockdep to keep ftrace from causing recursive problems with lock tracing and debugging. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e219243..ac46847 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -271,14 +271,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE]; ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) -void lockdep_off(void) +notrace void lockdep_off(void) { current->lockdep_recursion++; } EXPORT_SYMBOL(lockdep_off); -void lockdep_on(void) +notrace void lockdep_on(void) { current->lockdep_recursion--; } @@ -1041,7 +1041,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) * Return 1 otherwise and keep unchanged. * Return 0 on error.
*/ -static noinline int +static noinline notrace int find_usage_backwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; @@ -1591,7 +1591,7 @@ static inline int validate_chain(struct task_struct *curr, * We are building curr_chain_key incrementally, so double-check * it from scratch, to make sure that it's done correctly: */ -static void check_chain_key(struct task_struct *curr) +static notrace void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; @@ -1967,7 +1967,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, /* * Mark all held locks with a usage bit: */ -static int +static notrace int mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; @@ -2260,8 +2260,8 @@ static inline int separate_irq_context(struct task_struct *curr, /* * Mark a lock with a usage bit, and validate the state transition: */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) +static notrace int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -2663,7 +2663,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) /* * Check whether we follow the irq-flags state precisely: */ -static void check_flags(unsigned long flags) +static notrace void check_flags(unsigned long flags) { #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) @@ -2700,8 +2700,8 @@ static void check_flags(unsigned long flags) * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ -void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, unsigned long ip) +notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, unsigned long ip) { unsigned long flags; @@ -2723,7 +2723,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, EXPORT_SYMBOL_GPL(lock_acquire); -void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +notrace void lock_release(struct lockdep_map *lock, int nested, + unsigned long ip) { unsigned long flags; diff --git a/kernel/spinlock.c b/kernel/spinlock.c index ae28c82..a1fb54c 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) } EXPORT_SYMBOL(_spin_trylock_bh); -int in_lock_functions(unsigned long addr) +notrace int in_lock_functions(unsigned long addr) { /* Linker adds these: start and end of __lockfunc functions */ extern char __lock_text_start[], __lock_text_end[]; -- cgit v0.10.2 From 18cef379d30f5ded20cc31d7f2d342639d39919d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: don't use raw_local_irq_save/restore Using raw_local_irq_save/restore confuses lockdep. It's fine to use the normal ones. 
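Roughly why: with CONFIG_TRACE_IRQFLAGS the normal variants also update lockdep's view of the hardirq state, while the raw ones do not. A simplified sketch of the macro shapes (illustrative, not the exact <linux/irqflags.h> definitions):

	#define local_irq_save(flags)					\
		do {							\
			raw_local_irq_save(flags); /* mask irqs */	\
			trace_hardirqs_off(); /* tell lockdep */	\
		} while (0)

Calling raw_local_irq_save() directly skips the trace_hardirqs_off() notification, so lockdep's bookkeeping stops matching the real hardware state, which is what confuses it.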
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7ad030..9175ce9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -269,7 +269,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!tracer_enabled)) return; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -278,7 +278,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) ftrace(tr, data, ip, parent_ip, flags); atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3e4771d..2715267 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,7 +29,7 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) if (!tracer_enabled) return; - raw_local_irq_save(flags); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -38,7 +38,7 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) tracing_sched_switch_trace(tr, data, prev, next, flags); atomic_dec(&data->disabled); - raw_local_irq_restore(flags); + local_irq_restore(flags); } void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) -- cgit v0.10.2 From 89b2f97819dd074297bbe3e19eaa4afcc98845ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: fix updates to max trace This patch fixes some bugs in the updating of the max trace that were caused by implementing the new buffering. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9175ce9..9596656 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -153,6 +153,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(max_tr.data[i], data, sizeof(*data)); data->trace = save_trace; data->trace_pages = save_pages; + tracing_reset(data); } __update_max_tr(tr, tsk, cpu); @@ -183,6 +184,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(max_tr.data[cpu], data, sizeof(*data)); data->trace = save_trace; data->trace_pages = save_pages; + tracing_reset(data); __update_max_tr(tr, tsk, cpu); spin_unlock(&ftrace_max_lock); @@ -877,6 +879,8 @@ print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, entry->ctx.next_prio, comm); break; + default: + seq_printf(m, "Unknown type %d\n", entry->type); } } @@ -1625,7 +1629,6 @@ __init static int tracer_alloc_buffers(void) * round up a bit.
*/ global_trace.entries = ENTRIES_PER_PAGE; - max_tr.entries = global_trace.entries; pages++; while (global_trace.entries < trace_nr_entries) { @@ -1633,6 +1636,7 @@ __init static int tracer_alloc_buffers(void) break; pages++; } + max_tr.entries = global_trace.entries; pr_info("tracer: %d pages allocated for %ld", pages, trace_nr_entries); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index bd3f881..74165f6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -23,6 +23,8 @@ static int tracer_enabled __read_mostly; static DEFINE_PER_CPU(int, tracing_cpu); +static DEFINE_SPINLOCK(max_trace_lock); + enum { TRACER_IRQS_OFF = (1 << 1), TRACER_PREEMPT_OFF = (1 << 2), @@ -126,7 +128,7 @@ check_critical_timing(struct trace_array *tr, int cpu) { unsigned long latency, t0, t1; - cycle_t T0, T1, T2, delta; + cycle_t T0, T1, delta; unsigned long flags; /* @@ -142,20 +144,18 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); - /* - * Update the timestamp, because the trace entry above - * might change it (it can only get larger so the latency - * is fair to be reported): - */ - T2 = now(cpu); + spin_lock(&max_trace_lock); - delta = T2-T0; + /* check if we are still the max latency */ + if (!report_latency(delta)) + goto out_unlock; + + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); latency = nsecs_to_usecs(delta); if (data->critical_sequence != max_sequence) - goto out; + goto out_unlock; tracing_max_latency = delta; t0 = nsecs_to_usecs(T0); @@ -189,6 +189,9 @@ check_critical_timing(struct trace_array *tr, max_sequence++; +out_unlock: + spin_unlock(&max_trace_lock); + out: data->critical_sequence = max_sequence; data->preempt_timestamp = now(cpu); @@ -366,14 +369,14 @@ void notrace trace_preempt_off(unsigned long a0, unsigned long a1) static void start_irqsoff_tracer(struct trace_array *tr) { - tracer_enabled = 1; register_ftrace_function(&trace_ops); + tracer_enabled = 1; } static void stop_irqsoff_tracer(struct trace_array *tr) { - unregister_ftrace_function(&trace_ops); tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); } static void __irqsoff_tracer_init(struct trace_array *tr) -- cgit v0.10.2 From 57f50be14d57b0dbf88dd019e7bb0ff3a3dc7b81 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: fix max latency Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9596656..9bad237 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -69,6 +69,11 @@ static int __init set_nr_entries(char *str) } __setup("trace_entries=", set_nr_entries); +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -733,7 +738,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "---------------------------------\n"); seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" " (M:%s VP:%d, KP:%d, SP:%d HP:%d", - data->saved_latency, + nsecs_to_usecs(data->saved_latency), entries, total, tr->cpu, @@ -771,11 +776,6 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n"); } -unsigned long nsecs_to_usecs(unsigned long nsecs) -{ - return nsecs / 1000; -} - static void notrace lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) { -- cgit v0.10.2 From e1c08bdd9fa73e44096e5a82c0d5928b04ab02c8 Mon Sep 17 00:00:00 2001 
From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: force recording Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a42390c..2c1670c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -54,6 +54,8 @@ struct dyn_ftrace { unsigned long flags; }; +int ftrace_force_update(void); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); @@ -66,6 +68,8 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +#else +# define ftrace_force_update() do { } while (0) #endif static inline void tracer_disable(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97d5cb7..4facf5c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -146,6 +146,10 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +static struct task_struct *ftraced_task; +static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters); +static unsigned long ftraced_iteration_counter; + enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -590,9 +594,12 @@ static int notrace ftraced(void *ignore) ftraced_trigger = 0; ftrace_record_suspend--; } + ftraced_iteration_counter++; mutex_unlock(&ftraced_lock); mutex_unlock(&ftrace_sysctl_lock); + wake_up_interruptible(&ftraced_waiters); + ftrace_shutdown_replenish(); set_current_state(TASK_INTERRUPTIBLE); @@ -1050,6 +1057,49 @@ static struct file_operations ftrace_filter_fops = { .release = ftrace_filter_release, }; +/** + * ftrace_force_update - force an update to all recording ftrace functions + * + * The ftrace dynamic update daemon only wakes up once a second. + * There may be cases where an update needs to be done immediately + * for tests or internal kernel tracing to begin. This function + * wakes the daemon to do an update and will not return until the + * update is complete. + */ +int ftrace_force_update(void) +{ + unsigned long last_counter; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + if (!ftraced_task) + return -ENODEV; + + mutex_lock(&ftraced_lock); + last_counter = ftraced_iteration_counter; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ftraced_waiters, &wait); + + do { + mutex_unlock(&ftraced_lock); + wake_up_process(ftraced_task); + schedule(); + mutex_lock(&ftraced_lock); + if (signal_pending(current)) { + ret = -EINTR; + break; + } + set_current_state(TASK_INTERRUPTIBLE); + } while (last_counter == ftraced_iteration_counter); + + mutex_unlock(&ftraced_lock); + remove_wait_queue(&ftraced_waiters, &wait); + set_current_state(TASK_RUNNING); + + return ret; +} + static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1095,6 +1145,7 @@ static int __init notrace ftrace_dynamic_init(void) return -1; last_ftrace_enabled = ftrace_enabled = 1; + ftraced_task = p; return 0; } -- cgit v0.10.2 From 60a11774b38fef1ab90b18c5353bd1c7c4d311c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: add self-tests Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cad9db1..3f73a17 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -105,3 +105,16 @@ config DYNAMIC_FTRACE wakes up once a second and checks to see if any ftrace calls were made. 
If so, it runs stop_machine (stops all CPUs) and modifies the code to jump over the call to ftrace. + +config FTRACE_SELFTEST + bool + +config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" + depends on TRACING + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup, + a series of tests is run to verify that the tracer is + functioning properly. It will run tests on all the configured + tracers of ftrace. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9bad237..f6d026f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -32,6 +32,8 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +static int tracing_disabled = 1; + static long notrace ns2usecs(cycle_t nsec) { @@ -217,11 +219,48 @@ int register_tracer(struct tracer *type) } } +#ifdef CONFIG_FTRACE_STARTUP_TEST + if (type->selftest) { + struct tracer *saved_tracer = current_trace; + struct trace_array_cpu *data; + struct trace_array *tr = &global_trace; + int saved_ctrl = tr->ctrl; + int i; + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. + */ + for_each_possible_cpu(i) { + if (!data->trace) + continue; + data = tr->data[i]; + tracing_reset(data); + } + current_trace = type; + tr->ctrl = 0; + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + current_trace = saved_tracer; + tr->ctrl = saved_ctrl; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + goto out; + } + printk(KERN_CONT "PASSED\n"); + } +#endif + type->next = trace_types; trace_types = type; len = strlen(type->name); if (len > max_tracer_type_len) max_tracer_type_len = len; + out: mutex_unlock(&trace_types_lock); @@ -985,6 +1024,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) { struct trace_iterator *iter; + if (tracing_disabled) { + *ret = -ENODEV; + return NULL; + } + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { *ret = -ENOMEM; @@ -1023,6 +1067,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) int tracing_open_generic(struct inode *inode, struct file *filp) { + if (tracing_disabled) + return -ENODEV; + filp->private_data = inode->i_private; return 0; } @@ -1128,6 +1175,9 @@ static int show_traces_open(struct inode *inode, struct file *file) { int ret; + if (tracing_disabled) + return -ENODEV; + ret = seq_open(file, &show_traces_seq_ops); if (!ret) { struct seq_file *m = file->private_data; @@ -1452,6 +1502,11 @@ struct dentry *tracing_init_dentry(void) return d_tracer; } +#ifdef CONFIG_FTRACE_SELFTEST +/* Let selftest have access to static functions in this file */ +#include "trace_selftest.c" +#endif + static __init void tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -1585,6 +1640,7 @@ __init static int tracer_alloc_buffers(void) void *array; struct page *page; int pages = 0; + int ret = -ENOMEM; int i; /* Allocate the first page for all buffers */ @@ -1650,6 +1706,9 @@ __init static int tracer_alloc_buffers(void) register_tracer(&no_tracer); current_trace = &no_tracer; + /* All seems OK, enable tracing */ + tracing_disabled = 0; + return 0; free_buffers: @@ -1678,7 +1737,7 @@ __init static int
tracer_alloc_buffers(void) } #endif } - return -ENOMEM; + return ret; } -device_initcall(tracer_alloc_buffers); +fs_initcall(tracer_alloc_buffers); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 83e257e..88edbf1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -99,6 +99,10 @@ struct tracer { void (*start)(struct trace_iterator *iter); void (*stop)(struct trace_iterator *iter); void (*ctrl_update)(struct trace_array *tr); +#ifdef CONFIG_FTRACE_STARTUP_TEST + int (*selftest)(struct tracer *trace, + struct trace_array *tr); +#endif struct tracer *next; int print_max; }; @@ -185,4 +189,31 @@ extern int unregister_tracer_switch(struct tracer_switch_ops *ops); extern unsigned long ftrace_update_tot_cnt; #endif +#ifdef CONFIG_FTRACE_STARTUP_TEST +#ifdef CONFIG_FTRACE +extern int trace_selftest_startup_function(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_IRQSOFF_TRACER +extern int trace_selftest_startup_irqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_PREEMPT_TRACER +extern int trace_selftest_startup_preemptoff(struct tracer *trace, + struct trace_array *tr); +#endif +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_SCHED_TRACER +extern int trace_selftest_startup_wakeup(struct tracer *trace, + struct trace_array *tr); +#endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern int trace_selftest_startup_sched_switch(struct tracer *trace, + struct trace_array *tr); +#endif +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 82988c5..5d8ad7a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -63,6 +63,9 @@ static struct tracer function_trace __read_mostly = .init = function_trace_init, .reset = function_trace_reset, .ctrl_update = function_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function, +#endif }; static __init int init_function_trace(void) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 74165f6..14183b8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -432,6 +432,9 @@ static struct tracer irqsoff_tracer __read_mostly = .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_irqsoff, +#endif }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -455,6 +458,9 @@ static struct tracer preemptoff_tracer __read_mostly = .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptoff, +#endif }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -480,6 +486,9 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .close = irqsoff_tracer_close, .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptirqsoff, +#endif }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 2715267..6c92841 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -107,6 +107,9 @@ static struct tracer 
sched_switch_trace __read_mostly = .init = sched_switch_trace_init, .reset = sched_switch_trace_reset, .ctrl_update = sched_switch_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_sched_switch, +#endif }; __init static int init_sched_switch_trace(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7c3ccef..3d10ff0 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -295,6 +295,9 @@ static struct tracer wakeup_tracer __read_mostly = .close = wakeup_tracer_close, .ctrl_update = wakeup_tracer_ctrl_update, .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif }; __init static int init_wakeup_tracer(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c new file mode 100644 index 0000000..ef4d3cc --- /dev/null +++ b/kernel/trace/trace_selftest.c @@ -0,0 +1,415 @@ +/* Include in trace.c */ + +#include <linux/kthread.h> + +static inline int trace_valid_entry(struct trace_entry *entry) +{ + switch (entry->type) { + case TRACE_FN: + case TRACE_CTX: + return 1; + } + return 0; +} + +static int +trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) +{ + struct page *page; + struct trace_entry *entries; + int idx = 0; + int i; + + page = list_entry(data->trace_pages.next, struct page, lru); + entries = page_address(page); + + if (data->trace != entries) + goto failed; + + /* + * The starting trace buffer always has valid elements, + * if any element exits. + */ + entries = data->trace; + + for (i = 0; i < tr->entries; i++) { + + if (i < data->trace_idx && + !trace_valid_entry(&entries[idx])) { + printk(KERN_CONT ".. invalid entry %d ", entries[idx].type); + goto failed; + } + + idx++; + if (idx >= ENTRIES_PER_PAGE) { + page = virt_to_page(entries); + if (page->lru.next == &data->trace_pages) { + if (i != tr->entries - 1) { + printk(KERN_CONT ".. entries buffer mismatch"); + goto failed; + } + } else { + page = list_entry(page->lru.next, struct page, lru); + entries = page_address(page); + } + idx = 0; + } + } + + page = virt_to_page(entries); + if (page->lru.next != &data->trace_pages) { + printk(KERN_CONT ".. too many entries"); + goto failed; + } + + return 0; + + failed: + printk(KERN_CONT ".. corrupted trace buffer .. "); + return -1; +} + +/* + * Test the trace buffer to see if all the elements + * are still sane. + */ +static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +{ + unsigned long cnt = 0; + int cpu; + int ret = 0; + + for_each_possible_cpu(cpu) { + if (!tr->data[cpu]->trace) + continue; + + cnt += tr->data[cpu]->trace_idx; + printk("%d: count = %ld\n", cpu, cnt); + + ret = trace_test_buffer_cpu(tr, tr->data[cpu]); + if (ret) + break; + } + + if (count) + *count = cnt; + + return ret; +} + +#ifdef CONFIG_FTRACE +/* + * Simple verification test of ftrace function tracer. + * Enable ftrace, sleep 1/10 second, and then read the trace + * buffer to see if all is in order. + */ +int +trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* make sure functions have been recorded */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing.
*/ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_FTRACE */ + +#ifdef CONFIG_IRQSOFF_TRACER +int +trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable interrupts for a bit */ + local_irq_disable(); + udelay(100); + local_irq_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +int +trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + /* disable preemption for a bit */ + preempt_disable(); + udelay(100); + preempt_enable(); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_PREEMPT_TRACER */ + +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +int +trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + + /* reset the max latency */ + tracing_max_latency = 0; + + /* disable preemption and interrupts for a bit */ + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + if (ret) + goto out; + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* do the test by disabling interrupts first this time */ + tracing_max_latency = 0; + tr->ctrl = 1; + trace->ctrl_update(tr); + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + goto out; + } + + out: + trace->reset(tr); + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ + +#ifdef CONFIG_SCHED_TRACER +static int trace_wakeup_test_thread(void *data) +{ + struct completion *x = data; + + /* Make this an RT thread; it doesn't need to be too high */ + + rt_mutex_setprio(current, MAX_RT_PRIO - 5); + + /* Let it know we have a new prio */ + complete(x); + + /* now go to sleep and let the test wake us up */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* we are awake, now wait to disappear */ + while (!kthread_should_stop()) { + /* + * This is an RT task, do short sleeps to let + * others run. + */ + msleep(100); + } + + return 0; +} + +int +trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + struct task_struct *p; + struct completion isrt; + unsigned long count; + int ret; + + init_completion(&isrt); + + /* create a high prio thread */ + p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + if (!IS_ERR(p)) { + printk(KERN_CONT "Failed to create ftrace wakeup test thread "); + return -1; + } + + /* make sure the thread is running at an RT prio */ + wait_for_completion(&isrt); + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* reset the max latency */ + tracing_max_latency = 0; + + /* sleep to let the RT thread sleep too */ + msleep(100); + + /* + * Yes, this is slightly racy. It is possible that for some + * strange reason the RT thread we created did not + * call schedule for 100ms after doing the completion, + * and we do a wakeup on a task that is already awake. + * But that is extremely unlikely, and the worst thing that + * happens in such a case is that we disable tracing. + * Honestly, if this race does happen, something is horribly + * wrong with the system. + */ + + wake_up_process(p); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + + + trace->reset(tr); + + tracing_max_latency = save_max; + + /* kill the thread */ + kthread_stop(p); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SCHED_TRACER */ + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +int +trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_DYNAMIC_FTRACE +#endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v0.10.2 From c7aafc549766b87819285d3480648fc652a47bc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: cleanups factor out code and clean it up.
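One detail worth calling out in these cleanups: the !CONFIG_DYNAMIC_FTRACE stub of ftrace_force_update() becomes a GNU statement expression instead of an empty do-while, so callers can still test its return value. A minimal sketch of the difference (illustrative only; the stub names here are made up):

#define force_update_stub_old()	do { } while (0)
#define force_update_stub_new()	({ 0; })	/* evaluates to 0 */

static int example_caller(void)
{
	int ret;

	/* ret = force_update_stub_old();  <-- would not compile */
	ret = force_update_stub_new();	/* fine: ret == 0 */
	return ret;
}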
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2c1670c..953a36d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -69,7 +69,7 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); #else -# define ftrace_force_update() do { } while (0) +# define ftrace_force_update() ({ 0; }) #endif static inline void tracer_disable(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4facf5c..6d4d2e8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1152,10 +1152,10 @@ static int __init notrace ftrace_dynamic_init(void) core_initcall(ftrace_dynamic_init); #else -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f6d026f..61d2f02 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -142,12 +142,59 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } +void check_pages(struct trace_array_cpu *data) +{ + struct page *page, *tmp; + + BUG_ON(data->trace_pages.next->prev != &data->trace_pages); + BUG_ON(data->trace_pages.prev->next != &data->trace_pages); + + list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { + BUG_ON(page->lru.next->prev != &page->lru); + BUG_ON(page->lru.prev->next != &page->lru); + } +} + +void *head_page(struct trace_array_cpu *data) +{ + struct page *page; + + check_pages(data); + if (list_empty(&data->trace_pages)) + return NULL; + + page = list_entry(data->trace_pages.next, struct page, lru); + BUG_ON(&page->lru == &data->trace_pages); + + return page_address(page); +} + +notrace static void +flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) +{ + struct list_head flip_pages; + + INIT_LIST_HEAD(&flip_pages); + + tr1->trace_current = NULL; + memcpy(&tr1->trace_current_idx, &tr2->trace_current_idx, + sizeof(struct trace_array_cpu) - + offsetof(struct trace_array_cpu, trace_current_idx)); + + check_pages(tr1); + check_pages(tr2); + list_splice_init(&tr1->trace_pages, &flip_pages); + list_splice_init(&tr2->trace_pages, &tr1->trace_pages); + list_splice_init(&flip_pages, &tr2->trace_pages); + BUG_ON(!list_empty(&flip_pages)); + check_pages(tr1); + check_pages(tr2); +} + notrace void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data; - void *save_trace; - struct list_head save_pages; int i; WARN_ON_ONCE(!irqs_disabled()); @@ -155,11 +202,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* clear out all the previous traces */ for_each_possible_cpu(i) { data = tr->data[i]; - save_trace = max_tr.data[i]->trace; - save_pages = max_tr.data[i]->trace_pages; - memcpy(max_tr.data[i], data, sizeof(*data)); - data->trace = save_trace; - data->trace_pages = save_pages; + flip_trace(max_tr.data[i], data); tracing_reset(data); } @@ -177,8 +220,6 @@ notrace void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; - void *save_trace; - struct list_head 
save_pages; int i; WARN_ON_ONCE(!irqs_disabled()); @@ -186,11 +227,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) for_each_possible_cpu(i) tracing_reset(max_tr.data[i]); - save_trace = max_tr.data[cpu]->trace; - save_pages = max_tr.data[cpu]->trace_pages; - memcpy(max_tr.data[cpu], data, sizeof(*data)); - data->trace = save_trace; - data->trace_pages = save_pages; + flip_trace(max_tr.data[cpu], data); + tracing_reset(data); __update_max_tr(tr, tsk, cpu); @@ -234,9 +272,9 @@ int register_tracer(struct tracer *type) * If we fail, we do not register this tracer. */ for_each_possible_cpu(i) { - if (!data->trace) - continue; data = tr->data[i]; + if (!head_page(data)) + continue; tracing_reset(data); } current_trace = type; @@ -298,7 +336,7 @@ void unregister_tracer(struct tracer *type) void notrace tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; - data->trace_current = data->trace; + data->trace_current = head_page(data); data->trace_current_idx = 0; } @@ -425,26 +463,31 @@ notrace void tracing_record_cmdline(struct task_struct *tsk) } static inline notrace struct trace_entry * -tracing_get_trace_entry(struct trace_array *tr, - struct trace_array_cpu *data) +tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) { unsigned long idx, idx_next; struct trace_entry *entry; - struct page *page; struct list_head *next; + struct page *page; data->trace_idx++; idx = data->trace_current_idx; idx_next = idx + 1; + BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); + entry = data->trace_current + idx * TRACE_ENTRY_SIZE; if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { page = virt_to_page(data->trace_current); - if (unlikely(&page->lru == data->trace_pages.prev)) - next = data->trace_pages.next; - else - next = page->lru.next; + /* + * Roundrobin - but skip the head (which is not a real page): + */ + next = page->lru.next; + if (unlikely(next == &data->trace_pages)) + next = next->next; + BUG_ON(next == &data->trace_pages); + page = list_entry(next, struct page, lru); data->trace_current = page_address(page); idx_next = 0; @@ -456,18 +499,17 @@ tracing_get_trace_entry(struct trace_array *tr, } static inline notrace void -tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags) +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) { struct task_struct *tsk = current; unsigned long pc; pc = preempt_count(); - entry->idx = atomic_inc_return(&tracer_counter); - entry->preempt_count = pc & 0xff; - entry->pid = tsk->pid; - entry->t = now(raw_smp_processor_id()); + entry->idx = atomic_inc_return(&tracer_counter); + entry->preempt_count = pc & 0xff; + entry->pid = tsk->pid; + entry->t = now(raw_smp_processor_id()); entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? 
TRACE_FLAG_SOFTIRQ : 0) | @@ -476,16 +518,15 @@ tracing_generic_entry_update(struct trace_entry *entry, notrace void ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, - unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); - entry->type = TRACE_FN; - entry->fn.ip = ip; - entry->fn.parent_ip = parent_ip; + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; } notrace void @@ -496,7 +537,7 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct trace_entry *entry; - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; entry->ctx.prev_pid = prev->pid; @@ -540,6 +581,8 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, } page = list_entry(iter->next_page[cpu], struct page, lru); + BUG_ON(&data->trace_pages == &page->lru); + array = page_address(page); return &array[iter->next_page_idx[cpu]]; @@ -554,7 +597,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) int cpu; for_each_possible_cpu(cpu) { - if (!tr->data[cpu]->trace) + if (!head_page(tr->data[cpu])) continue; ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); if (ent && @@ -762,7 +805,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) name = type->name; for_each_possible_cpu(cpu) { - if (tr->data[cpu]->trace) { + if (head_page(tr->data[cpu])) { total += tr->data[cpu]->trace_idx; if (tr->data[cpu]->trace_idx > tr->entries) entries += tr->entries; @@ -975,8 +1018,7 @@ static int trace_empty(struct trace_iterator *iter) for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; - if (data->trace && - data->trace_idx) + if (head_page(data) && data->trace_idx) return 0; } return 1; @@ -1576,9 +1618,9 @@ static struct tracer no_tracer __read_mostly = static int trace_alloc_page(void) { struct trace_array_cpu *data; - void *array; struct page *page, *tmp; LIST_HEAD(pages); + void *array; int i; /* first allocate a page for each CPU */ @@ -1610,14 +1652,14 @@ static int trace_alloc_page(void) for_each_possible_cpu(i) { data = global_trace.data[i]; page = list_entry(pages.next, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); ClearPageLRU(page); #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; page = list_entry(pages.next, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); SetPageLRU(page); #endif @@ -1628,7 +1670,7 @@ static int trace_alloc_page(void) free_pages: list_for_each_entry_safe(page, tmp, &pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } return -ENOMEM; @@ -1654,7 +1696,6 @@ __init static int tracer_alloc_buffers(void) "for trace buffer!\n"); goto free_buffers; } - data->trace = array; /* set the array to the list */ INIT_LIST_HEAD(&data->trace_pages); @@ -1671,7 +1712,6 @@ __init static int tracer_alloc_buffers(void) "for trace buffer!\n"); goto free_buffers; } - max_tr.data[i]->trace = array; INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); page = virt_to_page(array); @@ -1716,24 +1756,22 @@ __init static int tracer_alloc_buffers(void) struct page *page, *tmp; struct trace_array_cpu *data = global_trace.data[i]; - if 
(data && data->trace) { + if (data) { list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } - data->trace = NULL; } #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; - if (data && data->trace) { + if (data) { list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } - data->trace = NULL; } #endif } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 88edbf1..cc1d34b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -53,12 +53,12 @@ struct trace_entry { * the trace, etc.) */ struct trace_array_cpu { - void *trace; void *trace_current; - unsigned trace_current_idx; struct list_head trace_pages; - unsigned long trace_idx; atomic_t disabled; + /* these fields get copied into max-trace: */ + unsigned trace_current_idx; + unsigned long trace_idx; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -216,4 +216,6 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, #endif #endif /* CONFIG_FTRACE_STARTUP_TEST */ +extern void *head_page(struct trace_array_cpu *data); + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 14183b8..2dfebb6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -144,7 +144,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - spin_lock(&max_trace_lock); + spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -165,32 +165,24 @@ check_critical_timing(struct trace_array *tr, update_max_tr_single(tr, current, cpu); - if (tracing_thresh) - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " - "violates %lu us threshold.\n" - " => started at timestamp %lu: ", + if (tracing_thresh) { + printk(KERN_INFO "(%16s-%-5d|#%d):" + " %lu us critical section violates %lu us threshold.\n", current->comm, current->pid, raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh), t0); - else + latency, nsecs_to_usecs(tracing_thresh)); + } else { printk(KERN_INFO "(%16s-%-5d|#%d):" - " new %lu us maximum-latency " - "critical section.\n => started at timestamp %lu: ", + " new %lu us maximum-latency critical section.\n", current->comm, current->pid, raw_smp_processor_id(), - latency, t0); - - print_symbol(KERN_CONT "<%s>\n", data->critical_start); - printk(KERN_CONT " => ended at timestamp %lu: ", t1); - print_symbol(KERN_CONT "<%s>\n", data->critical_end); - dump_stack(); - t1 = nsecs_to_usecs(now(cpu)); - printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + latency); + } max_sequence++; out_unlock: - spin_unlock(&max_trace_lock); + spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; @@ -216,7 +208,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!data->trace) || + if (unlikely(!data) || unlikely(!head_page(data)) || atomic_read(&data->disabled)) return; @@ -256,7 +248,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!data->trace) || + if (unlikely(!data) || unlikely(!head_page(data)) || !data->critical_start || atomic_read(&data->disabled)) return; diff --git 
a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3d10ff0..688df96 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -107,24 +107,18 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) update_max_tr(tr, wakeup_task, wakeup_cpu); if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency " - "violates %lu us threshold.\n" - " => started at timestamp %lu: ", + printk(KERN_INFO "(%16s-%-5d|#%d):" + " %lu us wakeup latency violates %lu us threshold.\n", wakeup_task->comm, wakeup_task->pid, raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh), t0); + latency, nsecs_to_usecs(tracing_thresh)); } else { - printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum " - "wakeup latency.\n => started at timestamp %lu: ", + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum wakeup latency.\n", wakeup_task->comm, wakeup_task->pid, - cpu, latency, t0); + cpu, latency); } - printk(KERN_CONT " ended at timestamp %lu: ", t1); - dump_stack(); - t1 = nsecs_to_usecs(now(cpu)); - printk(KERN_CONT " dump-end timestamp %lu\n\n", t1); - out_unlock: __wakeup_reset(tr); spin_unlock_irqrestore(&wakeup_lock, flags); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index ef4d3cc..c01874c 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,6 +1,7 @@ /* Include in trace.c */ #include <linux/kthread.h> +#include <linux/delay.h> static inline int trace_valid_entry(struct trace_entry *entry) { @@ -15,28 +16,29 @@ static inline int trace_valid_entry(struct trace_entry *entry) static int trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) { - struct page *page; struct trace_entry *entries; + struct page *page; int idx = 0; int i; + BUG_ON(list_empty(&data->trace_pages)); page = list_entry(data->trace_pages.next, struct page, lru); entries = page_address(page); - if (data->trace != entries) + if (head_page(data) != entries) goto failed; /* * The starting trace buffer always has valid elements, - * if any element exits. + * if any element exists. */ - entries = data->trace; + entries = head_page(data); for (i = 0; i < tr->entries; i++) { - if (i < data->trace_idx && - !trace_valid_entry(&entries[idx])) { - printk(KERN_CONT ".. invalid entry %d ", entries[idx].type); + if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + printk(KERN_CONT ".. invalid entry %d ", + entries[idx].type); goto failed; } @@ -80,11 +82,10 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) int ret = 0; for_each_possible_cpu(cpu) { - if (!tr->data[cpu]->trace) + if (!head_page(tr->data[cpu])) continue; cnt += tr->data[cpu]->trace_idx; - printk("%d: count = %ld\n", cpu, cnt); ret = trace_test_buffer_cpu(tr, tr->data[cpu]); if (ret) @@ -117,6 +118,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) } /* start the tracing */ + ftrace_enabled = 1; + tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ @@ -124,6 +127,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* stop the tracing.
*/ tr->ctrl = 0; trace->ctrl_update(tr); + ftrace_enabled = 0; + /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); @@ -328,7 +333,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* create a high prio thread */ p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); - if (!IS_ERR(p)) { + if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } -- cgit v0.10.2 From 7bd2f24c2f769e3f8f1d4fc8b9fddf689825f6a7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: add README make it easier for newbies to find their way around. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 61d2f02..c736dd2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -995,6 +995,7 @@ print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) seq_printf(m, " <-"); seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); } + seq_printf(m, "\n"); break; case TRACE_CTX: S = entry->ctx.prev_state < sizeof(state_to_char) ? @@ -1007,7 +1008,6 @@ print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) entry->ctx.next_prio); break; } - seq_printf(m, "\n"); } static int trace_empty(struct trace_iterator *iter) @@ -1332,6 +1332,39 @@ static struct file_operations tracing_iter_fops = { .write = tracing_iter_ctrl_write, }; +static const char readme_msg[] = + "tracing mini-HOWTO:\n\n" + "# mkdir /debug\n" + "# mount -t debugfs nodev /debug\n\n" + "# cat /debug/tracing/available_tracers\n" + "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" + "# cat /debug/tracing/current_tracer\n" + "none\n" + "# echo sched_switch > /debug/tracing/current_tracer\n" + "# cat /debug/tracing/current_tracer\n" + "sched_switch\n" + "# cat /debug/tracing/iter_ctrl\n" + "noprint-parent nosym-offset nosym-addr noverbose\n" + "# echo print-parent > /debug/tracing/iter_ctrl\n" + "# echo 1 > /debug/tracing/tracing_enabled\n" + "# cat /debug/tracing/trace > /tmp/trace.txt\n" + "echo 0 > /debug/tracing/tracing_enabled\n" +; + +static ssize_t +tracing_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static struct file_operations tracing_readme_fops = { + .open = tracing_open_generic, + .read = tracing_readme_read, +}; + + static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1598,6 +1631,11 @@ static __init void tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'tracing_threash' entry\n"); + entry = debugfs_create_file("README", 0644, d_tracer, + NULL, &tracing_readme_fops); + if (!entry) + pr_warning("Could not create debugfs 'README' entry\n"); + #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, -- cgit v0.10.2 From 77a2b37d227483fe52aead242652aee406c25bf0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: startup tester on dynamic tracing. This patch adds a startup self test on dynamic code modification and filters. The test filters on a specific function, makes sure that no other function is traced, executes the function, then makes sure that the function is traced. This patch also fixes a slight bug with the ftrace selftest, where tracer_enabled was not being set.
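For a feel of how the filter interface added below is meant to be called, here is a rough sketch (hypothetical caller and function name; the real selftest filters on trace_selftest_dynamic_test_func via the STR() macro):

static int my_marker_func(void)		/* hypothetical function to trace */
{
	return 0;
}

static void filter_example(void)
{
	/* reset any old filters, then trace nothing but my_marker_func */
	ftrace_set_filter("my_marker_func", sizeof("my_marker_func"), 1);

	my_marker_func();	/* only this call should appear in the trace */

	/* NULL buffer with reset set re-enables tracing of all functions */
	ftrace_set_filter(NULL, 0, 1);
}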
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 953a36d..a842d96 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -55,6 +55,7 @@ struct dyn_ftrace { }; int ftrace_force_update(void); +void ftrace_set_filter(unsigned char *buf, int len, int reset); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); @@ -70,6 +71,7 @@ extern void ftrace_call(void); extern void mcount_call(void); #else # define ftrace_force_update() ({ 0; }) +# define ftrace_set_filter(buf, len, reset) do { } while (0) #endif static inline void tracer_disable(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d4d2e8..5e9389f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1010,6 +1010,25 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, return ret; } +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non-zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) +{ + mutex_lock(&ftrace_filter_lock); + if (reset) + ftrace_filter_reset(); + if (buf) + ftrace_match(buf, len); + mutex_unlock(&ftrace_filter_lock); +} + static int notrace ftrace_filter_release(struct inode *inode, struct file *file) { diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c01874c..4c8a1b2 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -99,6 +99,100 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) } #ifdef CONFIG_FTRACE + +#ifdef CONFIG_DYNAMIC_FTRACE + +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +#define __STR(x) #x +#define STR(x) __STR(x) +static int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} + +/* Test dynamic code modification and ftrace filters */ +int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* passed in by parameter to keep gcc from optimizing it away */ + func(); + + /* update the records */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* filter only on our function */ + ftrace_set_filter(STR(DYN_FTRACE_TEST_NAME), + sizeof(STR(DYN_FTRACE_TEST_NAME)), 1); + + /* enable tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(tr, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. "); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing.
*/ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + /* we should only have one item */ + if (!ret && count != 1) { + printk(KERN_CONT ".. filter failed .."); + ret = -1; + goto out; + } + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_filter(NULL, 0, 1); + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -109,8 +203,13 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { unsigned long count; int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; - /* make sure functions have been recorded */ + /* make sure msleep has been recorded */ + msleep(1); + + /* force the recorded functions to be traced */ ret = ftrace_force_update(); if (ret) { printk(KERN_CONT ".. ftraced failed .. "); @@ -119,6 +218,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* start the tracing */ ftrace_enabled = 1; + tracer_enabled = 1; tr->ctrl = 1; trace->init(tr); @@ -136,8 +236,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; + goto out; } + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + return ret; } #endif /* CONFIG_FTRACE */ @@ -415,6 +523,3 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr return ret; } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - -#ifdef CONFIG_DYNAMIC_FTRACE -#endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v0.10.2 From 4e3c3333f3bd7eedfd21b1155b3c7cd24fc7f754 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: fix time offset fix time offset calculations and ordering, plus make code more consistent. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c736dd2..8755a43 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -120,7 +120,7 @@ static DEFINE_SPINLOCK(ftrace_max_lock); * structure. 
(this way the maximum trace is permanently saved, * for later retrieval via /debugfs/tracing/latency_trace) */ -static void notrace +static notrace void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; @@ -333,15 +333,16 @@ void unregister_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); } -void notrace tracing_reset(struct trace_array_cpu *data) +notrace void tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; data->trace_current = head_page(data); data->trace_current_idx = 0; + data->time_offset = 0; } #ifdef CONFIG_FTRACE -static void notrace +static notrace void function_trace_call(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = &global_trace; @@ -398,7 +399,7 @@ static void trace_init_cmdlines(void) notrace void trace_stop_cmdline_recording(void); -static void notrace trace_save_cmdline(struct task_struct *tsk) +static notrace void trace_save_cmdline(struct task_struct *tsk) { unsigned map; unsigned idx; @@ -624,6 +625,7 @@ static void *find_next_entry_inc(struct trace_iterator *iter) iter->idx++; iter->next_idx[next_cpu]++; iter->next_page_idx[next_cpu]++; + if (iter->next_page_idx[next_cpu] >= ENTRIES_PER_PAGE) { struct trace_array_cpu *data = iter->tr->data[next_cpu]; @@ -635,19 +637,21 @@ static void *find_next_entry_inc(struct trace_iterator *iter) data->trace_pages.next; } } + iter->prev_ent = iter->ent; + iter->prev_cpu = iter->cpu; + iter->ent = next; iter->cpu = next_cpu; return next ? iter : NULL; } -static void notrace * -s_next(struct seq_file *m, void *v, loff_t *pos) +static notrace void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; - void *ent; void *last_ent = iter->ent; int i = (int)*pos; + void *ent; (*pos)++; @@ -693,6 +697,8 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->ent = NULL; iter->cpu = 0; iter->idx = -1; + iter->prev_ent = NULL; + iter->prev_cpu = -1; for_each_possible_cpu(i) { iter->next_idx[i] = 0; @@ -752,7 +758,7 @@ seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address) # define IP_FMT "%016lx" #endif -static void notrace +static notrace void seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags) { if (!ip) { @@ -769,7 +775,7 @@ seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags) seq_printf(m, " <" IP_FMT ">", ip); } -static void notrace print_lat_help_header(struct seq_file *m) +static notrace void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); seq_puts(m, "# / _-----=> irqs-off \n"); @@ -782,14 +788,14 @@ static void notrace print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void notrace print_func_help_header(struct seq_file *m) +static notrace void print_func_help_header(struct seq_file *m) { seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void notrace +static notrace void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -858,7 +864,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n"); } -static void notrace +static notrace void lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) { int hardirq, softirq; @@ -895,7 +901,7 @@ lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) unsigned long preempt_mark_thresh 
= 100; -static void notrace +static notrace void lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs, unsigned long rel_usecs) { @@ -910,7 +916,7 @@ lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static void notrace +static notrace void print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, unsigned int trace_idx, int cpu) { @@ -966,20 +972,50 @@ print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, } } -static void notrace +static notrace void sync_time_offset(struct trace_iterator *iter) +{ + struct trace_array_cpu *prev_array, *array; + struct trace_entry *prev_entry, *entry; + cycle_t prev_t, t; + + entry = iter->ent; + prev_entry = iter->prev_ent; + if (!prev_entry) + return; + + prev_array = iter->tr->data[iter->prev_cpu]; + array = iter->tr->data[iter->cpu]; + + prev_t = prev_entry->t + prev_array->time_offset; + t = entry->t + array->time_offset; + + /* + * If time goes backwards we increase the offset of + * the current array, to not have observable time warps. + * This will quickly synchronize the time offsets of + * multiple CPUs: + */ + if (t < prev_t) + array->time_offset += prev_t - t; +} + +static notrace void print_trace_fmt(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *entry = iter->ent; + struct trace_entry *entry; unsigned long usec_rem; unsigned long long t; unsigned long secs; char *comm; int S; + sync_time_offset(iter); + entry = iter->ent; + comm = trace_find_cmdline(iter->ent->pid); - t = ns2usecs(entry->t); + t = ns2usecs(entry->t + iter->tr->data[iter->cpu]->time_offset); usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; @@ -1158,7 +1194,7 @@ static int tracing_lt_open(struct inode *inode, struct file *file) } -static void notrace * +static notrace void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct tracer *t = m->private; @@ -1374,8 +1410,7 @@ tracing_ctrl_read(struct file *filp, char __user *ubuf, int r; r = sprintf(buf, "%ld\n", tr->ctrl); - return simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc1d34b..5df8ff2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,6 +56,8 @@ struct trace_array_cpu { void *trace_current; struct list_head trace_pages; atomic_t disabled; + cycle_t time_offset; + /* these fields get copied into max-trace: */ unsigned trace_current_idx; unsigned long trace_idx; @@ -114,14 +116,19 @@ struct tracer { struct trace_iterator { struct trace_array *tr; struct tracer *trace; + struct trace_entry *ent; + int cpu; + + struct trace_entry *prev_ent; + int prev_cpu; + unsigned long iter_flags; loff_t pos; unsigned long next_idx[NR_CPUS]; struct list_head *next_page[NR_CPUS]; unsigned next_page_idx[NR_CPUS]; long idx; - int cpu; }; void notrace tracing_reset(struct trace_array_cpu *data); -- cgit v0.10.2 From 08bafa0efcf29fe18ec39c2147077b597368b018 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: disable all tracers on corrupted buffer If the trace buffer is detected to be corrupted, then we disable all tracers. 
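The effect is that a corrupted buffer now also arms the tracing_disabled flag introduced earlier in the series, so every subsequent open of a tracing file fails. A minimal sketch of that pattern (illustrative, mirroring the open guards added earlier):

static int example_tracing_open(struct inode *inode, struct file *filp)
{
	if (tracing_disabled)		/* set when corruption is found */
		return -ENODEV;

	filp->private_data = inode->i_private;
	return 0;
}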
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 4c8a1b2..a6f1ed7 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -67,6 +67,8 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) return 0; failed: + /* disable tracing */ + tracing_disabled = 1; printk(KERN_CONT ".. corrupted trace buffer .. "); return -1; } -- cgit v0.10.2 From 1d4db00a5e30c7b8f8dc2a1b19e886fd942be143 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: reset selftests The tests may leave data behind in the buffers. This resets the buffers after each test is run. If a test fails, it does not reset the buffer, to avoid touching a buffer that is corrupted. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8755a43..6580e7e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -289,6 +289,13 @@ int register_tracer(struct tracer *type) printk(KERN_CONT "FAILED!\n"); goto out; } + /* Only reset on passing, to avoid touching corrupted buffers */ + for_each_possible_cpu(i) { + data = tr->data[i]; + if (!head_page(data)) + continue; + tracing_reset(data); + } printk(KERN_CONT "PASSED\n"); } #endif -- cgit v0.10.2 From 93a588f459da134be6ab17c4104e28441beb0d22 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: change buffers to producer consumer This patch changes the way the CPU trace buffers are handled. Instead of always starting from the trace page head, the buffers are now handled with producer/consumer logic: the head is where new entries are produced and the tail is where they are consumed. This allows the buffers to be drained while tracing is live.
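A simplified sketch of the head/tail scheme (illustrative only: a flat ring of slots here, whereas the real buffers are linked lists of pages with separate in-page indexes):

struct ring {
	unsigned int head;	/* producer: next slot to write */
	unsigned int tail;	/* consumer: next slot to read */
	unsigned int size;	/* total number of slots */
};

static void ring_produce(struct ring *r)
{
	r->head = (r->head + 1) % r->size;
	if (r->head == r->tail)			/* overrun: drop oldest */
		r->tail = (r->tail + 1) % r->size;
}

static int ring_consume(struct ring *r)
{
	if (r->tail == r->head)			/* empty */
		return -1;
	r->tail = (r->tail + 1) % r->size;
	return 0;
}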
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6580e7e..777b859 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -176,10 +176,9 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) INIT_LIST_HEAD(&flip_pages); - tr1->trace_current = NULL; - memcpy(&tr1->trace_current_idx, &tr2->trace_current_idx, + memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, sizeof(struct trace_array_cpu) - - offsetof(struct trace_array_cpu, trace_current_idx)); + offsetof(struct trace_array_cpu, trace_head_idx)); check_pages(tr1); check_pages(tr2); @@ -228,7 +227,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_reset(max_tr.data[i]); flip_trace(max_tr.data[cpu], data); - tracing_reset(data); __update_max_tr(tr, tsk, cpu); @@ -343,9 +341,9 @@ void unregister_tracer(struct tracer *type) notrace void tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; - data->trace_current = head_page(data); - data->trace_current_idx = 0; - data->time_offset = 0; + data->trace_head = data->trace_tail = head_page(data); + data->trace_head_idx = 0; + data->trace_tail_idx = 0; } #ifdef CONFIG_FTRACE @@ -470,38 +468,65 @@ notrace void tracing_record_cmdline(struct task_struct *tsk) trace_save_cmdline(tsk); } +static inline notrace struct list_head * +trace_next_list(struct trace_array_cpu *data, struct list_head *next) +{ + /* + * Roundrobin - but skip the head (which is not a real page): + */ + next = next->next; + if (unlikely(next == &data->trace_pages)) + next = next->next; + BUG_ON(next == &data->trace_pages); + + return next; +} + +static inline notrace void * +trace_next_page(struct trace_array_cpu *data, void *addr) +{ + struct list_head *next; + struct page *page; + + page = virt_to_page(addr); + + next = trace_next_list(data, &page->lru); + page = list_entry(next, struct page, lru); + + return page_address(page); +} + static inline notrace struct trace_entry * tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) { unsigned long idx, idx_next; struct trace_entry *entry; - struct list_head *next; - struct page *page; data->trace_idx++; - idx = data->trace_current_idx; + idx = data->trace_head_idx; idx_next = idx + 1; BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); - entry = data->trace_current + idx * TRACE_ENTRY_SIZE; + entry = data->trace_head + idx * TRACE_ENTRY_SIZE; if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { - page = virt_to_page(data->trace_current); - /* - * Roundrobin - but skip the head (which is not a real page): - */ - next = page->lru.next; - if (unlikely(next == &data->trace_pages)) - next = next->next; - BUG_ON(next == &data->trace_pages); - - page = list_entry(next, struct page, lru); - data->trace_current = page_address(page); + data->trace_head = trace_next_page(data, data->trace_head); idx_next = 0; } - data->trace_current_idx = idx_next; + if (data->trace_head == data->trace_tail && + idx_next == data->trace_tail_idx) { + /* overrun */ + data->trace_tail_idx++; + if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { + data->trace_tail = + trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } + } + + data->trace_head_idx = idx_next; return entry; } @@ -571,21 +596,11 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, return NULL; if (!iter->next_page[cpu]) { - /* - * Initialize. 
If the count of elements in - * this buffer is greater than the max entries - * we had an underrun. Which means we looped around. - * We can simply use the current pointer as our - * starting point. - */ - if (data->trace_idx >= tr->entries) { - page = virt_to_page(data->trace_current); - iter->next_page[cpu] = &page->lru; - iter->next_page_idx[cpu] = data->trace_current_idx; - } else { - iter->next_page[cpu] = data->trace_pages.next; - iter->next_page_idx[cpu] = 0; - } + /* Initialize the iterator for this cpu trace buffer */ + WARN_ON(!data->trace_tail); + page = virt_to_page(data->trace_tail); + iter->next_page[cpu] = &page->lru; + iter->next_page_idx[cpu] = data->trace_tail_idx; } page = list_entry(iter->next_page[cpu], struct page, lru); @@ -593,6 +608,12 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, array = page_address(page); + /* Still possible to catch up to the tail */ + if (iter->next_idx[cpu] && array == data->trace_tail && + iter->next_page_idx[cpu] == data->trace_tail_idx) + return NULL; + + WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); return &array[iter->next_page_idx[cpu]]; } @@ -638,10 +659,8 @@ static void *find_next_entry_inc(struct trace_iterator *iter) iter->next_page_idx[next_cpu] = 0; iter->next_page[next_cpu] = - iter->next_page[next_cpu]->next; - if (iter->next_page[next_cpu] == &data->trace_pages) - iter->next_page[next_cpu] = - data->trace_pages.next; + trace_next_list(data, iter->next_page[next_cpu]); + } } iter->prev_ent = iter->ent; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5df8ff2..0ce1274 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -53,13 +53,15 @@ struct trace_entry { * the trace, etc.) */ struct trace_array_cpu { - void *trace_current; struct list_head trace_pages; atomic_t disabled; cycle_t time_offset; /* these fields get copied into max-trace: */ - unsigned trace_current_idx; + unsigned trace_head_idx; + unsigned trace_tail_idx; + void *trace_head; /* producer */ + void *trace_tail; /* consumer */ unsigned long trace_idx; unsigned long saved_latency; unsigned long critical_start; -- cgit v0.10.2 From 214023c3d13a71525e463b5e54e360b926b4dc90 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: add a buffer for output Later patches will need to print the same things as the seq output does. But those outputs will not use the seq utility. This patch adds a buffer to the iterator, that can be used by either the seq utility or other output. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 777b859..d39f4fa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -169,6 +169,66 @@ void *head_page(struct trace_array_cpu *data) return page_address(page); } +static notrace int +trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
+{ + int len = (PAGE_SIZE - 1) - s->len; + va_list ap; + + if (!len) + return 0; + + va_start(ap, fmt); + len = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + s->len += len; + + return len; +} + +static notrace int +trace_seq_puts(struct trace_seq *s, const char *str) +{ + int len = strlen(str); + + if (len > ((PAGE_SIZE - 1) - s->len)) + len = (PAGE_SIZE - 1) - s->len; + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} + +static notrace int +trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->len >= (PAGE_SIZE - 1)) + return 0; + + s->buffer[s->len++] = c; + + return 1; +} + +static notrace void +trace_seq_reset(struct trace_seq *s) +{ + s->len = 0; +} + +static notrace void +trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_reset(s); +} + notrace static void flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) { @@ -756,25 +816,26 @@ static void s_stop(struct seq_file *m, void *p) } static void -seq_print_sym_short(struct seq_file *m, const char *fmt, unsigned long address) +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; kallsyms_lookup(address, NULL, NULL, NULL, str); - seq_printf(m, fmt, str); + trace_seq_printf(s, fmt, str); #endif } static void -seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address) +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; sprint_symbol(str, address); - seq_printf(m, fmt, str); + trace_seq_printf(s, fmt, str); #endif } @@ -785,20 +846,20 @@ seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address) #endif static notrace void -seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags) +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { if (!ip) { - seq_printf(m, "0"); + trace_seq_printf(s, "0"); return; } if (sym_flags & TRACE_ITER_SYM_OFFSET) - seq_print_sym_offset(m, "%s", ip); + seq_print_sym_offset(s, "%s", ip); else - seq_print_sym_short(m, "%s", ip); + seq_print_sym_short(s, "%s", ip); if (sym_flags & TRACE_ITER_SYM_ADDR) - seq_printf(m, " <" IP_FMT ">", ip); + trace_seq_printf(s, " <" IP_FMT ">", ip); } static notrace void print_lat_help_header(struct seq_file *m) @@ -881,9 +942,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) if (data->critical_start) { seq_puts(m, " => started at: "); - seq_print_ip_sym(m, data->critical_start, sym_flags); + seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); + trace_print_seq(m, &iter->seq); seq_puts(m, "\n => ended at: "); - seq_print_ip_sym(m, data->critical_end, sym_flags); + seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); + trace_print_seq(m, &iter->seq); seq_puts(m, "\n"); } @@ -891,61 +954,61 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) } static notrace void -lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu) +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; char *comm; comm = trace_find_cmdline(entry->pid); - seq_printf(m, "%8.8s-%-5d ", comm, entry->pid); - seq_printf(m, "%d", cpu); - seq_printf(m, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 
'd' : '.', - ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%d", cpu); + trace_seq_printf(s, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; if (hardirq && softirq) - seq_putc(m, 'H'); + trace_seq_putc(s, 'H'); else { if (hardirq) - seq_putc(m, 'h'); + trace_seq_putc(s, 'h'); else { if (softirq) - seq_putc(m, 's'); + trace_seq_putc(s, 's'); else - seq_putc(m, '.'); + trace_seq_putc(s, '.'); } } if (entry->preempt_count) - seq_printf(m, "%x", entry->preempt_count); + trace_seq_printf(s, "%x", entry->preempt_count); else - seq_puts(m, "."); + trace_seq_puts(s, "."); } unsigned long preempt_mark_thresh = 100; static notrace void -lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs, +lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, unsigned long rel_usecs) { - seq_printf(m, " %4lldus", abs_usecs); + trace_seq_printf(s, " %4lldus", abs_usecs); if (rel_usecs > preempt_mark_thresh) - seq_puts(m, "!: "); + trace_seq_puts(s, "!: "); else if (rel_usecs > 1) - seq_puts(m, "+: "); + trace_seq_puts(s, "+: "); else - seq_puts(m, " : "); + trace_seq_puts(s, " : "); } static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; static notrace void -print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, - unsigned int trace_idx, int cpu) +print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { + struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *next_entry = find_next_entry(iter, NULL); unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); @@ -962,39 +1025,40 @@ print_lat_fmt(struct seq_file *m, struct trace_iterator *iter, if (verbose) { comm = trace_find_cmdline(entry->pid); - seq_printf(m, "%16s %5d %d %d %08x %08x [%08lx]" - " %ld.%03ldms (+%ld.%03ldms): ", - comm, - entry->pid, cpu, entry->flags, - entry->preempt_count, trace_idx, - ns2usecs(entry->t), - abs_usecs/1000, - abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, cpu, entry->flags, + entry->preempt_count, trace_idx, + ns2usecs(entry->t), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, + rel_usecs % 1000); } else { - lat_print_generic(m, entry, cpu); - lat_print_timestamp(m, abs_usecs, rel_usecs); + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); } switch (entry->type) { case TRACE_FN: - seq_print_ip_sym(m, entry->fn.ip, sym_flags); - seq_puts(m, " ("); - seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags); - seq_puts(m, ")\n"); + seq_print_ip_sym(s, entry->fn.ip, sym_flags); + trace_seq_puts(s, " ("); + seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + trace_seq_puts(s, ")\n"); break; case TRACE_CTX: S = entry->ctx.prev_state < sizeof(state_to_char) ? 
state_to_char[entry->ctx.prev_state] : 'X';
 		comm = trace_find_cmdline(entry->ctx.next_pid);
-		seq_printf(m, " %d:%d:%c --> %d:%d %s\n",
-			   entry->ctx.prev_pid,
-			   entry->ctx.prev_prio,
-			   S,
-			   entry->ctx.next_pid,
-			   entry->ctx.next_prio,
-			   comm);
+		trace_seq_printf(s, " %d:%d:%c --> %d:%d %s\n",
+				 entry->ctx.prev_pid,
+				 entry->ctx.prev_prio,
+				 S,
+				 entry->ctx.next_pid,
+				 entry->ctx.next_prio,
+				 comm);
 		break;
 	default:
-		seq_printf(m, "Unknown type %d\n", entry->type);
+		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
 }
@@ -1026,8 +1090,9 @@ static notrace void sync_time_offset(struct trace_iterator *iter)
 }
 
 static notrace void
-print_trace_fmt(struct seq_file *m, struct trace_iterator *iter)
+print_trace_fmt(struct trace_iterator *iter)
 {
+	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *entry;
 	unsigned long usec_rem;
@@ -1045,29 +1110,29 @@ print_trace_fmt(struct seq_file *m, struct trace_iterator *iter)
 	usec_rem = do_div(t, 1000000ULL);
 	secs = (unsigned long)t;
 
-	seq_printf(m, "%16s-%-5d ", comm, entry->pid);
-	seq_printf(m, "[%02d] ", iter->cpu);
-	seq_printf(m, "%5lu.%06lu: ", secs, usec_rem);
+	trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+	trace_seq_printf(s, "[%02d] ", iter->cpu);
+	trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
 
 	switch (entry->type) {
 	case TRACE_FN:
-		seq_print_ip_sym(m, entry->fn.ip, sym_flags);
+		seq_print_ip_sym(s, entry->fn.ip, sym_flags);
 		if ((sym_flags & TRACE_ITER_PRINT_PARENT) && entry->fn.parent_ip) {
-			seq_printf(m, " <-");
-			seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags);
+			trace_seq_printf(s, " <-");
+			seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
 		}
-		seq_printf(m, "\n");
+		trace_seq_printf(s, "\n");
 		break;
 	case TRACE_CTX:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
-		seq_printf(m, " %d:%d:%c ==> %d:%d\n",
-			   entry->ctx.prev_pid,
-			   entry->ctx.prev_prio,
-			   S,
-			   entry->ctx.next_pid,
-			   entry->ctx.next_prio);
+		trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n",
+				 entry->ctx.prev_pid,
+				 entry->ctx.prev_prio,
+				 S,
+				 entry->ctx.next_pid,
+				 entry->ctx.next_prio);
 		break;
 	}
 }
@@ -1108,9 +1173,10 @@ static int s_show(struct seq_file *m, void *v)
 		}
 	} else {
 		if (iter->iter_flags & TRACE_FILE_LAT_FMT)
-			print_lat_fmt(m, iter, iter->idx, iter->cpu);
+			print_lat_fmt(iter, iter->idx, iter->cpu);
 		else
-			print_trace_fmt(m, iter);
+			print_trace_fmt(iter);
+		trace_print_seq(m, &iter->seq);
 	}
 
 	return 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0ce1274..f5b32ca 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -111,11 +111,17 @@ struct tracer {
 	int			print_max;
 };
 
+struct trace_seq {
+	unsigned char		buffer[PAGE_SIZE];
+	unsigned int		len;
+};
+
 /*
 * Trace iterator - used by printout routines who present trace
 * results to users and which routines might sleep, etc:
 */
 struct trace_iterator {
+	struct trace_seq	seq;
 	struct trace_array	*tr;
 	struct tracer		*trace;
-- cgit v0.10.2

From b3806b4316306dc9c542eff6c23d7d42918f3504 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: user run time file reading

This patch creates a file called trace_pipe in the tracing debug directory. This file is a consumer of the trace buffers. This means that reads of this file consume the entries from the trace buffers so that they will not be read a second time, in contrast to the static buffers latency_trace and trace.
Reading from the trace_pipe will remove the entries from trace and latency_trace too. The advantage that trace_pipe has is that it can record live traces. It will block when there is nothing in the buffer, and read the entries as they are entered. An EOF happens when tracing is disabled (tracing_enabled = 0). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d39f4fa..a40687a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -174,15 +174,20 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { int len = (PAGE_SIZE - 1) - s->len; va_list ap; + int ret; if (!len) return 0; va_start(ap, fmt); - len = vsnprintf(s->buffer + s->len, len, fmt, ap); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); va_end(ap); - s->len += len; + /* If we can't write it all, don't bother writing anything */ + if (ret > len) + return 0; + + s->len += ret; return len; } @@ -193,7 +198,7 @@ trace_seq_puts(struct trace_seq *s, const char *str) int len = strlen(str); if (len > ((PAGE_SIZE - 1) - s->len)) - len = (PAGE_SIZE - 1) - s->len; + return 0; memcpy(s->buffer + s->len, str, len); s->len += len; @@ -615,11 +620,13 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, { struct trace_entry *entry; + spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_FN; entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; + spin_unlock(&data->lock); } notrace void @@ -630,6 +637,7 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct trace_entry *entry; + spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; @@ -638,6 +646,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; + spin_unlock(&data->lock); } enum trace_file_type { @@ -652,7 +661,9 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, struct trace_entry *array; if (iter->next_idx[cpu] >= tr->entries || - iter->next_idx[cpu] >= data->trace_idx) + iter->next_idx[cpu] >= data->trace_idx || + (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx)) return NULL; if (!iter->next_page[cpu]) { @@ -702,33 +713,57 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) return next; } -static void *find_next_entry_inc(struct trace_iterator *iter) +static notrace void +trace_iterator_increment(struct trace_iterator *iter) { - struct trace_entry *next; - int next_cpu = -1; + iter->idx++; + iter->next_idx[iter->cpu]++; + iter->next_page_idx[iter->cpu]++; + if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; - next = find_next_entry(iter, &next_cpu); + iter->next_page_idx[iter->cpu] = 0; + iter->next_page[iter->cpu] = + trace_next_list(data, iter->next_page[iter->cpu]); + } +} - if (next) { - iter->idx++; - iter->next_idx[next_cpu]++; - iter->next_page_idx[next_cpu]++; +static notrace void +trace_consume(struct trace_iterator *iter) +{ + struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + + data->trace_tail_idx++; + if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { + data->trace_tail = trace_next_page(data, data->trace_tail); + data->trace_tail_idx = 0; + } - if (iter->next_page_idx[next_cpu] >= ENTRIES_PER_PAGE) { - struct trace_array_cpu *data = 
iter->tr->data[next_cpu]; + /* Check if we empty it, then reset the index */ + if (data->trace_head == data->trace_tail && + data->trace_head_idx == data->trace_tail_idx) + data->trace_idx = 0; - iter->next_page_idx[next_cpu] = 0; - iter->next_page[next_cpu] = - trace_next_list(data, iter->next_page[next_cpu]); + trace_iterator_increment(iter); +} + +static notrace void * +find_next_entry_inc(struct trace_iterator *iter) +{ + struct trace_entry *next; + int next_cpu = -1; + + next = find_next_entry(iter, &next_cpu); - } - } iter->prev_ent = iter->ent; iter->prev_cpu = iter->cpu; iter->ent = next; iter->cpu = next_cpu; + if (next) + trace_iterator_increment(iter); + return next ? iter : NULL; } @@ -815,7 +850,7 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&trace_types_lock); } -static void +static int seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { #ifdef CONFIG_KALLSYMS @@ -823,11 +858,12 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) kallsyms_lookup(address, NULL, NULL, NULL, str); - trace_seq_printf(s, fmt, str); + return trace_seq_printf(s, fmt, str); #endif + return 1; } -static void +static int seq_print_sym_offset(struct trace_seq *s, const char *fmt, unsigned long address) { @@ -835,8 +871,9 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, char str[KSYM_SYMBOL_LEN]; sprint_symbol(str, address); - trace_seq_printf(s, fmt, str); + return trace_seq_printf(s, fmt, str); #endif + return 1; } #ifndef CONFIG_64BIT @@ -845,21 +882,25 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -static notrace void +static notrace int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { - if (!ip) { - trace_seq_printf(s, "0"); - return; - } + int ret; + + if (!ip) + return trace_seq_printf(s, "0"); if (sym_flags & TRACE_ITER_SYM_OFFSET) - seq_print_sym_offset(s, "%s", ip); + ret = seq_print_sym_offset(s, "%s", ip); else - seq_print_sym_short(s, "%s", ip); + ret = seq_print_sym_short(s, "%s", ip); + + if (!ret) + return 0; if (sym_flags & TRACE_ITER_SYM_ADDR) - trace_seq_printf(s, " <" IP_FMT ">", ip); + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; } static notrace void print_lat_help_header(struct seq_file *m) @@ -1089,7 +1130,7 @@ static notrace void sync_time_offset(struct trace_iterator *iter) array->time_offset += prev_t - t; } -static notrace void +static notrace int print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1100,6 +1141,7 @@ print_trace_fmt(struct trace_iterator *iter) unsigned long secs; char *comm; int S; + int ret; sync_time_offset(iter); entry = iter->ent; @@ -1110,31 +1152,49 @@ print_trace_fmt(struct trace_iterator *iter) usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "[%02d] ", iter->cpu); - trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; switch (entry->type) { case TRACE_FN: - seq_print_ip_sym(s, entry->fn.ip, sym_flags); + ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); + if (!ret) + return 0; if ((sym_flags & TRACE_ITER_PRINT_PARENT) && entry->fn.parent_ip) { - trace_seq_printf(s, " <-"); - seq_print_ip_sym(s, 
entry->fn.parent_ip, sym_flags); + ret = trace_seq_printf(s, " <-"); + if (!ret) + return 0; + ret = seq_print_ip_sym(s, entry->fn.parent_ip, + sym_flags); + if (!ret) + return 0; } - trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\n"); + if (!ret) + return 0; break; case TRACE_CTX: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; - trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, - S, - entry->ctx.next_pid, - entry->ctx.next_prio); + ret = trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio); + if (!ret) + return 0; break; } + return 1; } static int trace_empty(struct trace_iterator *iter) @@ -1145,7 +1205,9 @@ static int trace_empty(struct trace_iterator *iter) for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; - if (head_page(data) && data->trace_idx) + if (head_page(data) && data->trace_idx && + (data->trace_tail != data->trace_head || + data->trace_tail_idx != data->trace_head_idx)) return 0; } return 1; @@ -1645,6 +1707,192 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return cnt; } +static atomic_t tracing_reader; + +static int tracing_open_pipe(struct inode *inode, struct file *filp) +{ + struct trace_iterator *iter; + + if (tracing_disabled) + return -ENODEV; + + /* We only allow for reader of the pipe */ + if (atomic_inc_return(&tracing_reader) != 1) { + atomic_dec(&tracing_reader); + return -EBUSY; + } + + /* create a buffer to store the information to pass to userspace */ + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->tr = &global_trace; + + filp->private_data = iter; + + return 0; +} + +static int tracing_release_pipe(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter = file->private_data; + + kfree(iter); + atomic_dec(&tracing_reader); + + return 0; +} + +/* + * Consumer reader. + */ +static ssize_t +tracing_read_pipe(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_iterator *iter = filp->private_data; + struct trace_array_cpu *data; + static cpumask_t mask; + struct trace_entry *entry; + static int start; + unsigned long flags; + int read = 0; + int cpu; + int len; + int ret; + + /* return any leftover data */ + if (iter->seq.len > start) { + len = iter->seq.len - start; + if (cnt > len) + cnt = len; + ret = copy_to_user(ubuf, iter->seq.buffer + start, cnt); + if (ret) + cnt = -EFAULT; + + start += len; + + return cnt; + } + + trace_seq_reset(&iter->seq); + start = 0; + + while (trace_empty(iter)) { + /* + * This is a make-shift waitqueue. The reason we don't use + * an actual wait queue is because: + * 1) we only ever have one waiter + * 2) the tracing, traces all functions, we don't want + * the overhead of calling wake_up and friends + * (and tracing them too) + * Anyway, this is really very primitive wakeup. + */ + set_current_state(TASK_INTERRUPTIBLE); + iter->tr->waiter = current; + + /* sleep for one second, and try again. */ + schedule_timeout(HZ); + + iter->tr->waiter = NULL; + + if (signal_pending(current)) + return -EINTR; + + /* + * We block until we read something and tracing is disabled. + * We still block if tracing is disabled, but we have never + * read anything. This allows a user to cat this file, and + * then enable tracing. But after we have read something, + * we give an EOF when tracing is again disabled. 
+ * + * iter->pos will be 0 if we haven't read anything. + */ + if (!tracer_enabled && iter->pos) + break; + + continue; + } + + /* stop when tracing is finished */ + if (trace_empty(iter)) + return 0; + + if (cnt >= PAGE_SIZE) + cnt = PAGE_SIZE - 1; + + memset(iter, 0, sizeof(*iter)); + iter->tr = &global_trace; + iter->pos = -1; + + /* + * We need to stop all tracing on all CPUS to read the + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. + */ + + cpus_clear(mask); + local_irq_save(flags); + for_each_possible_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (!head_page(data) || !data->trace_idx) + continue; + + atomic_inc(&data->disabled); + spin_lock(&data->lock); + cpu_set(cpu, mask); + } + + while ((entry = find_next_entry(iter, &cpu))) { + + if (!entry) + break; + + iter->ent = entry; + iter->cpu = cpu; + + ret = print_trace_fmt(iter); + if (!ret) + break; + + trace_consume(iter); + + if (iter->seq.len >= cnt) + break; + + } + + for_each_possible_cpu(cpu) { + data = iter->tr->data[cpu]; + + if (!cpu_isset(cpu, mask)) + continue; + spin_unlock(&data->lock); + atomic_dec(&data->disabled); + } + local_irq_restore(flags); + + /* Now copy what we have to the user */ + read = iter->seq.len; + if (read > cnt) + read = cnt; + + ret = copy_to_user(ubuf, iter->seq.buffer, read); + + if (read < iter->seq.len) + start = read; + else + trace_seq_reset(&iter->seq); + + if (ret) + read = -EFAULT; + + return read; +} + static struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -1663,6 +1911,12 @@ static struct file_operations set_tracer_fops = { .write = tracing_set_trace_write, }; +static struct file_operations tracing_pipe_fops = { + .open = tracing_open_pipe, + .read = tracing_read_pipe, + .release = tracing_release_pipe, +}; + #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t @@ -1763,6 +2017,11 @@ static __init void tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'README' entry\n"); + entry = debugfs_create_file("trace_pipe", 0644, d_tracer, + NULL, &tracing_pipe_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, @@ -1816,6 +2075,7 @@ static int trace_alloc_page(void) /* Now that we successfully allocate a page per CPU, add them */ for_each_possible_cpu(i) { data = global_trace.data[i]; + spin_lock_init(&data->lock); page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -1823,6 +2083,7 @@ static int trace_alloc_page(void) #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; + spin_lock_init(&data->lock); page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f5b32ca..29a7ea5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -55,6 +55,7 @@ struct trace_entry { struct trace_array_cpu { struct list_head trace_pages; atomic_t disabled; + spinlock_t lock; cycle_t time_offset; /* these fields get copied into max-trace: */ @@ -88,6 +89,7 @@ struct trace_array { long ctrl; int cpu; cycle_t time_start; + struct task_struct *waiter; struct trace_array_cpu *data[NR_CPUS]; }; -- cgit v0.10.2 From d4c5a2f5870939d837293de87b41dda0012a4572 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: 
Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: fix locking we can hold all cpu trace buffer locks at once - put each into a separate lock class. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a40687a..b3811ca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1865,11 +1865,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } - for_each_possible_cpu(cpu) { + for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; - - if (!cpu_isset(cpu, mask)) - continue; spin_unlock(&data->lock); atomic_dec(&data->disabled); } @@ -2076,6 +2073,7 @@ static int trace_alloc_page(void) for_each_possible_cpu(i) { data = global_trace.data[i]; spin_lock_init(&data->lock); + lockdep_set_class(&data->lock, &data->lock_key); page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2084,6 +2082,7 @@ static int trace_alloc_page(void) #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; spin_lock_init(&data->lock); + lockdep_set_class(&data->lock, &data->lock_key); page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2203,5 +2202,4 @@ __init static int tracer_alloc_buffers(void) } return ret; } - fs_initcall(tracer_alloc_buffers); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 29a7ea5..b040835 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,6 +56,7 @@ struct trace_array_cpu { struct list_head trace_pages; atomic_t disabled; spinlock_t lock; + struct lock_class_key lock_key; cycle_t time_offset; /* these fields get copied into max-trace: */ -- cgit v0.10.2 From 4bf39a9411a4ce8712954e03a9bd1592ee345919 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: cleanups no code changed. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5e9389f..97c4086 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -756,9 +756,11 @@ ftrace_avail_open(struct inode *inode, struct file *file) ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { struct seq_file *m = file->private_data; + m->private = iter; - } else + } else { kfree(iter); + } return ret; } @@ -770,6 +772,7 @@ int ftrace_avail_release(struct inode *inode, struct file *file) seq_release(inode, file); kfree(iter); + return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b3811ca..4550afd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1245,10 +1245,10 @@ static int s_show(struct seq_file *m, void *v) } static struct seq_operations tracer_seq_ops = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, }; static struct trace_iterator notrace * @@ -1397,10 +1397,10 @@ static int t_show(struct seq_file *m, void *v) } static struct seq_operations show_traces_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, }; static int show_traces_open(struct inode *inode, struct file *file) @@ -1420,17 +1420,17 @@ static int show_traces_open(struct inode *inode, struct file *file) } static struct file_operations tracing_fops = { - .open = tracing_open, - .read = seq_read, - .llseek = seq_lseek, - .release = tracing_release, + .open = tracing_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, }; static struct file_operations tracing_lt_fops = { - .open = tracing_lt_open, - .read = seq_read, - .llseek = seq_lseek, - .release = tracing_release, + .open = tracing_lt_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_release, }; static struct file_operations show_traces_fops = { @@ -1620,8 +1620,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, r = sprintf(buf, "\n"); mutex_unlock(&trace_types_lock); - return simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t @@ -1680,8 +1679,7 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf, *ptr == (unsigned long)-1 ? 
-1 : nsecs_to_usecs(*ptr)); if (r > 64) r = 64; - return simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static ssize_t @@ -1891,27 +1889,27 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } static struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic, - .read = tracing_max_lat_read, - .write = tracing_max_lat_write, + .open = tracing_open_generic, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, }; static struct file_operations tracing_ctrl_fops = { - .open = tracing_open_generic, - .read = tracing_ctrl_read, - .write = tracing_ctrl_write, + .open = tracing_open_generic, + .read = tracing_ctrl_read, + .write = tracing_ctrl_write, }; static struct file_operations set_tracer_fops = { - .open = tracing_open_generic, - .read = tracing_set_trace_read, - .write = tracing_set_trace_write, + .open = tracing_open_generic, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, }; static struct file_operations tracing_pipe_fops = { - .open = tracing_open_pipe, - .read = tracing_read_pipe, - .release = tracing_release_pipe, + .open = tracing_open_pipe, + .read = tracing_read_pipe, + .release = tracing_release_pipe, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -1925,13 +1923,13 @@ tracing_read_long(struct file *filp, char __user *ubuf, int r; r = sprintf(buf, "%ld\n", *p); - return simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static struct file_operations tracing_read_long_fops = { - .open = tracing_open_generic, - .read = tracing_read_long, + .open = tracing_open_generic, + .read = tracing_read_long, }; #endif @@ -2033,7 +2031,7 @@ static __init void tracer_init_debugfs(void) /* dummy trace to disable tracing */ static struct tracer no_tracer __read_mostly = { - .name = "none", + .name = "none", }; static int trace_alloc_page(void) -- cgit v0.10.2 From 750ed1a40783432d0dcb0e6c2e813a12615d7664 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: timestamp syncing, prepare rename and uninline now() to ftrace_now(). Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97c4086..a15e068 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -531,7 +531,7 @@ static int notrace __ftrace_update_code(void *ignore) save_ftrace_enabled = ftrace_enabled; ftrace_enabled = 0; - start = now(raw_smp_processor_id()); + start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; /* No locks needed, the machine is stopped! 
*/ @@ -550,7 +550,7 @@ static int notrace __ftrace_update_code(void *ignore) } - stop = now(raw_smp_processor_id()); + stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4550afd..e3778ab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -42,6 +42,11 @@ ns2usecs(cycle_t nsec) return nsec; } +notrace cycle_t ftrace_now(int cpu) +{ + return cpu_clock(cpu); +} + static atomic_t tracer_counter; static struct trace_array global_trace; @@ -607,7 +612,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) entry->idx = atomic_inc_return(&tracer_counter); entry->preempt_count = pc & 0xff; entry->pid = tsk->pid; - entry->t = now(raw_smp_processor_id()); + entry->t = ftrace_now(raw_smp_processor_id()); entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b040835..30cad67 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -171,10 +171,7 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -static inline notrace cycle_t now(int cpu) -{ - return cpu_clock(cpu); -} +extern notrace cycle_t ftrace_now(int cpu); #ifdef CONFIG_SCHED_TRACER extern void notrace diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5d8ad7a..e5d34b7 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -20,7 +20,7 @@ static notrace void function_reset(struct trace_array *tr) { int cpu; - tr->time_start = now(tr->cpu); + tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) tracing_reset(tr->data[cpu]); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2dfebb6..d2a6e6f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -136,7 +136,7 @@ check_critical_timing(struct trace_array *tr, * as long as possible: */ T0 = data->preempt_timestamp; - T1 = now(cpu); + T1 = ftrace_now(cpu); delta = T1-T0; local_save_flags(flags); @@ -186,7 +186,7 @@ out_unlock: out: data->critical_sequence = max_sequence; - data->preempt_timestamp = now(cpu); + data->preempt_timestamp = ftrace_now(cpu); tracing_reset(data); ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); } @@ -215,7 +215,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); data->critical_sequence = max_sequence; - data->preempt_timestamp = now(cpu); + data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? 
: ip; tracing_reset(data); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 6c92841..8d65667 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -61,7 +61,7 @@ static notrace void sched_switch_reset(struct trace_array *tr) { int cpu; - tr->time_start = now(tr->cpu); + tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) tracing_reset(tr->data[cpu]); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 688df96..b7df825 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -92,7 +92,7 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) * as long as possible: */ T0 = data->preempt_timestamp; - T1 = now(cpu); + T1 = ftrace_now(cpu); delta = T1-T0; if (!report_latency(delta)) @@ -191,7 +191,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, local_save_flags(flags); - tr->data[wakeup_cpu]->preempt_timestamp = now(cpu); + tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags); out_locked: -- cgit v0.10.2 From 53c37c17aafcf50f7c6fddaf01dda8f9d7e31ddf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: fast, scalable, synchronized timestamps implement globally synchronized, fast and scalable time source for tracing. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e3778ab..9a931c7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -42,9 +42,61 @@ ns2usecs(cycle_t nsec) return nsec; } +static const int time_sync_freq_max = 128; +static const cycle_t time_sync_thresh = 100000; + +static DEFINE_PER_CPU(cycle_t, time_offset); +static DEFINE_PER_CPU(cycle_t, prev_cpu_time); +static DEFINE_PER_CPU(int, time_sync_count); +static DEFINE_PER_CPU(int, time_sync_freq); + +/* + * Global lock which we take every now and then to synchronize + * the CPUs time. This method is not warp-safe, but it's good + * enough to synchronize slowly diverging time sources and thus + * it's good enough for tracing: + */ +static DEFINE_SPINLOCK(time_sync_lock); +static cycle_t prev_global_time; + +static notrace cycle_t __ftrace_now_sync(cycles_t time, int cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&time_sync_lock, flags); + + /* + * Update the synchronization frequency: + */ + if (per_cpu(time_sync_freq, cpu) < time_sync_freq_max) + per_cpu(time_sync_freq, cpu) *= 2; + per_cpu(time_sync_count, cpu) = per_cpu(time_sync_freq, cpu); + + if (time < prev_global_time) { + per_cpu(time_offset, cpu) += prev_global_time - time; + time = prev_global_time; + } else { + prev_global_time = time; + } + + spin_unlock_irqrestore(&time_sync_lock, flags); + + return time; +} + notrace cycle_t ftrace_now(int cpu) { - return cpu_clock(cpu); + cycle_t prev_cpu_time, time, delta_time; + + prev_cpu_time = per_cpu(prev_cpu_time, cpu); + time = sched_clock() + per_cpu(time_offset, cpu); + delta_time = time-prev_cpu_time; + + if (unlikely(delta_time > time_sync_thresh || + --per_cpu(time_sync_count, cpu) <= 0)) + time = __ftrace_now_sync(time, cpu); + + return time; } static atomic_t tracer_counter; -- cgit v0.10.2 From cdd31cd2d7a0dcbec2cce3974f7129dd4fc8c879 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: remove-idx-sync remove idx syncing - it's expensive on SMP. 
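With the global index gone, readout order is instead derived by comparing raw per-entry timestamps across the per-CPU buffers. A minimal sketch of that merge step, using simplified stand-in types rather than the real trace structures:

#include <stddef.h>

/*
 * Sketch of timestamp-based merging: among the next pending entry
 * of each per-CPU buffer, pick the one with the smallest timestamp.
 * sketch_entry and the peek[] array are illustrative stand-ins for
 * trace_entry and the real per-CPU iterator state.
 */
struct sketch_entry {
	unsigned long long	t;	/* per-entry timestamp */
};

static struct sketch_entry *
sketch_find_next_entry(struct sketch_entry **peek, int nr_cpus, int *ent_cpu)
{
	struct sketch_entry *next = NULL;
	int cpu, next_cpu = -1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (!peek[cpu])
			continue;
		/* pick the entry with the smallest timestamp: */
		if (!next || peek[cpu]->t < next->t) {
			next = peek[cpu];
			next_cpu = cpu;
		}
	}
	if (ent_cpu)
		*ent_cpu = next_cpu;

	return next;
}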
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9a931c7..ce8ceb8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -99,7 +99,6 @@ notrace cycle_t ftrace_now(int cpu) return time; } -static atomic_t tracer_counter; static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); @@ -661,7 +660,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) pc = preempt_count(); - entry->idx = atomic_inc_return(&tracer_counter); entry->preempt_count = pc & 0xff; entry->pid = tsk->pid; entry->t = ftrace_now(raw_smp_processor_id()); @@ -757,8 +755,10 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) if (!head_page(tr->data[cpu])) continue; ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); - if (ent && - (!next || (long)(next->idx - ent->idx) > 0)) { + /* + * Pick the entry with the smallest timestamp: + */ + if (ent && (!next || ent->t < next->t)) { next = ent; next_cpu = cpu; } @@ -800,8 +800,6 @@ trace_consume(struct trace_iterator *iter) if (data->trace_head == data->trace_tail && data->trace_head_idx == data->trace_tail_idx) data->trace_idx = 0; - - trace_iterator_increment(iter); } static notrace void * @@ -1160,33 +1158,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) } } -static notrace void sync_time_offset(struct trace_iterator *iter) -{ - struct trace_array_cpu *prev_array, *array; - struct trace_entry *prev_entry, *entry; - cycle_t prev_t, t; - - entry = iter->ent; - prev_entry = iter->prev_ent; - if (!prev_entry) - return; - - prev_array = iter->tr->data[iter->prev_cpu]; - array = iter->tr->data[iter->cpu]; - - prev_t = prev_entry->t + prev_array->time_offset; - t = entry->t + array->time_offset; - - /* - * If time goes backwards we increase the offset of - * the current array, to not have observable time warps. 
- * This will quickly synchronize the time offsets of - * multiple CPUs: - */ - if (t < prev_t) - array->time_offset += prev_t - t; -} - static notrace int print_trace_fmt(struct trace_iterator *iter) { @@ -1200,12 +1171,11 @@ print_trace_fmt(struct trace_iterator *iter) int S; int ret; - sync_time_offset(iter); entry = iter->ent; comm = trace_find_cmdline(iter->ent->pid); - t = ns2usecs(entry->t + iter->tr->data[iter->cpu]->time_offset); + t = ns2usecs(entry->t); usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 30cad67..27fa2d0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -38,7 +38,6 @@ struct trace_entry { char preempt_count; int pid; cycle_t t; - unsigned long idx; union { struct ftrace_entry fn; struct ctx_switch_entry ctx; @@ -57,7 +56,6 @@ struct trace_array_cpu { atomic_t disabled; spinlock_t lock; struct lock_class_key lock_key; - cycle_t time_offset; /* these fields get copied into max-trace: */ unsigned trace_head_idx; -- cgit v0.10.2 From 8c523a9d82dbc4f3f7d972df8c0f1eacd83d0d55 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:46 +0200 Subject: ftrace: clean-up-pipe-iteration Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ce8ceb8..42f1926 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -770,12 +770,12 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) return next; } -static notrace void -trace_iterator_increment(struct trace_iterator *iter) +static notrace void trace_iterator_increment(struct trace_iterator *iter) { iter->idx++; iter->next_idx[iter->cpu]++; iter->next_page_idx[iter->cpu]++; + if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { struct trace_array_cpu *data = iter->tr->data[iter->cpu]; @@ -785,8 +785,7 @@ trace_iterator_increment(struct trace_iterator *iter) } } -static notrace void -trace_consume(struct trace_iterator *iter) +static notrace void trace_consume(struct trace_iterator *iter) { struct trace_array_cpu *data = iter->tr->data[iter->cpu]; @@ -802,8 +801,7 @@ trace_consume(struct trace_iterator *iter) data->trace_idx = 0; } -static notrace void * -find_next_entry_inc(struct trace_iterator *iter) +static notrace void *find_next_entry_inc(struct trace_iterator *iter) { struct trace_entry *next; int next_cpu = -1; @@ -1871,14 +1869,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cpu_set(cpu, mask); } - while ((entry = find_next_entry(iter, &cpu))) { - - if (!entry) - break; - - iter->ent = entry; - iter->cpu = cpu; - + while ((entry = find_next_entry_inc(iter)) != NULL) { ret = print_trace_fmt(iter); if (!ret) break; @@ -1887,7 +1878,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (iter->seq.len >= cnt) break; - } for_each_cpu_mask(cpu, mask) { -- cgit v0.10.2 From f9896bf30928922a3913a3810a4ab7908da6cfe7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: add raw output Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42f1926..bebd263 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -153,6 +153,7 @@ enum trace_iterator_flags { TRACE_ITER_SYM_OFFSET = 0x02, TRACE_ITER_SYM_ADDR = 0x04, TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, }; #define TRACE_ITER_SYM_MASK \ @@ -164,6 +165,7 @@ static const char *trace_options[] = { "sym-offset", "sym-addr", "verbose", + "raw", NULL }; @@ 
-1099,7 +1101,7 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static notrace void +static notrace int print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { struct trace_seq *s = &iter->seq; @@ -1154,10 +1156,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } + return 1; } -static notrace int -print_trace_fmt(struct trace_iterator *iter) +static notrace int print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1222,6 +1224,43 @@ print_trace_fmt(struct trace_iterator *iter) return 1; } +static notrace int print_raw_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + int ret; + int S; + + entry = iter->ent; + + ret = trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, entry->t); + if (!ret) + return 0; + + switch (entry->type) { + case TRACE_FN: + ret = trace_seq_printf(s, "%x %x\n", + entry->fn.ip, entry->fn.parent_ip); + if (!ret) + return 0; + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + ret = trace_seq_printf(s, "%d %d %c %d %d\n", + entry->ctx.prev_pid, + entry->ctx.prev_prio, + S, + entry->ctx.next_pid, + entry->ctx.next_prio); + if (!ret) + return 0; + break; + } + return 1; +} + static int trace_empty(struct trace_iterator *iter) { struct trace_array_cpu *data; @@ -1238,6 +1277,17 @@ static int trace_empty(struct trace_iterator *iter) return 1; } +static int print_trace_line(struct trace_iterator *iter) +{ + if (trace_flags & TRACE_ITER_RAW) + return print_raw_fmt(iter); + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + return print_lat_fmt(iter, iter->idx, iter->cpu); + + return print_trace_fmt(iter); +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -1259,10 +1309,7 @@ static int s_show(struct seq_file *m, void *v) print_func_help_header(m); } } else { - if (iter->iter_flags & TRACE_FILE_LAT_FMT) - print_lat_fmt(iter, iter->idx, iter->cpu); - else - print_trace_fmt(iter); + print_trace_line(iter); trace_print_seq(m, &iter->seq); } @@ -1870,7 +1917,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } while ((entry = find_next_entry_inc(iter)) != NULL) { - ret = print_trace_fmt(iter); + ret = print_trace_line(iter); if (!ret) break; -- cgit v0.10.2 From cb0f12aae8d085140d37ada351aa5a8e76c3f9b0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: bin-output Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bebd263..d78cbc4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -154,6 +154,7 @@ enum trace_iterator_flags { TRACE_ITER_SYM_ADDR = 0x04, TRACE_ITER_VERBOSE = 0x08, TRACE_ITER_RAW = 0x10, + TRACE_ITER_BIN = 0x20, }; #define TRACE_ITER_SYM_MASK \ @@ -166,6 +167,7 @@ static const char *trace_options[] = { "sym-addr", "verbose", "raw", + "bin", NULL }; @@ -275,6 +277,18 @@ trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } +static notrace int +trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +{ + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} + static notrace void 
trace_seq_reset(struct trace_seq *s) { @@ -1261,6 +1275,39 @@ static notrace int print_raw_fmt(struct trace_iterator *iter) return 1; } +#define SEQ_PUT_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +static notrace int print_bin_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + + entry = iter->ent; + + SEQ_PUT_FIELD_RET(s, entry->pid); + SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, entry->t); + + switch (entry->type) { + case TRACE_FN: + SEQ_PUT_FIELD_RET(s, entry->fn.ip); + SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_CTX: + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); + break; + } + return 1; +} + static int trace_empty(struct trace_iterator *iter) { struct trace_array_cpu *data; @@ -1279,6 +1326,9 @@ static int trace_empty(struct trace_iterator *iter) static int print_trace_line(struct trace_iterator *iter) { + if (trace_flags & TRACE_ITER_BIN) + return print_bin_fmt(iter); + if (trace_flags & TRACE_ITER_RAW) return print_raw_fmt(iter); -- cgit v0.10.2 From f0a920d5752e1788c0cba2add103076bcc0f7a49 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: add trace_special() for ad-hoc tracing. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d78cbc4..fa13059 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -137,6 +137,7 @@ enum trace_type { TRACE_FN, TRACE_CTX, + TRACE_SPECIAL, __TRACE_LAST_TYPE }; @@ -701,6 +702,22 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, } notrace void +trace_special(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_entry *entry; + + spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_SPECIAL; + entry->special.arg1 = arg1; + entry->special.arg2 = arg2; + entry->special.arg3 = arg3; + spin_unlock(&data->lock); +} + +notrace void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, @@ -1167,6 +1184,12 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) entry->ctx.next_prio, comm); break; + case TRACE_SPECIAL: + trace_seq_printf(s, " %lx %lx %lx\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + break; default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1234,6 +1257,14 @@ static notrace int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_SPECIAL: + ret = trace_seq_printf(s, " %lx %lx %lx\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + if (!ret) + return 0; + break; } return 1; } @@ -1271,6 +1302,14 @@ static notrace int print_raw_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_SPECIAL: + ret = trace_seq_printf(s, " %lx %lx %lx\n", + entry->special.arg1, + entry->special.arg2, + entry->special.arg3); + if (!ret) + return 0; + break; } return 1; } @@ -1304,6 +1343,11 @@ static notrace int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); SEQ_PUT_FIELD_RET(s, 
entry->ctx.next_prio); break; + case TRACE_SPECIAL: + SEQ_PUT_FIELD_RET(s, entry->special.arg1); + SEQ_PUT_FIELD_RET(s, entry->special.arg2); + SEQ_PUT_FIELD_RET(s, entry->special.arg3); + break; } return 1; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 27fa2d0..7bdfef3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -26,6 +26,15 @@ struct ctx_switch_entry { }; /* + * Special (free-form) trace entry: + */ +struct special_entry { + unsigned long arg1; + unsigned long arg2; + unsigned long arg3; +}; + +/* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: * @@ -41,6 +50,7 @@ struct trace_entry { union { struct ftrace_entry fn; struct ctx_switch_entry ctx; + struct special_entry special; }; }; @@ -154,6 +164,11 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags); void tracing_record_cmdline(struct task_struct *tsk); +void trace_special(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3); void tracing_start_function_trace(void); void tracing_stop_function_trace(void); -- cgit v0.10.2 From dcb6308f2b56720599f6b9d5a01c33e67e69bde4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace, locking fix should be an irq-safe lock ... Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa13059..70f94fa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -691,14 +691,15 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; + unsigned long irq_flags; - spin_lock(&data->lock); + spin_lock_irqsave(&data->lock, irq_flags); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_FN; entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; - spin_unlock(&data->lock); + spin_unlock_irqrestore(&data->lock, irq_flags); } notrace void @@ -706,15 +707,16 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_entry *entry; + unsigned long irq_flags; - spin_lock(&data->lock); + spin_lock_irqsave(&data->lock, irq_flags); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); entry->type = TRACE_SPECIAL; entry->special.arg1 = arg1; entry->special.arg2 = arg2; entry->special.arg3 = arg3; - spin_unlock(&data->lock); + spin_unlock_irqrestore(&data->lock, irq_flags); } notrace void @@ -724,8 +726,9 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags) { struct trace_entry *entry; + unsigned long irq_flags; - spin_lock(&data->lock); + spin_lock_irqsave(&data->lock, irq_flags); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; @@ -734,7 +737,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; - spin_unlock(&data->lock); + spin_unlock_irqrestore(&data->lock, irq_flags); } enum trace_file_type { -- cgit v0.10.2 From 088b1e427dbba2af93cb6a7d39258c10ff58dd27 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: pipe fixes Some fixes for better output with the trace pipe. 
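The pipe semantics described earlier (blocking reads, entries consumed as they are printed, EOF once tracing is disabled) can be exercised with an ordinary blocking reader. A hypothetical userspace consumer, assuming debugfs is mounted at /sys/kernel/debug:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Hypothetical trace_pipe consumer: read() blocks until entries
 * arrive and returns 0 (EOF) once tracing is disabled after some
 * data has been read. The mount point is an assumption.
 */
int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY);
	if (fd < 0) {
		perror("open trace_pipe");
		return 1;
	}

	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);	/* entries are consumed */

	close(fd);
	return 0;
}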
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 70f94fa..c56fc5e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -770,11 +770,6 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 
 	array = page_address(page);
 
-	/* Still possible to catch up to the tail */
-	if (iter->next_idx[cpu] && array == data->trace_tail &&
-	    iter->next_page_idx[cpu] == data->trace_tail_idx)
-		return NULL;
-
 	WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
 	return &array[iter->next_page_idx[cpu]];
 }
@@ -1921,7 +1916,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	struct trace_iterator *iter = filp->private_data;
 	struct trace_array_cpu *data;
 	static cpumask_t mask;
-	struct trace_entry *entry;
 	static int start;
 	unsigned long flags;
 	int read = 0;
@@ -2013,10 +2007,15 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		cpu_set(cpu, mask);
 	}
 
-	while ((entry = find_next_entry_inc(iter)) != NULL) {
+	while (find_next_entry_inc(iter) != NULL) {
+		int len = iter->seq.len;
+
 		ret = print_trace_line(iter);
-		if (!ret)
+		if (!ret) {
+			/* don't print partial lines */
+			iter->seq.len = len;
 			break;
+		}
 
 		trace_consume(iter);
-- cgit v0.10.2

From 37ad508419f0fdfda7b378756eb1f35cfd26d96d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace - fix dynamic ftrace memory leak

The ftrace dynamic function update allocates a record to store the instruction pointers that are being modified. If the modified instruction pointer fails to update, then the record is marked as failed and nothing more is done. Worse, if the modification fails but the record ip function is still called, it will allocate a new record and try again. In just a matter of time, this will cause a serious memory leak and crash the system.

This patch plugs this memory leak. When a record fails, it is put back into the pool of records to be used. Now a record may fail over and over again, but the number of allocated records will not increase.
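The reuse scheme threads freed records through their own ip field, so the free list needs no extra storage. A simplified sketch of the idea, with illustrative names in place of the real dyn_ftrace code:

#include <stddef.h>
#include <string.h>

/*
 * Sketch of the record-reuse scheme: a failed record is pushed onto
 * a free list by storing the next pointer in its ip field; the
 * allocator pops from that list before carving out new records.
 * The FL_FREE flag guards against a corrupted list.
 */
#define SKETCH_FL_FREE	(1UL << 0)

struct sketch_rec {
	unsigned long	ip;	/* doubles as the free-list link */
	unsigned long	flags;
};

static struct sketch_rec *sketch_free_records;

static void sketch_free_rec(struct sketch_rec *rec)
{
	rec->ip = (unsigned long)sketch_free_records;
	sketch_free_records = rec;
	rec->flags |= SKETCH_FL_FREE;
}

static struct sketch_rec *sketch_alloc_rec(void)
{
	struct sketch_rec *rec = sketch_free_records;

	if (!rec)
		return NULL;	/* caller falls back to a fresh allocation */
	if (!(rec->flags & SKETCH_FL_FREE))
		return NULL;	/* list corrupted: stop reusing */

	sketch_free_records = (struct sketch_rec *)rec->ip;
	memset(rec, 0, sizeof(*rec));
	return rec;
}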
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index a842d96..61e757b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -43,9 +43,10 @@ extern void mcount(void);
 # define FTRACE_HASHSIZE (1<node, &ftrace_hash[key]);
 }
 
+static notrace void ftrace_free_rec(struct dyn_ftrace *rec)
+{
+	/* no locking, only called from kstop_machine */
+
+	rec->ip = (unsigned long)ftrace_free_records;
+	ftrace_free_records = rec;
+	rec->flags |= FTRACE_FL_FREE;
+}
+
 static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
+	struct dyn_ftrace *rec;
+
+	/* First check for freed records */
+	if (ftrace_free_records) {
+		rec = ftrace_free_records;
+
+		/* todo, disable tracing altogether on this warning */
+		if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
+			WARN_ON_ONCE(1);
+			ftrace_free_records = NULL;
+			return NULL;
+		}
+
+		ftrace_free_records = (void *)rec->ip;
+		memset(rec, 0, sizeof(*rec));
+		return rec;
+	}
+
 	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
 		if (!ftrace_pages->next)
 			return NULL;
@@ -356,8 +385,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 	}
 
 	failed = ftrace_modify_code(ip, old, new);
-	if (failed)
-		rec->flags |= FTRACE_FL_FAILED;
+	if (failed) {
+		unsigned long key;
+		/* It is possible that the function hasn't been converted yet */
+		key = hash_long(ip, FTRACE_HASHBITS);
+		if (!ftrace_ip_in_hash(ip, key)) {
+			rec->flags |= FTRACE_FL_FAILED;
+			ftrace_free_rec(rec);
+		}
+
+	}
 }
 
 static void notrace ftrace_replace_code(int enable)
@@ -407,8 +444,10 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 	call = ftrace_call_replace(ip, MCOUNT_ADDR);
 
 	failed = ftrace_modify_code(ip, call, nop);
-	if (failed)
+	if (failed) {
 		rec->flags |= FTRACE_FL_FAILED;
+		ftrace_free_rec(rec);
+	}
 }
 
 static int notrace __ftrace_modify_code(void *data)
-- cgit v0.10.2

From 4eebcc81a33fbc45e28542b50197ed7b3c486d90 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: disable tracing on failure

Since ftrace touches practically every function, if we detect any anomaly we want to fully disable ftrace. This patch adds code to try to shut down ftrace as much as possible, without doing any more harm, if something is detected to be not quite correct.

This only kills ftrace; this patch does not have checks for other parts of the tracer (irqsoff, wakeup, etc.).

Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 61e757b..4650a31 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -58,6 +58,9 @@ struct dyn_ftrace {
 int ftrace_force_update(void);
 void ftrace_set_filter(unsigned char *buf, int len, int reset);
 
+/* totally disable ftrace - can not re-enable after this */
+void ftrace_kill(void);
+
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
 extern unsigned char *ftrace_nop_replace(void);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8e02aa6..ff42345 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,9 +29,16 @@
 
 #include "trace.h"
 
-int ftrace_enabled;
+/* ftrace_enabled is a method to turn ftrace on or off */
+int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
+/*
+ * ftrace_disabled is set when an anomaly is discovered.
+ * ftrace_disabled is much stronger than ftrace_enabled.
+ */ +static int ftrace_disabled __read_mostly; + static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -230,10 +237,11 @@ static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) if (ftrace_free_records) { rec = ftrace_free_records; - /* todo, disable tracing altogether on this warning */ if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { WARN_ON_ONCE(1); ftrace_free_records = NULL; + ftrace_disabled = 1; + ftrace_enabled = 0; return NULL; } @@ -260,7 +268,7 @@ ftrace_record_ip(unsigned long ip) int resched; int atomic; - if (!ftrace_enabled) + if (!ftrace_enabled || ftrace_disabled) return; resched = need_resched(); @@ -485,6 +493,9 @@ static void notrace ftrace_startup(void) { int command = 0; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); ftraced_suspend++; if (ftraced_suspend == 1) @@ -507,6 +518,9 @@ static void notrace ftrace_shutdown(void) { int command = 0; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); ftraced_suspend--; if (!ftraced_suspend) @@ -529,6 +543,9 @@ static void notrace ftrace_startup_sysctl(void) { int command = FTRACE_ENABLE_MCOUNT; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); /* Force update next time */ saved_ftrace_func = NULL; @@ -544,6 +561,9 @@ static void notrace ftrace_shutdown_sysctl(void) { int command = FTRACE_DISABLE_MCOUNT; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) @@ -600,6 +620,9 @@ static int notrace __ftrace_update_code(void *ignore) static void notrace ftrace_update_code(void) { + if (unlikely(ftrace_disabled)) + return; + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); } @@ -614,6 +637,9 @@ static int notrace ftraced(void *ignore) /* check once a second */ schedule_timeout(HZ); + if (unlikely(ftrace_disabled)) + continue; + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { @@ -628,6 +654,7 @@ static int notrace ftraced(void *ignore) ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, usecs, usecs != 1 ? 
"s" : ""); + ftrace_disabled = 1; WARN_ON_ONCE(1); } ftraced_trigger = 0; @@ -785,6 +812,9 @@ ftrace_avail_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; int ret; + if (unlikely(ftrace_disabled)) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; @@ -843,6 +873,9 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; int ret = 0; + if (unlikely(ftrace_disabled)) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; @@ -1063,6 +1096,9 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, */ notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) { + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftrace_filter_lock); if (reset) ftrace_filter_reset(); @@ -1133,7 +1169,7 @@ int ftrace_force_update(void) DECLARE_WAITQUEUE(wait, current); int ret = 0; - if (!ftraced_task) + if (unlikely(ftrace_disabled)) return -ENODEV; mutex_lock(&ftraced_lock); @@ -1142,6 +1178,11 @@ int ftrace_force_update(void) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&ftraced_waiters, &wait); + if (unlikely(!ftraced_task)) { + ret = -ENODEV; + goto out; + } + do { mutex_unlock(&ftraced_lock); wake_up_process(ftraced_task); @@ -1154,6 +1195,7 @@ int ftrace_force_update(void) set_current_state(TASK_INTERRUPTIBLE); } while (last_counter == ftraced_iteration_counter); + out: mutex_unlock(&ftraced_lock); remove_wait_queue(&ftraced_waiters, &wait); set_current_state(TASK_RUNNING); @@ -1161,6 +1203,22 @@ int ftrace_force_update(void) return ret; } +static void ftrace_force_shutdown(void) +{ + struct task_struct *task; + int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; + + mutex_lock(&ftraced_lock); + task = ftraced_task; + ftraced_task = NULL; + ftraced_suspend = -1; + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); + + if (task) + kthread_stop(task); +} + static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1194,21 +1252,29 @@ static int __init notrace ftrace_dynamic_init(void) stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) - return addr; + if (addr) { + ret = (int)addr; + goto failed; + } ret = ftrace_dyn_table_alloc(); if (ret) - return ret; + goto failed; p = kthread_run(ftraced, NULL, "ftraced"); - if (IS_ERR(p)) - return -1; + if (IS_ERR(p)) { + ret = -1; + goto failed; + } last_ftrace_enabled = ftrace_enabled = 1; ftraced_task = p; return 0; + + failed: + ftrace_disabled = 1; + return ret; } core_initcall(ftrace_dynamic_init); @@ -1217,9 +1283,31 @@ core_initcall(ftrace_dynamic_init); # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_force_shutdown() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** + * ftrace_kill - totally shutdown ftrace + * + * This is a safety measure. If something was detected that seems + * wrong, calling this function will keep ftrace from doing + * any more modifications, and updates. + * used when something went wrong. 
+ */ +void ftrace_kill(void) +{ + mutex_lock(&ftrace_sysctl_lock); + ftrace_disabled = 1; + ftrace_enabled = 0; + + clear_ftrace_function(); + mutex_unlock(&ftrace_sysctl_lock); + + /* Try to totally disable ftrace */ + ftrace_force_shutdown(); +} + +/** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. * @@ -1234,6 +1322,9 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret; + if (unlikely(ftrace_disabled)) + return -1; + mutex_lock(&ftrace_sysctl_lock); ret = __register_ftrace_function(ops); ftrace_startup(); @@ -1267,6 +1358,9 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, { int ret; + if (unlikely(ftrace_disabled)) + return -ENODEV; + mutex_lock(&ftrace_sysctl_lock); ret = proc_dointvec(table, write, file, buffer, lenp, ppos); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a6f1ed7..85715b8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -248,6 +248,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + return ret; } #endif /* CONFIG_FTRACE */ -- cgit v0.10.2 From 26994ead1fc8cced63f17e9848edc1771036664e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: enabled tracing by default This patch is the correct way to have tracing enabled by default. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c56fc5e..3dc6eac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -107,7 +107,7 @@ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_data); -static int tracer_enabled; +static int tracer_enabled = 1; static unsigned long trace_nr_entries = 16384UL; static struct tracer *trace_types __read_mostly; @@ -2268,6 +2268,8 @@ __init static int tracer_alloc_buffers(void) int ret = -ENOMEM; int i; + global_trace.ctrl = tracer_enabled; + /* Allocate the first page for all buffers */ for_each_possible_cpu(i) { data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); -- cgit v0.10.2 From 0fd9e0dac9026df09986a4b201518ae015814aef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: use cpu clock again Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3dc6eac..d74c039 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -42,61 +42,9 @@ ns2usecs(cycle_t nsec) return nsec; } -static const int time_sync_freq_max = 128; -static const cycle_t time_sync_thresh = 100000; - -static DEFINE_PER_CPU(cycle_t, time_offset); -static DEFINE_PER_CPU(cycle_t, prev_cpu_time); -static DEFINE_PER_CPU(int, time_sync_count); -static DEFINE_PER_CPU(int, time_sync_freq); - -/* - * Global lock which we take every now and then to synchronize - * the CPUs time. 
This method is not warp-safe, but it's good - * enough to synchronize slowly diverging time sources and thus - * it's good enough for tracing: - */ -static DEFINE_SPINLOCK(time_sync_lock); -static cycle_t prev_global_time; - -static notrace cycle_t __ftrace_now_sync(cycles_t time, int cpu) -{ - unsigned long flags; - - spin_lock_irqsave(&time_sync_lock, flags); - - /* - * Update the synchronization frequency: - */ - if (per_cpu(time_sync_freq, cpu) < time_sync_freq_max) - per_cpu(time_sync_freq, cpu) *= 2; - per_cpu(time_sync_count, cpu) = per_cpu(time_sync_freq, cpu); - - if (time < prev_global_time) { - per_cpu(time_offset, cpu) += prev_global_time - time; - time = prev_global_time; - } else { - prev_global_time = time; - } - - spin_unlock_irqrestore(&time_sync_lock, flags); - - return time; -} - notrace cycle_t ftrace_now(int cpu) { - cycle_t prev_cpu_time, time, delta_time; - - prev_cpu_time = per_cpu(prev_cpu_time, cpu); - time = sched_clock() + per_cpu(time_offset, cpu); - delta_time = time-prev_cpu_time; - - if (unlikely(delta_time > time_sync_thresh || - --per_cpu(time_sync_count, cpu) <= 0)) - time = __ftrace_now_sync(time, cpu); - - return time; + return cpu_clock(cpu); } static struct trace_array global_trace; -- cgit v0.10.2 From 2e0f57618529a2739a5e1570e6c445c9c966b595 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: build fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d74c039..71b25b7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -432,47 +432,6 @@ notrace void tracing_reset(struct trace_array_cpu *data) data->trace_tail_idx = 0; } -#ifdef CONFIG_FTRACE -static notrace void -function_trace_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - - if (unlikely(!tracer_enabled)) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - ftrace(tr, data, ip, parent_ip, flags); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, -}; -#endif - -notrace void tracing_start_function_trace(void) -{ - register_ftrace_function(&trace_ops); -} - -notrace void tracing_stop_function_trace(void) -{ - unregister_ftrace_function(&trace_ops); -} - #define SAVED_CMDLINES 128 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; @@ -635,8 +594,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) } notrace void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) +__ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; unsigned long irq_flags; @@ -651,6 +610,14 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, } notrace void +ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) +{ + if (likely(!atomic_read(&data->disabled))) + __ftrace(tr, data, ip, parent_ip, flags); +} + +notrace void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, unsigned 
long arg2, unsigned long arg3) { @@ -688,6 +655,47 @@ tracing_sched_switch_trace(struct trace_array *tr, spin_unlock_irqrestore(&data->lock, irq_flags); } +#ifdef CONFIG_FTRACE +static notrace void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; + +notrace void tracing_start_function_trace(void) +{ + register_ftrace_function(&trace_ops); +} + +notrace void tracing_stop_function_trace(void) +{ + unregister_ftrace_function(&trace_ops); +} +#endif + enum trace_file_type { TRACE_FILE_LAT_FMT = 1, }; @@ -722,7 +730,7 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, return &array[iter->next_page_idx[cpu]]; } -static struct notrace trace_entry * +static struct trace_entry * notrace find_next_entry(struct trace_iterator *iter, int *ent_cpu) { struct trace_array *tr = iter->tr; @@ -1866,6 +1874,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, static cpumask_t mask; static int start; unsigned long flags; + int ftrace_save; int read = 0; int cpu; int len; @@ -1944,6 +1953,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cpus_clear(mask); local_irq_save(flags); + ftrace_save = ftrace_enabled; + ftrace_enabled = 0; + smp_wmb(); for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; @@ -1951,10 +1963,14 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, continue; atomic_inc(&data->disabled); - spin_lock(&data->lock); cpu_set(cpu, mask); } + for_each_cpu_mask(cpu, mask) { + data = iter->tr->data[cpu]; + spin_lock(&data->lock); + } + while (find_next_entry_inc(iter) != NULL) { int len = iter->seq.len; @@ -1974,8 +1990,13 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; spin_unlock(&data->lock); + } + + for_each_cpu_mask(cpu, mask) { + data = iter->tr->data[cpu]; atomic_dec(&data->disabled); } + ftrace_enabled = ftrace_save; local_irq_restore(flags); /* Now copy what we have to the user */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e5d34b7..69a0eb0 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -9,10 +9,10 @@ * Copyright (C) 2004-2006 Ingo Molnar * Copyright (C) 2004 William Lee Irwin III */ -#include #include #include #include +#include #include "trace.h" -- cgit v0.10.2 From 5e3ca0ec76fce92daa4eed0d02de9c79b1fe3920 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: introduce the "hex" output method Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 71b25b7..6974b21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -103,7 +103,8 @@ enum trace_iterator_flags { TRACE_ITER_SYM_ADDR = 0x04, TRACE_ITER_VERBOSE = 0x08, TRACE_ITER_RAW = 0x10, - TRACE_ITER_BIN = 0x20, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, }; #define TRACE_ITER_SYM_MASK \ @@ -116,6 +117,7 @@ static const char *trace_options[] = { "sym-addr", "verbose", "raw", + 
"hex", "bin", NULL }; @@ -238,6 +240,47 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) return len; } +#define HEX_CHARS 17 + +static notrace int +trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +{ + unsigned char hex[HEX_CHARS]; + unsigned char *data; + unsigned char byte; + int i, j; + + BUG_ON(len >= HEX_CHARS); + + data = mem; + +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < len; i++) { +#else + for (i = len-1, j = 0; i >= 0; i--) { +#endif + byte = data[i]; + + hex[j] = byte & 0x0f; + if (hex[j] >= 10) + hex[j] += 'a' - 10; + else + hex[j] += '0'; + j++; + + hex[j] = byte >> 4; + if (hex[j] >= 10) + hex[j] += 'a' - 10; + else + hex[j] += '0'; + j++; + } + hex[j] = ' '; + j++; + + return trace_seq_putmem(s, hex, j); +} + static notrace void trace_seq_reset(struct trace_seq *s) { @@ -1274,6 +1317,51 @@ do { \ return 0; \ } while (0) +#define SEQ_PUT_HEX_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +static notrace int print_hex_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned char newline = '\n'; + struct trace_entry *entry; + int S; + + entry = iter->ent; + + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); + SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); + SEQ_PUT_HEX_FIELD_RET(s, entry->t); + + switch (entry->type) { + case TRACE_FN: + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_CTX: + S = entry->ctx.prev_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.prev_state] : 'X'; + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, S); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); + SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); + SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + break; + case TRACE_SPECIAL: + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); + SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); + break; + } + SEQ_PUT_FIELD_RET(s, newline); + + return 1; +} + static notrace int print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1327,6 +1415,9 @@ static int print_trace_line(struct trace_iterator *iter) if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); + if (trace_flags & TRACE_ITER_HEX) + return print_hex_fmt(iter); + if (trace_flags & TRACE_ITER_RAW) return print_raw_fmt(iter); -- cgit v0.10.2 From 2577046740fe6d77864128c6187c11125c2449ea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: build fix no need to backmerge, only affects ftrace-enabled kernels. 
(which is not the default) Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6974b21..958c4d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1965,7 +1965,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, static cpumask_t mask; static int start; unsigned long flags; +#ifdef CONFIG_FTRACE int ftrace_save; +#endif int read = 0; int cpu; int len; @@ -2044,8 +2046,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cpus_clear(mask); local_irq_save(flags); +#ifdef CONFIG_FTRACE ftrace_save = ftrace_enabled; ftrace_enabled = 0; +#endif smp_wmb(); for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; @@ -2087,7 +2091,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, data = iter->tr->data[cpu]; atomic_dec(&data->disabled); } +#ifdef CONFIG_FTRACE ftrace_enabled = ftrace_save; +#endif local_irq_restore(flags); /* Now copy what we have to the user */ -- cgit v0.10.2 From aeaee8a2c9cb4489f166ca0e39c568e8254faaa6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: build fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4650a31..08fbef1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,9 +58,6 @@ struct dyn_ftrace { int ftrace_force_update(void); void ftrace_set_filter(unsigned char *buf, int len, int reset); -/* totally disable ftrace - can not re-enable after this */ -void ftrace_kill(void); - /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); @@ -74,10 +71,13 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); #else -# define ftrace_force_update() ({ 0; }) -# define ftrace_set_filter(buf, len, reset) do { } while (0) +# define ftrace_force_update() ({ 0; }) +# define ftrace_set_filter(buf, len, reset) do { } while (0) #endif +/* totally disable ftrace - can not re-enable after this */ +void ftrace_kill(void); + static inline void tracer_disable(void) { #ifdef CONFIG_FTRACE -- cgit v0.10.2 From 2a2cc8f7c4d0dfd75720867f7dc58d24f075edfc Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Pedersen Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: allow the event pipe to be polled Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 958c4d7..d041578 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -63,6 +64,7 @@ static struct tracer *current_trace __read_mostly; static int max_tracer_type_len; static DEFINE_MUTEX(trace_types_lock); +static DECLARE_WAIT_QUEUE_HEAD (trace_wait); #define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) @@ -105,6 +107,7 @@ enum trace_iterator_flags { TRACE_ITER_RAW = 0x10, TRACE_ITER_HEX = 0x20, TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, }; #define TRACE_ITER_SYM_MASK \ @@ -119,6 +122,7 @@ static const char *trace_options[] = { "raw", "hex", "bin", + "block", NULL }; @@ -650,6 +654,9 @@ __ftrace(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); + + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up (&trace_wait); } notrace void @@ -675,6 +682,9 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, 
entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); + + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up (&trace_wait); } notrace void @@ -696,6 +706,9 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; spin_unlock_irqrestore(&data->lock, irq_flags); + + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up (&trace_wait); } #ifdef CONFIG_FTRACE @@ -1765,7 +1778,6 @@ static struct file_operations tracing_readme_fops = { .read = tracing_readme_read, }; - static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1953,6 +1965,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) return 0; } +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + if (trace_flags & TRACE_ITER_BLOCK) { + /* + * Always select as readable when in blocking mode + */ + return POLLIN | POLLRDNORM; + } + else { + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + poll_wait(filp, &trace_wait, poll_table); + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + + return 0; + } +} + /* * Consumer reader. */ @@ -1991,6 +2025,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, start = 0; while (trace_empty(iter)) { + if (!(trace_flags & TRACE_ITER_BLOCK)) + return -EWOULDBLOCK; /* * This is a make-shift waitqueue. The reason we don't use * an actual wait queue is because: @@ -2134,6 +2170,7 @@ static struct file_operations set_tracer_fops = { static struct file_operations tracing_pipe_fops = { .open = tracing_open_pipe, + .poll = tracing_poll_pipe, .read = tracing_read_pipe, .release = tracing_release_pipe, }; -- cgit v0.10.2 From 6fb44b717c10ecf37beaaebd312f3afa93fed714 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: add trace_function api for other tracers to use A new check was added in the ftrace function that won't trace if the CPU trace buffer is disabled. Unfortunately, other tracers used ftrace() to write to the buffer after they disabled it. The new disable check turns these calls into nops. This patch turns the __ftrace variant that is called without the check into a new API for the other tracers to use, called "trace_function". The other tracers use this interface instead when the trace CPU buffer is already disabled.
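To make the new split concrete, here is a minimal user-space sketch of the calling convention (the names mirror the kernel's, but the per-CPU buffer and its atomic disabled counter are reduced to a plain struct, so this is an illustration of the pattern, not the kernel code):

#include <stdio.h>

struct trace_cpu_buf {
        int disabled;   /* > 0: no new entries may be recorded */
};

/* unchecked variant: for callers that have already claimed the buffer */
static void trace_function(struct trace_cpu_buf *buf, const char *what)
{
        printf("recorded (disabled=%d): %s\n", buf->disabled, what);
}

/* checked variant: silently a nop while the buffer is disabled */
static void ftrace(struct trace_cpu_buf *buf, const char *what)
{
        if (buf->disabled == 0)
                trace_function(buf, what);
}

int main(void)
{
        struct trace_cpu_buf buf = { .disabled = 0 };

        ftrace(&buf, "normal event");             /* recorded */

        buf.disabled++;                           /* a tracer claims the buffer */
        ftrace(&buf, "event from the tracer");    /* nop: would be lost */
        trace_function(&buf, "latency snapshot"); /* still recorded */
        buf.disabled--;

        return 0;
}

The asymmetry is the whole point: once a tracer has bumped the disabled counter to claim the buffer, only the unchecked trace_function() can still record, which is why the latency tracers below are converted to call it directly.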
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d041578..9022c35 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -641,8 +641,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) } notrace void -__ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) +trace_function(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; unsigned long irq_flags; @@ -664,7 +664,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { if (likely(!atomic_read(&data->disabled))) - __ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); } notrace void @@ -730,7 +730,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - __ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); atomic_dec(&data->disabled); local_irq_restore(flags); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7bdfef3..faf9f67 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -169,6 +169,11 @@ void trace_special(struct trace_array *tr, unsigned long arg1, unsigned long arg2, unsigned long arg3); +void trace_function(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags); void tracing_start_function_trace(void); void tracing_stop_function_trace(void); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d2a6e6f..3269f4f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); atomic_dec(&data->disabled); } @@ -150,7 +150,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); latency = nsecs_to_usecs(delta); @@ -188,7 +188,7 @@ out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); tracing_reset(data); - ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); } static inline void notrace @@ -221,7 +221,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); __get_cpu_var(tracing_cpu) = 1; @@ -254,7 +254,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - ftrace(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags); check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index b7df825..3549e41 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -85,7 +85,7 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - ftrace(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); /* * usecs conversion is slow so we try to delay the conversion @@ -192,7 +192,8 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, local_save_flags(flags); tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); - ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags); + trace_function(tr, tr->data[wakeup_cpu], + CALLER_ADDR1, CALLER_ADDR2, flags); out_locked: spin_unlock(&wakeup_lock); -- cgit v0.10.2 From 9ff9cdb2d3b0971f89e899b3420aadd91bddc215 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:50 +0200 Subject: ftrace: cleanups clean up recent code. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ff42345..425b1fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -197,8 +197,8 @@ static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; -static inline int -notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key) +static inline int notrace +ftrace_ip_in_hash(unsigned long ip, unsigned long key) { struct dyn_ftrace *p; struct hlist_node *t; @@ -1249,6 +1249,7 @@ static int __init notrace ftrace_dynamic_init(void) int ret; addr = (unsigned long)ftrace_record_ip; + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); /* ftrace_dyn_arch_init places the return code in addr */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 3269f4f..2ac0d09 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -309,7 +309,7 @@ void trace_softirqs_off(unsigned long ip) { } -inline void print_irqtrace_events(struct task_struct *curr) +inline notrace void print_irqtrace_events(struct task_struct *curr) { } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 85715b8..546307d 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,7 +3,7 @@ #include #include -static inline int trace_valid_entry(struct trace_entry *entry) +static notrace inline int trace_valid_entry(struct trace_entry *entry) { switch (entry->type) { case TRACE_FN: -- cgit v0.10.2 From caf8cdebfb6c1cff50ea8077f1a07c2333d6d1fd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:50 +0200 Subject: ftrace: remove address of function names PowerPC is very fragile when it comes to the use of function names and function addresses. ftrace needs to either use all function addresses or all function names (i.e. my_func as opposed to &my_func). This patch chooses to use the names and not the addresses, and makes ftrace consistent.
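As a sanity check of what the two spellings mean in plain C, the following host-side sketch (an illustration, not kernel code) can be compiled on a given toolchain; a function designator decays to a pointer, so my_func and &my_func should compare equal, and the patch is about picking one spelling consistently rather than changing any value:

#include <stdio.h>

static void my_func(void)
{
}

int main(void)
{
        /* both spellings should yield the same address in standard C */
        unsigned long by_name = (unsigned long)(my_func);
        unsigned long by_addr = (unsigned long)(&my_func);

        printf("my_func: %#lx  &my_func: %#lx  (%s)\n",
               by_name, by_addr, by_name == by_addr ? "equal" : "differ");
        return 0;
}

The diff below applies exactly this choice to the FTRACE_ADDR and MCOUNT_ADDR macros.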
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 425b1fe..57350cb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -326,8 +326,8 @@ ftrace_record_ip(unsigned long ip) preempt_enable_notrace(); } -#define FTRACE_ADDR ((long)(&ftrace_caller)) -#define MCOUNT_ADDR ((long)(&mcount)) +#define FTRACE_ADDR ((long)(ftrace_caller)) +#define MCOUNT_ADDR ((long)(mcount)) static void notrace __ftrace_replace_code(struct dyn_ftrace *rec, -- cgit v0.10.2 From 3594136ad67a54d77bcb2547e70011754a2f91d5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:50 +0200 Subject: ftrace: do not profile lib/string.o Most archs define the string and memory compare functions in assembly. Some do not. But these functions may be used by some archs at early boot up. Since most archs get these functions from assembly, where they are not traced anyway, nothing is gained by tracing the C versions either. This patch removes -pg from the CFLAGS for lib/string.o. This prevents use of the string functions in either the vdso or early bootup from crashing the system. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/lib/Makefile b/lib/Makefile index 74b0cfb..6ca9e6e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -8,6 +8,14 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o ratelimit.o +ifdef CONFIG_FTRACE +# Do not profile string.o, since it may be used in early boot or vdso +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(if $(subst string,,$(basename $(notdir $@))), \ + $(ORIG_CFLAGS), \ + $(subst -pg,,$(ORIG_CFLAGS))) +endif + lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o -- cgit v0.10.2 From b53dde9d34f2df396540988ebc65c33400f57b04 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: disable -pg for the tracer itself Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 3fec653..c25a6cd 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,3 +1,11 @@ + +# Do not instrument the tracer itself: + +ifdef CONFIG_FTRACE +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +endif + obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o -- cgit v0.10.2 From e309b41dd65aa953f86765eeeecc941d8e1e8b8f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: remove notrace Now that we have a kbuild method for notrace, there is no need to pollute the C code with the annotations.
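A rough sketch of the two opt-out mechanisms side by side; the attribute is the usual gcc spelling behind the kernel's notrace (an assumption here, since the definition is not shown in this patch), and the Makefile line is quoted from the patches above:

/* per function: tell gcc not to instrument this one function */
#define notrace __attribute__((no_instrument_function))

static notrace void tracer_internal(void)
{
        /* no mcount call is emitted here, even when built with -pg */
}

/*
 * Per file: build the whole object without -pg, as in
 *     KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
 * from the Makefile patches above; then nothing in the file calls
 * mcount and no per-function annotation is needed, which is what
 * lets this patch delete the annotations wholesale.
 */
static void also_untraced_if_built_without_pg(void)
{
}

int main(void)
{
        tracer_internal();
        also_untraced_if_built_without_pg();
        return 0;
}

Compiling this sketch with gcc -pg -S should show an mcount call in main() and in the second function (while -pg is still on for the file), but never in tracer_internal().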
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57350cb..281d97a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -53,7 +53,7 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; /* mcount is defined per arch in assembly */ EXPORT_SYMBOL(mcount); -notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -79,7 +79,7 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static int notrace __register_ftrace_function(struct ftrace_ops *ops) +static int __register_ftrace_function(struct ftrace_ops *ops) { /* Should never be called by interrupts */ spin_lock(&ftrace_lock); @@ -110,7 +110,7 @@ static int notrace __register_ftrace_function(struct ftrace_ops *ops) return 0; } -static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) +static int __unregister_ftrace_function(struct ftrace_ops *ops) { struct ftrace_ops **p; int ret = 0; @@ -197,7 +197,7 @@ static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; -static inline int notrace +static inline int ftrace_ip_in_hash(unsigned long ip, unsigned long key) { struct dyn_ftrace *p; @@ -214,13 +214,13 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key) return found; } -static inline void notrace +static inline void ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) { hlist_add_head(&node->node, &ftrace_hash[key]); } -static notrace void ftrace_free_rec(struct dyn_ftrace *rec) +static void ftrace_free_rec(struct dyn_ftrace *rec) { /* no locking, only called from kstop_machine */ @@ -229,7 +229,7 @@ static notrace void ftrace_free_rec(struct dyn_ftrace *rec) rec->flags |= FTRACE_FL_FREE; } -static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) +static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -259,7 +259,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) return &ftrace_pages->records[ftrace_pages->index++]; } -static void notrace +static void ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; @@ -329,7 +329,7 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(ftrace_caller)) #define MCOUNT_ADDR ((long)(mcount)) -static void notrace +static void __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char *old, unsigned char *new, int enable) { @@ -405,7 +405,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } -static void notrace ftrace_replace_code(int enable) +static void ftrace_replace_code(int enable) { unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; @@ -430,7 +430,7 @@ static void notrace ftrace_replace_code(int enable) } } -static notrace void ftrace_shutdown_replenish(void) +static void ftrace_shutdown_replenish(void) { if (ftrace_pages->next) return; @@ -439,7 +439,7 @@ static notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static notrace void +static void ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; @@ -458,7 +458,7 @@ ftrace_code_disable(struct dyn_ftrace *rec) } } -static int notrace __ftrace_modify_code(void *data) +static int __ftrace_modify_code(void *data) { unsigned long addr; int *command = data; @@ -482,14 +482,14 @@ static int notrace __ftrace_modify_code(void *data) return 0; } -static void notrace 
ftrace_run_update_code(int command) +static void ftrace_run_update_code(int command) { stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } static ftrace_func_t saved_ftrace_func; -static void notrace ftrace_startup(void) +static void ftrace_startup(void) { int command = 0; @@ -514,7 +514,7 @@ static void notrace ftrace_startup(void) mutex_unlock(&ftraced_lock); } -static void notrace ftrace_shutdown(void) +static void ftrace_shutdown(void) { int command = 0; @@ -539,7 +539,7 @@ static void notrace ftrace_shutdown(void) mutex_unlock(&ftraced_lock); } -static void notrace ftrace_startup_sysctl(void) +static void ftrace_startup_sysctl(void) { int command = FTRACE_ENABLE_MCOUNT; @@ -557,7 +557,7 @@ static void notrace ftrace_startup_sysctl(void) mutex_unlock(&ftraced_lock); } -static void notrace ftrace_shutdown_sysctl(void) +static void ftrace_shutdown_sysctl(void) { int command = FTRACE_DISABLE_MCOUNT; @@ -577,7 +577,7 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int notrace __ftrace_update_code(void *ignore) +static int __ftrace_update_code(void *ignore) { struct dyn_ftrace *p; struct hlist_head head; @@ -618,7 +618,7 @@ static int notrace __ftrace_update_code(void *ignore) return 0; } -static void notrace ftrace_update_code(void) +static void ftrace_update_code(void) { if (unlikely(ftrace_disabled)) return; @@ -626,7 +626,7 @@ static void notrace ftrace_update_code(void) stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); } -static int notrace ftraced(void *ignore) +static int ftraced(void *ignore) { unsigned long usecs; @@ -733,7 +733,7 @@ struct ftrace_iterator { unsigned filtered; }; -static void notrace * +static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; @@ -806,7 +806,7 @@ static struct seq_operations show_ftrace_seq_ops = { .show = t_show, }; -static int notrace +static int ftrace_avail_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; @@ -845,7 +845,7 @@ int ftrace_avail_release(struct inode *inode, struct file *file) return 0; } -static void notrace ftrace_filter_reset(void) +static void ftrace_filter_reset(void) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -867,7 +867,7 @@ static void notrace ftrace_filter_reset(void) preempt_enable(); } -static int notrace +static int ftrace_filter_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; @@ -903,7 +903,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) return ret; } -static ssize_t notrace +static ssize_t ftrace_filter_read(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -913,7 +913,7 @@ ftrace_filter_read(struct file *file, char __user *ubuf, return -EPERM; } -static loff_t notrace +static loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -933,7 +933,7 @@ enum { MATCH_END_ONLY, }; -static void notrace +static void ftrace_match(unsigned char *buff, int len) { char str[KSYM_SYMBOL_LEN]; @@ -1002,7 +1002,7 @@ ftrace_match(unsigned char *buff, int len) preempt_enable(); } -static ssize_t notrace +static ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -1094,7 +1094,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. 
*/ -notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) +void ftrace_set_filter(unsigned char *buf, int len, int reset) { if (unlikely(ftrace_disabled)) return; @@ -1107,7 +1107,7 @@ notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) mutex_unlock(&ftrace_filter_lock); } -static int notrace +static int ftrace_filter_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; @@ -1242,7 +1242,7 @@ static __init int ftrace_init_debugfs(void) fs_initcall(ftrace_init_debugfs); -static int __init notrace ftrace_dynamic_init(void) +static int __init ftrace_dynamic_init(void) { struct task_struct *p; unsigned long addr; @@ -1352,7 +1352,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) return ret; } -notrace int +int ftrace_enable_sysctl(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9022c35..f589805 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -35,7 +35,7 @@ unsigned long __read_mostly tracing_thresh; static int tracing_disabled = 1; -static long notrace +static long ns2usecs(cycle_t nsec) { nsec += 500; @@ -43,7 +43,7 @@ ns2usecs(cycle_t nsec) return nsec; } -notrace cycle_t ftrace_now(int cpu) +cycle_t ftrace_now(int cpu) { return cpu_clock(cpu); } @@ -135,7 +135,7 @@ static DEFINE_SPINLOCK(ftrace_max_lock); * structure. (this way the maximum trace is permanently saved, * for later retrieval via /debugfs/tracing/latency_trace) */ -static notrace void +static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; @@ -184,7 +184,7 @@ void *head_page(struct trace_array_cpu *data) return page_address(page); } -static notrace int +static int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { int len = (PAGE_SIZE - 1) - s->len; @@ -207,7 +207,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } -static notrace int +static int trace_seq_puts(struct trace_seq *s, const char *str) { int len = strlen(str); @@ -221,7 +221,7 @@ trace_seq_puts(struct trace_seq *s, const char *str) return len; } -static notrace int +static int trace_seq_putc(struct trace_seq *s, unsigned char c) { if (s->len >= (PAGE_SIZE - 1)) @@ -232,7 +232,7 @@ trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } -static notrace int +static int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) { if (len > ((PAGE_SIZE - 1) - s->len)) @@ -246,7 +246,7 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) #define HEX_CHARS 17 -static notrace int +static int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) { unsigned char hex[HEX_CHARS]; @@ -285,13 +285,13 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) return trace_seq_putmem(s, hex, j); } -static notrace void +static void trace_seq_reset(struct trace_seq *s) { s->len = 0; } -static notrace void +static void trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? 
PAGE_SIZE - 1 : s->len; @@ -302,7 +302,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s) trace_seq_reset(s); } -notrace static void +static void flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) { struct list_head flip_pages; @@ -323,7 +323,7 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) check_pages(tr2); } -notrace void +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data; @@ -348,7 +348,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) * @tsk - task with the latency * @cpu - the cpu of the buffer to copy. */ -notrace void +void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; @@ -471,7 +471,7 @@ void unregister_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); } -notrace void tracing_reset(struct trace_array_cpu *data) +void tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; data->trace_head = data->trace_tail = head_page(data); @@ -494,9 +494,9 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } -notrace void trace_stop_cmdline_recording(void); +void trace_stop_cmdline_recording(void); -static notrace void trace_save_cmdline(struct task_struct *tsk) +static void trace_save_cmdline(struct task_struct *tsk) { unsigned map; unsigned idx; @@ -531,7 +531,7 @@ static notrace void trace_save_cmdline(struct task_struct *tsk) spin_unlock(&trace_cmdline_lock); } -static notrace char *trace_find_cmdline(int pid) +static char *trace_find_cmdline(int pid) { char *cmdline = "<...>"; unsigned map; @@ -552,7 +552,7 @@ static notrace char *trace_find_cmdline(int pid) return cmdline; } -notrace void tracing_record_cmdline(struct task_struct *tsk) +void tracing_record_cmdline(struct task_struct *tsk) { if (atomic_read(&trace_record_cmdline_disabled)) return; @@ -560,7 +560,7 @@ notrace void tracing_record_cmdline(struct task_struct *tsk) trace_save_cmdline(tsk); } -static inline notrace struct list_head * +static inline struct list_head * trace_next_list(struct trace_array_cpu *data, struct list_head *next) { /* @@ -574,7 +574,7 @@ trace_next_list(struct trace_array_cpu *data, struct list_head *next) return next; } -static inline notrace void * +static inline void * trace_next_page(struct trace_array_cpu *data, void *addr) { struct list_head *next; @@ -588,7 +588,7 @@ trace_next_page(struct trace_array_cpu *data, void *addr) return page_address(page); } -static inline notrace struct trace_entry * +static inline struct trace_entry * tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) { unsigned long idx, idx_next; @@ -623,7 +623,7 @@ tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) return entry; } -static inline notrace void +static inline void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) { struct task_struct *tsk = current; @@ -640,7 +640,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); } -notrace void +void trace_function(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { @@ -659,7 +659,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, wake_up (&trace_wait); } -notrace void +void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags) { @@ -667,7 +667,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } -notrace void +void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, unsigned long arg2, unsigned long arg3) { @@ -687,7 +687,7 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, wake_up (&trace_wait); } -notrace void +void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, @@ -712,7 +712,7 @@ tracing_sched_switch_trace(struct trace_array *tr, } #ifdef CONFIG_FTRACE -static notrace void +static void function_trace_call(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = &global_trace; @@ -741,12 +741,12 @@ static struct ftrace_ops trace_ops __read_mostly = .func = function_trace_call, }; -notrace void tracing_start_function_trace(void) +void tracing_start_function_trace(void) { register_ftrace_function(&trace_ops); } -notrace void tracing_stop_function_trace(void) +void tracing_stop_function_trace(void) { unregister_ftrace_function(&trace_ops); } @@ -786,7 +786,7 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, return &array[iter->next_page_idx[cpu]]; } -static struct trace_entry * notrace +static struct trace_entry * find_next_entry(struct trace_iterator *iter, int *ent_cpu) { struct trace_array *tr = iter->tr; @@ -813,7 +813,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) return next; } -static notrace void trace_iterator_increment(struct trace_iterator *iter) +static void trace_iterator_increment(struct trace_iterator *iter) { iter->idx++; iter->next_idx[iter->cpu]++; @@ -828,7 +828,7 @@ static notrace void trace_iterator_increment(struct trace_iterator *iter) } } -static notrace void trace_consume(struct trace_iterator *iter) +static void trace_consume(struct trace_iterator *iter) { struct trace_array_cpu *data = iter->tr->data[iter->cpu]; @@ -844,7 +844,7 @@ static notrace void trace_consume(struct trace_iterator *iter) data->trace_idx = 0; } -static notrace void *find_next_entry_inc(struct trace_iterator *iter) +static void *find_next_entry_inc(struct trace_iterator *iter) { struct trace_entry *next; int next_cpu = -1; @@ -863,7 +863,7 @@ static notrace void *find_next_entry_inc(struct trace_iterator *iter) return next ? 
iter : NULL; } -static notrace void *s_next(struct seq_file *m, void *v, loff_t *pos) +static void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; void *last_ent = iter->ent; @@ -978,7 +978,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -static notrace int +static int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { int ret; @@ -999,7 +999,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static notrace void print_lat_help_header(struct seq_file *m) +static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); seq_puts(m, "# / _-----=> irqs-off \n"); @@ -1012,14 +1012,14 @@ static notrace void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static notrace void print_func_help_header(struct seq_file *m) +static void print_func_help_header(struct seq_file *m) { seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static notrace void +static void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1090,7 +1090,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n"); } -static notrace void +static void lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; @@ -1127,7 +1127,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) unsigned long preempt_mark_thresh = 100; -static notrace void +static void lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, unsigned long rel_usecs) { @@ -1142,7 +1142,7 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static notrace int +static int print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { struct trace_seq *s = &iter->seq; @@ -1206,7 +1206,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) return 1; } -static notrace int print_trace_fmt(struct trace_iterator *iter) +static int print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1279,7 +1279,7 @@ static notrace int print_trace_fmt(struct trace_iterator *iter) return 1; } -static notrace int print_raw_fmt(struct trace_iterator *iter) +static int print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; @@ -1336,7 +1336,7 @@ do { \ return 0; \ } while (0) -static notrace int print_hex_fmt(struct trace_iterator *iter) +static int print_hex_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; @@ -1375,7 +1375,7 @@ static notrace int print_hex_fmt(struct trace_iterator *iter) return 1; } -static notrace int print_bin_fmt(struct trace_iterator *iter) +static int print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; @@ -1475,7 +1475,7 @@ static struct seq_operations tracer_seq_ops = { .show = s_show, }; -static struct trace_iterator notrace * +static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, int *ret) { struct trace_iterator *iter; @@ -1572,7 +1572,7 @@ static int tracing_lt_open(struct inode *inode, struct file *file) } 
-static notrace void * +static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct tracer *t = m->private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index faf9f67..2b7352b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -150,7 +150,7 @@ struct trace_iterator { long idx; }; -void notrace tracing_reset(struct trace_array_cpu *data); +void tracing_reset(struct trace_array_cpu *data); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void ftrace(struct trace_array *tr, @@ -189,10 +189,10 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -extern notrace cycle_t ftrace_now(int cpu); +extern cycle_t ftrace_now(int cpu); #ifdef CONFIG_SCHED_TRACER -extern void notrace +extern void wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); #else static inline void diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 69a0eb0..4165d34 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,7 +16,7 @@ #include "trace.h" -static notrace void function_reset(struct trace_array *tr) +static void function_reset(struct trace_array *tr) { int cpu; @@ -26,30 +26,30 @@ static notrace void function_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } -static notrace void start_function_trace(struct trace_array *tr) +static void start_function_trace(struct trace_array *tr) { function_reset(tr); tracing_start_function_trace(); } -static notrace void stop_function_trace(struct trace_array *tr) +static void stop_function_trace(struct trace_array *tr) { tracing_stop_function_trace(); } -static notrace void function_trace_init(struct trace_array *tr) +static void function_trace_init(struct trace_array *tr) { if (tr->ctrl) start_function_trace(tr); } -static notrace void function_trace_reset(struct trace_array *tr) +static void function_trace_reset(struct trace_array *tr) { if (tr->ctrl) stop_function_trace(tr); } -static notrace void function_trace_ctrl_update(struct trace_array *tr) +static void function_trace_ctrl_update(struct trace_array *tr) { if (tr->ctrl) start_function_trace(tr); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2ac0d09..7a4dc01 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -33,7 +33,7 @@ enum { static int trace_type __read_mostly; #ifdef CONFIG_PREEMPT_TRACER -static inline int notrace +static inline int preempt_trace(void) { return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); @@ -43,7 +43,7 @@ preempt_trace(void) #endif #ifdef CONFIG_IRQSOFF_TRACER -static inline int notrace +static inline int irq_trace(void) { return ((trace_type & TRACER_IRQS_OFF) && @@ -67,7 +67,7 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; /* * irqsoff uses its own tracer function to keep the overhead down: */ -static void notrace +static void irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = irqsoff_trace; @@ -109,7 +109,7 @@ static struct ftrace_ops trace_ops __read_mostly = /* * Should this new latency be reported/recorded? 
*/ -static int notrace report_latency(cycle_t delta) +static int report_latency(cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) @@ -121,7 +121,7 @@ static int notrace report_latency(cycle_t delta) return 1; } -static void notrace +static void check_critical_timing(struct trace_array *tr, struct trace_array_cpu *data, unsigned long parent_ip, @@ -191,7 +191,7 @@ out: trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); } -static inline void notrace +static inline void start_critical_timing(unsigned long ip, unsigned long parent_ip) { int cpu; @@ -228,7 +228,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_dec(&data->disabled); } -static inline void notrace +static inline void stop_critical_timing(unsigned long ip, unsigned long parent_ip) { int cpu; @@ -261,13 +261,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) } /* start and stop critical timings used to for stoppage (in idle) */ -void notrace start_critical_timings(void) +void start_critical_timings(void) { if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -void notrace stop_critical_timings(void) +void stop_critical_timings(void) { if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); @@ -275,13 +275,13 @@ void notrace stop_critical_timings(void) #ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING -void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +void time_hardirqs_on(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } -void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +void time_hardirqs_off(unsigned long a0, unsigned long a1) { if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); @@ -309,35 +309,35 @@ void trace_softirqs_off(unsigned long ip) { } -inline notrace void print_irqtrace_events(struct task_struct *curr) +inline void print_irqtrace_events(struct task_struct *curr) { } /* * We are only interested in hardirq on/off events: */ -void notrace trace_hardirqs_on(void) +void trace_hardirqs_on(void) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_on); -void notrace trace_hardirqs_off(void) +void trace_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_off); -void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +void trace_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); @@ -348,12 +348,12 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER -void notrace trace_preempt_on(unsigned long a0, unsigned long a1) +void trace_preempt_on(unsigned long a0, unsigned long a1) { stop_critical_timing(a0, a1); } -void notrace trace_preempt_off(unsigned long a0, unsigned long a1) +void trace_preempt_off(unsigned long a0, unsigned long a1) { start_critical_timing(a0, a1); } @@ -395,14 +395,14 @@ static void irqsoff_tracer_ctrl_update(struct trace_array *tr) stop_irqsoff_tracer(tr); } -static void notrace irqsoff_tracer_open(struct 
trace_iterator *iter) +static void irqsoff_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ if (iter->tr->ctrl) stop_irqsoff_tracer(iter->tr); } -static void notrace irqsoff_tracer_close(struct trace_iterator *iter) +static void irqsoff_tracer_close(struct trace_iterator *iter) { if (iter->tr->ctrl) start_irqsoff_tracer(iter->tr); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8d65667..b738eac 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -17,7 +17,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; -static void notrace +static void ctx_switch_func(struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = ctx_trace; @@ -57,7 +57,7 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) wakeup_sched_switch(prev, next); } -static notrace void sched_switch_reset(struct trace_array *tr) +static void sched_switch_reset(struct trace_array *tr) { int cpu; @@ -67,18 +67,18 @@ static notrace void sched_switch_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } -static notrace void start_sched_trace(struct trace_array *tr) +static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); tracer_enabled = 1; } -static notrace void stop_sched_trace(struct trace_array *tr) +static void stop_sched_trace(struct trace_array *tr) { tracer_enabled = 0; } -static notrace void sched_switch_trace_init(struct trace_array *tr) +static void sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; @@ -86,7 +86,7 @@ static notrace void sched_switch_trace_init(struct trace_array *tr) start_sched_trace(tr); } -static notrace void sched_switch_trace_reset(struct trace_array *tr) +static void sched_switch_trace_reset(struct trace_array *tr) { if (tr->ctrl) stop_sched_trace(tr); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3549e41..662679c 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -27,12 +27,12 @@ static unsigned wakeup_prio = -1; static DEFINE_SPINLOCK(wakeup_lock); -static void notrace __wakeup_reset(struct trace_array *tr); +static void __wakeup_reset(struct trace_array *tr); /* * Should this new latency be reported/recorded? 
*/ -static int notrace report_latency(cycle_t delta) +static int report_latency(cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) @@ -44,7 +44,7 @@ static int notrace report_latency(cycle_t delta) return 1; } -void notrace +void wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) { unsigned long latency = 0, t0 = 0, t1 = 0; @@ -126,7 +126,7 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -static void notrace __wakeup_reset(struct trace_array *tr) +static void __wakeup_reset(struct trace_array *tr) { struct trace_array_cpu *data; int cpu; @@ -147,7 +147,7 @@ static void notrace __wakeup_reset(struct trace_array *tr) wakeup_task = NULL; } -static void notrace wakeup_reset(struct trace_array *tr) +static void wakeup_reset(struct trace_array *tr) { unsigned long flags; @@ -156,7 +156,7 @@ static void notrace wakeup_reset(struct trace_array *tr) spin_unlock_irqrestore(&wakeup_lock, flags); } -static notrace void +static void wakeup_check_start(struct trace_array *tr, struct task_struct *p, struct task_struct *curr) { @@ -201,7 +201,7 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -notrace void +void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) @@ -210,7 +210,7 @@ ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) wakeup_check_start(wakeup_trace, wakee, curr); } -notrace void +void ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) @@ -219,7 +219,7 @@ ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) wakeup_check_start(wakeup_trace, wakee, curr); } -static notrace void start_wakeup_tracer(struct trace_array *tr) +static void start_wakeup_tracer(struct trace_array *tr) { wakeup_reset(tr); @@ -237,12 +237,12 @@ static notrace void start_wakeup_tracer(struct trace_array *tr) return; } -static notrace void stop_wakeup_tracer(struct trace_array *tr) +static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; } -static notrace void wakeup_tracer_init(struct trace_array *tr) +static void wakeup_tracer_init(struct trace_array *tr) { wakeup_trace = tr; @@ -250,7 +250,7 @@ static notrace void wakeup_tracer_init(struct trace_array *tr) start_wakeup_tracer(tr); } -static notrace void wakeup_tracer_reset(struct trace_array *tr) +static void wakeup_tracer_reset(struct trace_array *tr) { if (tr->ctrl) { stop_wakeup_tracer(tr); @@ -267,14 +267,14 @@ static void wakeup_tracer_ctrl_update(struct trace_array *tr) stop_wakeup_tracer(tr); } -static void notrace wakeup_tracer_open(struct trace_iterator *iter) +static void wakeup_tracer_open(struct trace_iterator *iter) { /* stop the trace while dumping */ if (iter->tr->ctrl) stop_wakeup_tracer(iter->tr); } -static void notrace wakeup_tracer_close(struct trace_iterator *iter) +static void wakeup_tracer_close(struct trace_iterator *iter) { /* forget about any processes we were recording */ if (iter->tr->ctrl) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 546307d..85715b8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,7 +3,7 @@ #include #include -static notrace inline int trace_valid_entry(struct trace_entry *entry) +static inline int trace_valid_entry(struct trace_entry *entry) { switch (entry->type) { case TRACE_FN: -- cgit v0.10.2 From 57422797dc009fc83766bcf230d29dbe6e08e21e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: add 
wakeup events to sched tracer Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f589805..192c135 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -57,7 +57,7 @@ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_data); static int tracer_enabled = 1; -static unsigned long trace_nr_entries = 16384UL; +static unsigned long trace_nr_entries = 65536UL; static struct tracer *trace_types __read_mostly; static struct tracer *current_trace __read_mostly; @@ -87,6 +87,7 @@ enum trace_type { TRACE_FN, TRACE_CTX, + TRACE_WAKE, TRACE_SPECIAL, __TRACE_LAST_TYPE @@ -711,6 +712,30 @@ tracing_sched_switch_trace(struct trace_array *tr, wake_up (&trace_wait); } +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, struct task_struct *curr, + unsigned long flags) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_WAKE; + entry->ctx.prev_pid = curr->pid; + entry->ctx.prev_prio = curr->prio; + entry->ctx.prev_state = curr->state; + entry->ctx.next_pid = wakee->pid; + entry->ctx.next_prio = wakee->prio; + spin_unlock_irqrestore(&data->lock, irq_flags); + + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up(&trace_wait); +} + #ifdef CONFIG_FTRACE static void function_trace_call(unsigned long ip, unsigned long parent_ip) @@ -1183,13 +1208,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_seq_puts(s, ")\n"); break; case TRACE_CTX: + case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; comm = trace_find_cmdline(entry->ctx.next_pid); - trace_seq_printf(s, " %d:%d:%c --> %d:%d %s\n", + trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d %s\n", entry->ctx.prev_pid, entry->ctx.prev_prio, - S, + S, entry->type == TRACE_CTX ? "==>" : " +", entry->ctx.next_pid, entry->ctx.next_prio, comm); @@ -1256,12 +1282,14 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_CTX: + case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; - ret = trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n", + ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, + entry->type == TRACE_CTX ? "==>" : " +", entry->ctx.next_pid, entry->ctx.next_prio); if (!ret) @@ -1301,8 +1329,11 @@ static int print_raw_fmt(struct trace_iterator *iter) return 0; break; case TRACE_CTX: + case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; + if (entry->type == TRACE_WAKE) + S = '+'; ret = trace_seq_printf(s, "%d %d %c %d %d\n", entry->ctx.prev_pid, entry->ctx.prev_prio, @@ -1355,8 +1386,11 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); break; case TRACE_CTX: + case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? 
state_to_char[entry->ctx.prev_state] : 'X'; + if (entry->type == TRACE_WAKE) + S = '+'; SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2b7352b..90e0ba0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -164,6 +164,12 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags); void tracing_record_cmdline(struct task_struct *tsk); + +void tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *cur, + unsigned long flags); void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, @@ -194,11 +200,17 @@ extern cycle_t ftrace_now(int cpu); #ifdef CONFIG_SCHED_TRACER extern void wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); +extern void +wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr); #else static inline void wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) { } +static inline void +wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +{ +} #endif #ifdef CONFIG_CONTEXT_SWITCH_TRACER diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index b738eac..8b1cf1a 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -41,6 +41,29 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) local_irq_restore(flags); } +static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) +{ + struct trace_array *tr = ctx_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (!tracer_enabled) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) { tracing_record_cmdline(prev); @@ -57,6 +80,19 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) wakeup_sched_switch(prev, next); } +void +ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +{ + tracing_record_cmdline(curr); + + wakeup_func(wakee, curr); + + /* + * Chain to the wakeup tracer (this is a NOP if disabled): + */ + wakeup_sched_wakeup(wakee, curr); +} + static void sched_switch_reset(struct trace_array *tr) { int cpu; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 662679c..87fa7b2 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -202,7 +202,7 @@ out: } void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) return; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 85715b8..39dd452 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -8,6 +8,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) switch (entry->type) { case TRACE_FN: case TRACE_CTX: + case TRACE_WAKE: return 1; } return 0; -- cgit v0.10.2 From 86387f7ee5d3273ff4859e2c64ce656639b6ca65 Mon Sep 17 00:00:00 2001 From: Ingo 
Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: add stack tracing Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 08fbef1..0d3714e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -93,6 +93,7 @@ static inline void tracer_disable(void) # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) #else # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) # define CALLER_ADDR1 0UL @@ -100,6 +101,7 @@ static inline void tracer_disable(void) # define CALLER_ADDR3 0UL # define CALLER_ADDR4 0UL # define CALLER_ADDR5 0UL +# define CALLER_ADDR6 0UL #endif #ifdef CONFIG_IRQSOFF_TRACER diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3f73a17..eb1988e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,6 +10,7 @@ config TRACER_MAX_TRACE config TRACING bool select DEBUG_FS + select STACKTRACE config FTRACE bool "Kernel Function Tracer" diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 192c135..b4b1b4f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -28,6 +28,8 @@ #include #include +#include + #include "trace.h" unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; @@ -88,6 +90,7 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, + TRACE_STACK, TRACE_SPECIAL, __TRACE_LAST_TYPE @@ -109,6 +112,7 @@ enum trace_iterator_flags { TRACE_ITER_HEX = 0x20, TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, }; #define TRACE_ITER_SYM_MASK \ @@ -124,10 +128,11 @@ static const char *trace_options[] = { "hex", "bin", "block", + "stacktrace", NULL }; -static unsigned trace_flags; +static unsigned trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_STACKTRACE; static DEFINE_SPINLOCK(ftrace_max_lock); @@ -657,7 +662,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); } void @@ -685,13 +690,39 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); +} + +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip) +{ + struct trace_entry *entry; + struct stack_trace trace; + + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_STACK; + + memset(&entry->stack, 0, sizeof(entry->stack)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->stack.caller; + + save_stack_trace(&trace); } void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, - struct task_struct *prev, struct task_struct *next, + struct task_struct *prev, + struct task_struct *next, unsigned long flags) { struct trace_entry *entry; @@ -706,16 +737,18 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; + __trace_stack(tr, data, flags, 4); 
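/*
 * (A minimal sketch, not part of the patch, of the interface used by
 * __trace_stack() above: save_stack_trace() from <linux/stacktrace.h>
 * fills the entries array from the innermost frame outward, and the
 * skip count (4 here, 5 in the wakeup path below) trims the tracer's
 * own call frames. The helper and array names below are hypothetical;
 * CONFIG_STACKTRACE, selected by this patch, is assumed.)
 */
#include <linux/stacktrace.h>

#define SKETCH_STACK_ENTRIES 5

static unsigned long sketch_callers[SKETCH_STACK_ENTRIES];

static void sketch_capture(void)
{
	struct stack_trace trace = {
		.nr_entries	= 0,
		.max_entries	= SKETCH_STACK_ENTRIES,
		.entries	= sketch_callers,
		.skip		= 1,	/* drop sketch_capture() itself */
	};

	save_stack_trace(&trace);
}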
spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); } void tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, - struct task_struct *wakee, struct task_struct *curr, + struct task_struct *wakee, + struct task_struct *curr, unsigned long flags) { struct trace_entry *entry; @@ -730,6 +763,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.prev_state = curr->state; entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = wakee->prio; + __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) @@ -1179,6 +1213,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) unsigned long rel_usecs; char *comm; int S; + int i; if (!next_entry) next_entry = entry; @@ -1197,8 +1232,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); } else { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); + if (entry->type != TRACE_STACK) { + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); + } } switch (entry->type) { case TRACE_FN: @@ -1226,6 +1263,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) entry->special.arg2, entry->special.arg3); break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) + trace_seq_puts(s, " <= "); + seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + } + trace_seq_puts(s, "\n"); + break; default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1241,8 +1286,9 @@ static int print_trace_fmt(struct trace_iterator *iter) unsigned long long t; unsigned long secs; char *comm; - int S; int ret; + int S; + int i; entry = iter->ent; @@ -1252,15 +1298,17 @@ static int print_trace_fmt(struct trace_iterator *iter) usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); - if (!ret) - return 0; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return 0; + if (entry->type != TRACE_STACK) { + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; + } switch (entry->type) { case TRACE_FN: @@ -1303,6 +1351,22 @@ static int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + ret = trace_seq_puts(s, " <= "); + if (!ret) + return 0; + } + ret = seq_print_ip_sym(s, entry->stack.caller[i], + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_puts(s, "\n"); + if (!ret) + return 0; + break; } return 1; } @@ -1344,6 +1408,7 @@ static int print_raw_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: + case TRACE_STACK: ret = trace_seq_printf(s, " %lx %lx %lx\n", entry->special.arg1, entry->special.arg2, @@ -1399,6 +1464,7 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); break; case TRACE_SPECIAL: + case TRACE_STACK: SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); @@ 
-1433,6 +1499,7 @@ static int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); break; case TRACE_SPECIAL: + case TRACE_STACK: SEQ_PUT_FIELD_RET(s, entry->special.arg1); SEQ_PUT_FIELD_RET(s, entry->special.arg2); SEQ_PUT_FIELD_RET(s, entry->special.arg3); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90e0ba0..387bdcf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -35,6 +35,16 @@ struct special_entry { }; /* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 5 + +struct stack_entry { + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + +/* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: * @@ -51,6 +61,7 @@ struct trace_entry { struct ftrace_entry fn; struct ctx_switch_entry ctx; struct special_entry special; + struct stack_entry stack; }; }; -- cgit v0.10.2 From 8ac0fca4ccb355ce50471d7aa3f10f5900b28b95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: sched tracer fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e26f1f..05744f9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2130,17 +2130,11 @@ ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) #ifdef CONFIG_SCHED_TRACER extern void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr); -extern void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr); #else static inline void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) { } -static inline void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) -{ -} #endif extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); diff --git a/kernel/sched.c b/kernel/sched.c index 328494e..53ab117 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,7 +2613,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_new_task(p, rq->curr); + ftrace_wake_up_task(p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 87fa7b2..2a01242 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -201,20 +201,13 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) return; - wakeup_check_start(wakeup_trace, wakee, curr); -} - -void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) -{ - if (likely(!tracer_enabled)) - return; + tracing_record_cmdline(curr); + tracing_record_cmdline(wakee); wakeup_check_start(wakeup_trace, wakee, curr); } -- cgit v0.10.2 From 4c1f4d4f0175129934d5dbc19a39296430937a05 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: make nostacktrace the default Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b4b1b4f..0e4b711 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -132,7 +132,7 @@ static const char *trace_options[] = { NULL }; -static unsigned trace_flags = 
TRACE_ITER_PRINT_PARENT | TRACE_ITER_STACKTRACE; +static unsigned trace_flags = TRACE_ITER_PRINT_PARENT; static DEFINE_SPINLOCK(ftrace_max_lock); -- cgit v0.10.2 From 4e65551905fb0300ae7e667cbaa41ee2e3f29a13 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: sched tracer, trace full rbtree Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 05744f9..652d380 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2119,20 +2119,34 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) #ifdef CONFIG_CONTEXT_SWITCH_TRACER extern void -ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next); +ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next); +extern void +ftrace_wake_up_task(void *rq, struct task_struct *wakee, + struct task_struct *curr); +extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); #else static inline void -ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) +{ +} +static inline void +sched_trace_special(unsigned long p1, unsigned long p2, unsigned long p3) +{ +} +static inline void +ftrace_wake_up_task(void *rq, struct task_struct *wakee, + struct task_struct *curr) +{ +} +static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { } -#endif - -#ifdef CONFIG_SCHED_TRACER -extern void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr); -#else static inline void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) { } #endif diff --git a/kernel/sched.c b/kernel/sched.c index 53ab117..b9208a0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2394,6 +2394,35 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +#ifdef CONFIG_CONTEXT_SWITCH_TRACER + +void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) +{ + struct sched_entity *se; + struct task_struct *p; + struct rb_node *curr; + struct rq *rq = __rq; + + curr = first_fair(&rq->cfs); + if (!curr) + return; + + while (curr) { + se = rb_entry(curr, struct sched_entity, run_node); + if (!entity_is_task(se)) + continue; + + p = task_of(se); + + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); + + curr = rb_next(curr); + } +} + +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2468,7 +2497,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ - ftrace_wake_up_task(p, rq->curr); + ftrace_wake_up_task(rq, p, rq->curr); schedstat_inc(p, se.nr_wakeups); if (sync) schedstat_inc(p, se.nr_wakeups_sync); @@ -2613,7 +2642,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_task(p, rq->curr); + ftrace_wake_up_task(rq, p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2786,7 +2815,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - ftrace_ctx_switch(prev, next); + 
ftrace_ctx_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e4b711..65173b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -66,7 +66,18 @@ static struct tracer *current_trace __read_mostly; static int max_tracer_type_len; static DEFINE_MUTEX(trace_types_lock); -static DECLARE_WAIT_QUEUE_HEAD (trace_wait); +static DECLARE_WAIT_QUEUE_HEAD(trace_wait); + +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; + +/* + * FIXME: where should this be called? + */ +void trace_wake_up(void) +{ + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up(&trace_wait); +} #define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) @@ -103,18 +114,6 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x08, }; -enum trace_iterator_flags { - TRACE_ITER_PRINT_PARENT = 0x01, - TRACE_ITER_SYM_OFFSET = 0x02, - TRACE_ITER_SYM_ADDR = 0x04, - TRACE_ITER_VERBOSE = 0x08, - TRACE_ITER_RAW = 0x10, - TRACE_ITER_HEX = 0x20, - TRACE_ITER_BIN = 0x40, - TRACE_ITER_BLOCK = 0x80, - TRACE_ITER_STACKTRACE = 0x100, -}; - #define TRACE_ITER_SYM_MASK \ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) @@ -132,8 +131,6 @@ static const char *trace_options[] = { NULL }; -static unsigned trace_flags = TRACE_ITER_PRINT_PARENT; - static DEFINE_SPINLOCK(ftrace_max_lock); /* @@ -660,9 +657,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } void @@ -673,10 +667,14 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } +#ifdef CONFIG_CONTEXT_SWITCH_TRACER + void -trace_special(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) { + struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; struct trace_entry *entry; unsigned long irq_flags; @@ -688,11 +686,10 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } +#endif + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, @@ -739,9 +736,6 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_prio = next->prio; __trace_stack(tr, data, flags, 4); spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } void @@ -765,9 +759,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } #ifdef CONFIG_FTRACE @@ -1258,7 +1249,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) comm); break; case TRACE_SPECIAL: - trace_seq_printf(s, " %lx %lx %lx\n", + trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1344,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: - ret = trace_seq_printf(s, " %lx %lx %lx\n", + ret = trace_seq_printf(s, " %ld %ld %ld\n", 
entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1409,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter) break; case TRACE_SPECIAL: case TRACE_STACK: - ret = trace_seq_printf(s, " %lx %lx %lx\n", + ret = trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 387bdcf..75e2374 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -274,4 +274,18 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); +extern unsigned long trace_flags; + +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, +}; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8b1cf1a..12658b3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -18,7 +18,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; static void -ctx_switch_func(struct task_struct *prev, struct task_struct *next) +ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = ctx_trace; struct trace_array_cpu *data; @@ -34,14 +34,17 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(disabled == 1)) { tracing_sched_switch_trace(tr, data, prev, next, flags); + ftrace_all_fair_tasks(__rq, tr, data); + } atomic_dec(&data->disabled); local_irq_restore(flags); } -static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) +static void +wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) { struct trace_array *tr = ctx_trace; struct trace_array_cpu *data; @@ -57,14 +60,18 @@ static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(disabled == 1)) { tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + ftrace_all_fair_tasks(__rq, tr, data); + } atomic_dec(&data->disabled); local_irq_restore(flags); } -void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +void +ftrace_ctx_switch(void *__rq, struct task_struct *prev, + struct task_struct *next) { tracing_record_cmdline(prev); @@ -72,7 +79,7 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. 
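 * (An inference from the sched.h declarations in this patch rather
 *  than from the original comment: the runqueue is threaded through
 *  as an opaque void *__rq so the hooks can be declared without
 *  exposing struct rq, which is private to kernel/sched.c.)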
*/ - ctx_switch_func(prev, next); + ctx_switch_func(__rq, prev, next); /* * Chain to the wakeup tracer (this is a NOP if disabled): @@ -81,11 +88,12 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) } void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +ftrace_wake_up_task(void *__rq, struct task_struct *wakee, + struct task_struct *curr) { tracing_record_cmdline(curr); - wakeup_func(wakee, curr); + wakeup_func(__rq, wakee, curr); /* * Chain to the wakeup tracer (this is a NOP if disabled): -- cgit v0.10.2 From 24cd5d111e8c713e62cda7ca1d01232402e3d3c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: trace curr/next tasks Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index b9208a0..673b588 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2398,8 +2398,8 @@ static int sched_balance_self(int cpu, int flag) void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { - struct sched_entity *se; struct task_struct *p; + struct sched_entity *se; struct rb_node *curr; struct rq *rq = __rq; @@ -2407,6 +2407,17 @@ void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) if (!curr) return; + if (rq->cfs.curr) { + p = task_of(rq->cfs.curr); + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); + } + if (rq->cfs.next) { + p = task_of(rq->cfs.next); + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); + } + while (curr) { se = rb_entry(curr, struct sched_entity, run_node); if (!entity_is_task(se)) -- cgit v0.10.2 From 017730c11241e26577673eb9d957cfc66172ea91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: fix wakeups Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 652d380..a3970b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -246,6 +246,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern int runqueue_is_locked(void); + extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); diff --git a/kernel/sched.c b/kernel/sched.c index 673b588..9ca4a2e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -642,6 +642,24 @@ static inline void update_rq_clock(struct rq *rq) # define const_debug static const #endif +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. + */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + /* * Debugging: various feature bits */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 65173b1..2ca9d66 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -70,12 +70,13 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -/* - * FIXME: where should this be called? 
- */ void trace_wake_up(void) { - if (!(trace_flags & TRACE_ITER_BLOCK)) + /* + * The runqueue_is_locked() can fail, but this is the best we + * have for now: + */ + if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) wake_up(&trace_wait); } @@ -657,6 +658,8 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } void @@ -686,6 +689,8 @@ __trace_special(void *__tr, void *__data, entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } #endif @@ -759,6 +764,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } #ifdef CONFIG_FTRACE -- cgit v0.10.2 From 1a3c3034336320554a3342572dae98d69e054fc7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: fix __trace_special() Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index a3970b5..5b186be 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2119,6 +2119,18 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +#ifdef CONFIG_TRACING +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); +#else +static inline void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ +} +#endif + #ifdef CONFIG_CONTEXT_SWITCH_TRACER extern void ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next); @@ -2126,9 +2138,6 @@ extern void ftrace_wake_up_task(void *rq, struct task_struct *wakee, struct task_struct *curr); extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); #else static inline void ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) @@ -2146,11 +2155,6 @@ ftrace_wake_up_task(void *rq, struct task_struct *wakee, static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { } -static inline void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ -} #endif extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2ca9d66..65d2c0a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -670,8 +670,6 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } -#ifdef CONFIG_CONTEXT_SWITCH_TRACER - void __trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3) @@ -693,8 +691,6 @@ __trace_special(void *__tr, void *__data, trace_wake_up(); } -#endif - void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, -- cgit v0.10.2 From 4ac3ba41d372e3a9e420b36bc43589662b188a14 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: trace scheduler rbtree Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 65d2c0a..06380dc 100644 --- a/kernel/trace/trace.c +++ 
b/kernel/trace/trace.c @@ -129,6 +129,7 @@ static const char *trace_options[] = { "bin", "block", "stacktrace", + "sched-tree", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 75e2374..a520157 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -286,6 +286,7 @@ enum trace_iterator_flags { TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, + TRACE_ITER_SCHED_TREE = 0x200, }; #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 12658b3..5555b90 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -36,7 +36,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) if (likely(disabled == 1)) { tracing_sched_switch_trace(tr, data, prev, next, flags); - ftrace_all_fair_tasks(__rq, tr, data); + if (trace_flags & TRACE_ITER_SCHED_TREE) + ftrace_all_fair_tasks(__rq, tr, data); } atomic_dec(&data->disabled); @@ -62,7 +63,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) if (likely(disabled == 1)) { tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); - ftrace_all_fair_tasks(__rq, tr, data); + if (trace_flags & TRACE_ITER_SCHED_TREE) + ftrace_all_fair_tasks(__rq, tr, data); } atomic_dec(&data->disabled); -- cgit v0.10.2 From c7078de1aaf562a31b20984409c38cc1b40fa8a3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: add tracing_cpumask Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06380dc..18f0ab8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -70,6 +70,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +/* + * Only trace on a CPU if the bitmask is set: + */ +static cpumask_t tracing_cpumask __read_mostly = CPU_MASK_ALL; + +/* + * The tracer itself will not take this lock, but still we want + * to provide a consistent cpumask to user-space: + */ +static DEFINE_MUTEX(tracing_cpumask_update_lock); + +/* + * Temporary storage for the character representation of the + * CPU bitmask: + */ +static char mask_str[NR_CPUS]; + void trace_wake_up(void) { /* @@ -1754,9 +1771,49 @@ static struct file_operations tracing_lt_fops = { }; static struct file_operations show_traces_fops = { - .open = show_traces_open, - .read = seq_read, - .release = seq_release, + .open = show_traces_open, + .read = seq_read, + .release = seq_release, +}; + +static ssize_t +tracing_cpumask_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + int err; + + count = min(count, (size_t)NR_CPUS); + + mutex_lock(&tracing_cpumask_update_lock); + cpumask_scnprintf(mask_str, NR_CPUS, tracing_cpumask); + err = copy_to_user(ubuf, mask_str, count); + if (err) + count = -EFAULT; + mutex_unlock(&tracing_cpumask_update_lock); + + return count; +} + +static ssize_t +tracing_cpumask_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int err; + + mutex_lock(&tracing_cpumask_update_lock); + err = cpumask_parse_user(ubuf, count, tracing_cpumask); + mutex_unlock(&tracing_cpumask_update_lock); + + if (err) + return err; + + return count; +} + +static struct file_operations tracing_cpumask_fops = { + .open = tracing_open_generic, + .read = tracing_cpumask_read, + .write = tracing_cpumask_write, }; static ssize_t @@ -1837,9 +1894,9 @@ tracing_iter_ctrl_write(struct file *filp, const 
char __user *ubuf, } static struct file_operations tracing_iter_fops = { - .open = tracing_open_generic, - .read = tracing_iter_ctrl_read, - .write = tracing_iter_ctrl_write, + .open = tracing_open_generic, + .read = tracing_iter_ctrl_read, + .write = tracing_iter_ctrl_write, }; static const char readme_msg[] = @@ -1870,8 +1927,8 @@ tracing_readme_read(struct file *filp, char __user *ubuf, } static struct file_operations tracing_readme_fops = { - .open = tracing_open_generic, - .read = tracing_readme_read, + .open = tracing_open_generic, + .read = tracing_readme_read, }; static ssize_t @@ -2334,6 +2391,11 @@ static __init void tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); + entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); + entry = debugfs_create_file("latency_trace", 0444, d_tracer, &global_trace, &tracing_lt_fops); if (!entry) -- cgit v0.10.2 From 36dfe9252bd4c9b55e8454363fb7e444c92c5030 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: make use of tracing_cpumask Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 18f0ab8..3ae17e2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -70,23 +70,6 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -/* - * Only trace on a CPU if the bitmask is set: - */ -static cpumask_t tracing_cpumask __read_mostly = CPU_MASK_ALL; - -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask: - */ -static char mask_str[NR_CPUS]; - void trace_wake_up(void) { /* @@ -1776,19 +1759,46 @@ static struct file_operations show_traces_fops = { .release = seq_release, }; +/* + * Only trace on a CPU if the bitmask is set: + */ +static cpumask_t tracing_cpumask = CPU_MASK_ALL; + +/* + * When tracing/tracing_cpumask is modified, this holds + * the new bitmask we are about to install: + */ +static cpumask_t tracing_cpumask_new; + +/* + * The tracer itself will not take this lock, but still we want + * to provide a consistent cpumask to user-space: + */ +static DEFINE_MUTEX(tracing_cpumask_update_lock); + +/* + * Temporary storage for the character representation of the + * CPU bitmask (and one more byte for the newline): + */ +static char mask_str[NR_CPUS + 1]; + static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - int err; - - count = min(count, (size_t)NR_CPUS); + int len; mutex_lock(&tracing_cpumask_update_lock); - cpumask_scnprintf(mask_str, NR_CPUS, tracing_cpumask); - err = copy_to_user(ubuf, mask_str, count); - if (err) - count = -EFAULT; + + len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + if (count - len < 2) { + count = -EINVAL; + goto out_err; + } + len += sprintf(mask_str + len, "\n"); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + +out_err: mutex_unlock(&tracing_cpumask_update_lock); return count; @@ -1798,16 +1808,40 @@ static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { - int err; + int err, cpu; mutex_lock(&tracing_cpumask_update_lock); - err = 
cpumask_parse_user(ubuf, count, tracing_cpumask); - mutex_unlock(&tracing_cpumask_update_lock); - + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); if (err) - return err; + goto err_unlock; + + spin_lock_irq(&ftrace_max_lock); + for_each_possible_cpu(cpu) { + /* + * Increase/decrease the disabled counter if we are + * about to flip a bit in the cpumask: + */ + if (cpu_isset(cpu, tracing_cpumask) && + !cpu_isset(cpu, tracing_cpumask_new)) { + atomic_inc(&global_trace.data[cpu]->disabled); + } + if (!cpu_isset(cpu, tracing_cpumask) && + cpu_isset(cpu, tracing_cpumask_new)) { + atomic_dec(&global_trace.data[cpu]->disabled); + } + } + spin_unlock_irq(&ftrace_max_lock); + + tracing_cpumask = tracing_cpumask_new; + + mutex_unlock(&tracing_cpumask_update_lock); return count; + +err_unlock: + mutex_unlock(&tracing_cpumask_update_lock); + + return err; } static struct file_operations tracing_cpumask_fops = { @@ -1846,8 +1880,7 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); - r = simple_read_from_buffer(ubuf, cnt, ppos, - buf, r); + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); kfree(buf); -- cgit v0.10.2 From d9af56fbd8b37181bffaad060f74aa2e17382874 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: fix cmdline tracing Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5555b90..5a217e8 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,6 +29,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) if (!tracer_enabled) return; + tracing_record_cmdline(prev); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -56,6 +58,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) if (!tracer_enabled) return; + tracing_record_cmdline(curr); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -75,8 +79,6 @@ void ftrace_ctx_switch(void *__rq, struct task_struct *prev, struct task_struct *next) { - tracing_record_cmdline(prev); - /* * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. 
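 * (An aside on the fix in this patch: tracing_record_cmdline() now
 *  runs inside ctx_switch_func()/wakeup_func(), after the
 *  tracer_enabled check shown above, so command lines are only
 *  recorded while the tracer is actually on.)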
@@ -93,8 +95,6 @@ void ftrace_wake_up_task(void *__rq, struct task_struct *wakee, struct task_struct *curr) { - tracing_record_cmdline(curr); - wakeup_func(__rq, wakee, curr); /* -- cgit v0.10.2 From 442e544ce52d4415a024920b84fb95c5f9aa0855 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: iter ctrl fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ae17e2..688b4cf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1920,6 +1920,11 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, break; } } + /* + * If no option could be set, return an error: + */ + if (!trace_options[i]) + return -EINVAL; filp->f_pos += cnt; -- cgit v0.10.2 From f29c73fe3404f8799ed57aaf48859e0b55fc071f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: include cpu in stacktrace Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 688b4cf..3a40324 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1227,10 +1227,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); } else { - if (entry->type != TRACE_STACK) { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); - } + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); } switch (entry->type) { case TRACE_FN: @@ -1293,17 +1291,15 @@ static int print_trace_fmt(struct trace_iterator *iter) usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - if (entry->type != TRACE_STACK) { - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); - if (!ret) - return 0; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return 0; - } + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; switch (entry->type) { case TRACE_FN: -- cgit v0.10.2 From 36fc25a9f48deacd8aa08cd2d1c186a4e412604f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: sched tree fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 9ca4a2e..3bc7c53 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2414,6 +2414,23 @@ static int sched_balance_self(int cpu, int flag) #ifdef CONFIG_CONTEXT_SWITCH_TRACER +void ftrace_task(struct task_struct *p, void *__tr, void *__data) +{ +#if 0 + /* + * trace timeline tree + */ + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); +#else + /* + * trace balance metrics + */ + __trace_special(__tr, __data, + p->pid, p->se.avg_overlap, 0); +#endif +} + void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { struct task_struct *p; @@ -2421,32 +2438,22 @@ void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) struct rb_node *curr; struct rq *rq = __rq; - curr = first_fair(&rq->cfs); - if (!curr) - return; - if (rq->cfs.curr) { p = task_of(rq->cfs.curr); - __trace_special(__tr, __data, - p->pid, p->se.vruntime, p->se.sum_exec_runtime); + ftrace_task(p, __tr, __data); } if (rq->cfs.next) { p = task_of(rq->cfs.next); - 
__trace_special(__tr, __data, - p->pid, p->se.vruntime, p->se.sum_exec_runtime); + ftrace_task(p, __tr, __data); } - while (curr) { + for (curr = first_fair(&rq->cfs); curr; curr = rb_next(curr)) { se = rb_entry(curr, struct sched_entity, run_node); if (!entity_is_task(se)) continue; p = task_of(se); - - __trace_special(__tr, __data, - p->pid, p->se.vruntime, p->se.sum_exec_runtime); - - curr = rb_next(curr); + ftrace_task(p, __tr, __data); } } -- cgit v0.10.2 From 88a4216c3ec4281fc7e6725cc3a3ccd01fb1aa14 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: sched special Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 5b186be..360ca99 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2138,6 +2138,8 @@ extern void ftrace_wake_up_task(void *rq, struct task_struct *wakee, struct task_struct *curr); extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); +extern void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); #else static inline void ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) @@ -2155,6 +2157,10 @@ ftrace_wake_up_task(void *rq, struct task_struct *wakee, static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { } +static inline void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ +} #endif extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e24ecd3..dc1856f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1061,6 +1061,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE)) return 0; + ftrace_special(__LINE__, curr->se.avg_overlap, sync); + ftrace_special(__LINE__, p->se.avg_overlap, -1); /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly @@ -1238,6 +1240,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se == pse)) return; + ftrace_special(__LINE__, p->pid, se->last_wakeup); cfs_rq_of(pse)->next = pse; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a40324..b87a264 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1251,7 +1251,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) comm); break; case TRACE_SPECIAL: - trace_seq_printf(s, " %ld %ld %ld\n", + trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1335,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: - ret = trace_seq_printf(s, " %ld %ld %ld\n", + ret = trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1400,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter) break; case TRACE_SPECIAL: case TRACE_STACK: - ret = trace_seq_printf(s, " %ld %ld %ld\n", + ret = trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5a217e8..bddf676 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -103,6 +103,30 @@ ftrace_wake_up_task(void *__rq, struct task_struct *wakee, wakeup_sched_wakeup(wakee, curr); } +void 
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array *tr = ctx_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (!tracer_enabled) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __trace_special(tr, data, arg1, arg2, arg3); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static void sched_switch_reset(struct trace_array *tr) { int cpu; -- cgit v0.10.2 From bac524d3f3dfeffa3a9d44f2c64035b88bcaacb4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: trace next state Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b87a264..b63fe90 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -736,6 +736,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; + entry->ctx.next_state = next->state; __trace_stack(tr, data, flags, 4); spin_unlock_irqrestore(&data->lock, irq_flags); } @@ -759,6 +760,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.prev_state = curr->state; entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = wakee->prio; + entry->ctx.next_state = wakee->state; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); @@ -1207,7 +1209,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) unsigned long abs_usecs; unsigned long rel_usecs; char *comm; - int S; + int S, T; int i; if (!next_entry) @@ -1241,14 +1243,17 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + comm = trace_find_cmdline(entry->ctx.next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d %s\n", + trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", entry->ctx.next_pid, entry->ctx.next_prio, - comm); + T, comm); break; case TRACE_SPECIAL: trace_seq_printf(s, "# %ld %ld %ld\n", @@ -1280,7 +1285,7 @@ static int print_trace_fmt(struct trace_iterator *iter) unsigned long secs; char *comm; int ret; - int S; + int S, T; int i; entry = iter->ent; @@ -1324,13 +1329,16 @@ static int print_trace_fmt(struct trace_iterator *iter) case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; - ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d\n", + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; + ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, entry->type == TRACE_CTX ? 
"==>" : " +", entry->ctx.next_pid, - entry->ctx.next_prio); + entry->ctx.next_prio, + T); if (!ret) return 0; break; @@ -1367,7 +1375,7 @@ static int print_raw_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry; int ret; - int S; + int S, T; entry = iter->ent; @@ -1387,14 +1395,17 @@ static int print_raw_fmt(struct trace_iterator *iter) case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - ret = trace_seq_printf(s, "%d %d %c %d %d\n", + ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, entry->ctx.next_pid, - entry->ctx.next_prio); + entry->ctx.next_prio, + T); if (!ret) return 0; break; @@ -1428,7 +1439,7 @@ static int print_hex_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; - int S; + int S, T; entry = iter->ent; @@ -1445,6 +1456,8 @@ static int print_hex_fmt(struct trace_iterator *iter) case TRACE_WAKE: S = entry->ctx.prev_state < sizeof(state_to_char) ? state_to_char[entry->ctx.prev_state] : 'X'; + T = entry->ctx.next_state < sizeof(state_to_char) ? + state_to_char[entry->ctx.next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); @@ -1453,6 +1466,7 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + SEQ_PUT_HEX_FIELD_RET(s, T); break; case TRACE_SPECIAL: case TRACE_STACK: @@ -1488,6 +1502,7 @@ static int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); + SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); break; case TRACE_SPECIAL: case TRACE_STACK: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a520157..96951a8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -23,6 +23,7 @@ struct ctx_switch_entry { unsigned char prev_state; unsigned int next_pid; unsigned char next_prio; + unsigned char next_state; }; /* -- cgit v0.10.2 From 5429db2d26a59903c81a4f6c6dae7eb9daaea5fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: fix wakeup callback Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 3bc7c53..1ec3fb2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2533,7 +2533,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ - ftrace_wake_up_task(rq, p, rq->curr); schedstat_inc(p, se.nr_wakeups); if (sync) schedstat_inc(p, se.nr_wakeups_sync); @@ -2548,6 +2547,7 @@ out_activate: success = 1; out_running: + ftrace_wake_up_task(rq, p, rq->curr); check_preempt_curr(rq, p); p->state = TASK_RUNNING; -- cgit v0.10.2 From 694379e9ed4f2f6babe111bf001c69e2e263338b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: make it more available in the Kconfig Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index eb1988e..ebc158e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ 
-14,7 +14,7 @@ config TRACING config FTRACE bool "Kernel Function Tracer" - depends on DEBUG_KERNEL && HAVE_FTRACE + depends on HAVE_FTRACE select FRAME_POINTER select TRACING select CONTEXT_SWITCH_TRACER @@ -72,7 +72,6 @@ config PREEMPT_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -82,7 +81,6 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" - depends on DEBUG_KERNEL select TRACING select MARKERS help -- cgit v0.10.2 From 8f96da02c14d722ad9a3713cd7273ce28c9036ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: remove wakeup from function trace trace_function is called by mcount and calling wake_up from that can have unpredictable results. This patch removes the wakeup from trace_function. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b63fe90..736dcfb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -659,8 +659,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); - - trace_wake_up(); } void -- cgit v0.10.2 From 4fe8c3048cd8280a54256bca9cac2007bd546c33 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: printk and trace irqsoff and wakeups printk called from wakeup critical timings and irqs off can cause deadlocks since printk might do a wakeup itself. If the call to printk happens with the runqueue lock held, it can deadlock. This patch protects the printk from being called in trace irqs off with a test to see if the runqueue for the current CPU is locked. If it is locked, the printk is skipped. The wakeup always holds the runqueue lock, so the printk is simply removed. 
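In other words, latency reports are emitted only when it is provably safe to call printk. A condensed sketch of the guard (runqueue_is_locked() is the helper relied on in the diff below; the reporting function here is illustrative):

        #include <linux/kernel.h>
        #include <linux/sched.h>

        static void report_latency_safely(unsigned long latency)
        {
                /*
                 * printk() might itself do a wakeup, and wake-ups take the
                 * runqueue lock -- so printing while this CPU already holds
                 * rq->lock would recurse on that lock.  Skip the report.
                 */
                if (runqueue_is_locked())
                        return;

                printk(KERN_INFO "new %lu us maximum-latency critical section.\n",
                       latency);
        }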
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7a4dc01..d0c1748 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -165,18 +165,20 @@ check_critical_timing(struct trace_array *tr, update_max_tr_single(tr, current, cpu); - if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d):" - " %lu us critical section violates %lu us threshold.\n", - current->comm, current->pid, - raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh)); - } else { - printk(KERN_INFO "(%16s-%-5d|#%d):" - " new %lu us maximum-latency critical section.\n", - current->comm, current->pid, - raw_smp_processor_id(), - latency); + if (!runqueue_is_locked()) { + if (tracing_thresh) { + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical" + " section violates %lu us threshold.\n", + current->comm, current->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh)); + } else { + printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us" + " maximum-latency critical section.\n", + current->comm, current->pid, + raw_smp_processor_id(), + latency); + } } max_sequence++; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 2a01242..5948011 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -106,19 +106,6 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) update_max_tr(tr, wakeup_task, wakeup_cpu); - if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d):" - " %lu us wakeup latency violates %lu us threshold.\n", - wakeup_task->comm, wakeup_task->pid, - raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh)); - } else { - printk(KERN_INFO "(%16s-%-5d|#%d):" - " new %lu us maximum wakeup latency.\n", - wakeup_task->comm, wakeup_task->pid, - cpu, latency); - } - out_unlock: __wakeup_reset(tr); spin_unlock_irqrestore(&wakeup_lock, flags); -- cgit v0.10.2 From 06fa75ab566c50e01bfd7b055bde85cf9b1bc98a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: add TRACE_STACK and TRACE_SPECIAL to selftest validation The selftest validation code checks for valid entries in the trace buffer. TRACE_STACK and TRACE_SPECIAL have been added to the code but not to the validator. This patch adds the two to prevent them from flagging a failure in the selftest. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 39dd452..92f4acb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -9,6 +9,8 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_FN: case TRACE_CTX: case TRACE_WAKE: + case TRACE_STACK: + case TRACE_SPECIAL: return 1; } return 0; @@ -180,7 +182,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* we should only have one item */ if (!ret && count != 1) { - printk(KERN_CONT ".. filter failed .."); + printk(KERN_CONT ".. filter failed count=%ld ..", count); ret = -1; goto out; } -- cgit v0.10.2 From d05cdb25d80f06f77aa6bddb53cd1390d4d91a0b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: fix dynamic ftrace selftest With the addition of the Makefile configuration changes that prevent tracing of functions in the ftrace code, all tracing of the ftrace code has been removed.
Unfortunately, one of the selftests relied on a function being traced. With the new change, the function was no longer traced and the test failed. This patch separates out the test function into its own file so that we can add the "-pg" flag to the compilation of that function, restoring the mcount call to it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c25a6cd..d9efbbf 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -4,6 +4,10 @@ ifdef CONFIG_FTRACE ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) + +# selftest needs instrumentation +CFLAGS_trace_selftest_dynamic.o = -pg +obj-y += trace_selftest_dynamic.o endif obj-$(CONFIG_FTRACE) += libftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 96951a8..98cbfd0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -244,6 +244,8 @@ extern int unregister_tracer_switch(struct tracer_switch_ops *ops); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +extern int DYN_FTRACE_TEST_NAME(void); #endif #ifdef CONFIG_FTRACE_STARTUP_TEST diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 92f4acb..83e55a2 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -107,14 +107,8 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) #ifdef CONFIG_DYNAMIC_FTRACE -#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func #define __STR(x) #x #define STR(x) __STR(x) -static int DYN_FTRACE_TEST_NAME(void) -{ - /* used to call mcount */ - return 0; -} /* Test dynamic code modification and ftrace filters */ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c new file mode 100644 index 0000000..54dd77c --- /dev/null +++ b/kernel/trace/trace_selftest_dynamic.c @@ -0,0 +1,7 @@ +#include "trace.h" + +int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} -- cgit v0.10.2 From 4d9493c90f8e6e1b164aede3814010a290161abb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:54 +0200 Subject: ftrace: remove ad-hoc code Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index 1ec3fb2..ad95cca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2412,53 +2412,6 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ -#ifdef CONFIG_CONTEXT_SWITCH_TRACER - -void ftrace_task(struct task_struct *p, void *__tr, void *__data) -{ -#if 0 - /* - * trace timeline tree - */ - __trace_special(__tr, __data, - p->pid, p->se.vruntime, p->se.sum_exec_runtime); -#else - /* - * trace balance metrics - */ - __trace_special(__tr, __data, - p->pid, p->se.avg_overlap, 0); -#endif -} - -void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) -{ - struct task_struct *p; - struct sched_entity *se; - struct rb_node *curr; - struct rq *rq = __rq; - - if (rq->cfs.curr) { - p = task_of(rq->cfs.curr); - ftrace_task(p, __tr, __data); - } - if (rq->cfs.next) { - p = task_of(rq->cfs.next); - ftrace_task(p, __tr, __data); - } - - for (curr = first_fair(&rq->cfs); curr; curr = rb_next(curr)) { - se = rb_entry(curr, struct sched_entity, run_node); - if (!entity_is_task(se)) - continue; - - p = task_of(se); - ftrace_task(p, __tr, 
__data); - } -} - -#endif - /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index dc1856f..e24ecd3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1061,8 +1061,6 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE)) return 0; - ftrace_special(__LINE__, curr->se.avg_overlap, sync); - ftrace_special(__LINE__, p->se.avg_overlap, -1); /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly @@ -1240,7 +1238,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se == pse)) return; - ftrace_special(__LINE__, p->pid, se->last_wakeup); cfs_rq_of(pse)->next = pse; /* diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index bddf676..5671db0 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -36,11 +36,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { + if (likely(disabled == 1)) tracing_sched_switch_trace(tr, data, prev, next, flags); - if (trace_flags & TRACE_ITER_SCHED_TREE) - ftrace_all_fair_tasks(__rq, tr, data); - } atomic_dec(&data->disabled); local_irq_restore(flags); @@ -65,11 +62,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { + if (likely(disabled == 1)) tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); - if (trace_flags & TRACE_ITER_SCHED_TREE) - ftrace_all_fair_tasks(__rq, tr, data); - } atomic_dec(&data->disabled); local_irq_restore(flags); -- cgit v0.10.2 From c5f888cae49dfe3e86d9d1e0dab2b63ceb057be3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: irqsoff use raw_smp_processor_id This patch changes the use of __get_cpu_var to an explicit call to raw_smp_processor_id() combined with the per_cpu() macro. On some debug configurations, the use of __get_cpu_var may cause ftrace to trigger and this can cause problems with the irqsoff tracing.
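The substitution pattern, as a minimal sketch (tracing_cpu is the irqsoff tracer's per-CPU flag from the diff below; the function wrapping it is illustrative):

        #include <linux/percpu.h>
        #include <linux/smp.h>

        static DEFINE_PER_CPU(int, tracing_cpu);

        static void mark_this_cpu_tracing(void)
        {
                int cpu = raw_smp_processor_id();

                /*
                 * __get_cpu_var(tracing_cpu) = 1; would go through
                 * smp_processor_id(), which on debug configurations is
                 * itself instrumented and can recurse into the tracer.
                 * Pinning the CPU number once with raw_smp_processor_id()
                 * and indexing explicitly avoids that.
                 */
                per_cpu(tracing_cpu, cpu) = 1;
        }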
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d0c1748..761f3ec 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -204,14 +204,14 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (likely(!tracer_enabled)) return; - if (__get_cpu_var(tracing_cpu)) + cpu = raw_smp_processor_id(); + + if (per_cpu(tracing_cpu, cpu)) return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!head_page(data)) || - atomic_read(&data->disabled)) + if (unlikely(!data) || atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); @@ -225,7 +225,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags); - __get_cpu_var(tracing_cpu) = 1; + per_cpu(tracing_cpu, cpu) = 1; atomic_dec(&data->disabled); } @@ -238,16 +238,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; + cpu = raw_smp_processor_id(); /* Always clear the tracing cpu on stopping the trace */ - if (unlikely(__get_cpu_var(tracing_cpu))) - __get_cpu_var(tracing_cpu) = 0; + if (unlikely(per_cpu(tracing_cpu, cpu))) + per_cpu(tracing_cpu, cpu) = 0; else return; if (!tracer_enabled) return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; if (unlikely(!data) || unlikely(!head_page(data)) || @@ -255,6 +255,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) return; atomic_inc(&data->disabled); + local_save_flags(flags); trace_function(tr, data, ip, parent_ip, flags); check_critical_timing(tr, data, parent_ip ? : ip, cpu); @@ -376,7 +377,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) static void __irqsoff_tracer_init(struct trace_array *tr) { irqsoff_trace = tr; - /* make sure that the tracer is visibel */ + /* make sure that the tracer is visible */ smp_wmb(); if (tr->ctrl) -- cgit v0.10.2 From 92205c2343527a863d660360599a4bf8cede77b0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: use raw_spin_lock in tracing Lock debugging, when enabled, causes huge performance problems for tracing. Having lock verification happen for every function call, because mcount calls spin_lock, can cripple the system. This patch converts the spin_locks used by ftrace into raw_spin_locks.
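The conversion follows one pattern throughout the diff below; condensed (the lock name here is illustrative):

        #include <linux/spinlock.h>
        #include <linux/irqflags.h>

        static raw_spinlock_t buffer_lock =
                (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;

        static void touch_trace_buffer(void)
        {
                unsigned long flags;

                /*
                 * The __raw_ variants bypass lockdep and the debug-spinlock
                 * hooks entirely, so mcount-instrumented lock-debug code
                 * cannot recurse into the tracer from here.
                 */
                raw_local_irq_save(flags);
                __raw_spin_lock(&buffer_lock);
                /* ... append to the per-CPU trace pages ... */
                __raw_spin_unlock(&buffer_lock);
                raw_local_irq_restore(flags);
        }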
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 736dcfb..3009aaf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -133,7 +133,8 @@ static const char *trace_options[] = { NULL }; -static DEFINE_SPINLOCK(ftrace_max_lock); +static raw_spinlock_t ftrace_max_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; /* * Copy the new maximum trace into the separate maximum-trace @@ -335,7 +336,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) int i; WARN_ON_ONCE(!irqs_disabled()); - spin_lock(&ftrace_max_lock); + __raw_spin_lock(&ftrace_max_lock); /* clear out all the previous traces */ for_each_possible_cpu(i) { data = tr->data[i]; @@ -344,7 +345,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) } __update_max_tr(tr, tsk, cpu); - spin_unlock(&ftrace_max_lock); + __raw_spin_unlock(&ftrace_max_lock); } /** @@ -360,7 +361,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) int i; WARN_ON_ONCE(!irqs_disabled()); - spin_lock(&ftrace_max_lock); + __raw_spin_lock(&ftrace_max_lock); for_each_possible_cpu(i) tracing_reset(max_tr.data[i]); @@ -368,7 +369,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_reset(data); __update_max_tr(tr, tsk, cpu); - spin_unlock(&ftrace_max_lock); + __raw_spin_unlock(&ftrace_max_lock); } int register_tracer(struct tracer *type) @@ -652,13 +653,15 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_FN; entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; - spin_unlock_irqrestore(&data->lock, irq_flags); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); } void @@ -678,14 +681,16 @@ __trace_special(void *__tr, void *__data, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); entry->type = TRACE_SPECIAL; entry->special.arg1 = arg1; entry->special.arg2 = arg2; entry->special.arg3 = arg3; - spin_unlock_irqrestore(&data->lock, irq_flags); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); trace_wake_up(); } @@ -725,7 +730,8 @@ tracing_sched_switch_trace(struct trace_array *tr, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; @@ -736,7 +742,8 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_prio = next->prio; entry->ctx.next_state = next->state; __trace_stack(tr, data, flags, 4); - spin_unlock_irqrestore(&data->lock, irq_flags); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); } void @@ -749,7 +756,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); entry = tracing_get_trace_entry(tr, data); 
tracing_generic_entry_update(entry, flags); entry->type = TRACE_WAKE; @@ -760,7 +768,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; entry->ctx.next_state = wakee->state; __trace_stack(tr, data, flags, 5); - spin_unlock_irqrestore(&data->lock, irq_flags); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); trace_wake_up(); } @@ -1824,7 +1833,8 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - spin_lock_irq(&ftrace_max_lock); + raw_local_irq_disable(); + __raw_spin_lock(&ftrace_max_lock); for_each_possible_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -1839,7 +1849,8 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, atomic_dec(&global_trace.data[cpu]->disabled); } } - spin_unlock_irq(&ftrace_max_lock); + __raw_spin_unlock(&ftrace_max_lock); + raw_local_irq_enable(); tracing_cpumask = tracing_cpumask_new; @@ -2299,7 +2310,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; - spin_lock(&data->lock); + __raw_spin_lock(&data->lock); } while (find_next_entry_inc(iter) != NULL) { @@ -2320,7 +2331,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; - spin_unlock(&data->lock); + __raw_spin_unlock(&data->lock); } for_each_cpu_mask(cpu, mask) { @@ -2538,8 +2549,7 @@ static int trace_alloc_page(void) /* Now that we successfully allocate a page per CPU, add them */ for_each_possible_cpu(i) { data = global_trace.data[i]; - spin_lock_init(&data->lock); - lockdep_set_class(&data->lock, &data->lock_key); + data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2547,8 +2557,7 @@ static int trace_alloc_page(void) #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; - spin_lock_init(&data->lock); - lockdep_set_class(&data->lock, &data->lock_key); + data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 98cbfd0..25cba28 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -76,7 +76,7 @@ struct trace_entry { struct trace_array_cpu { struct list_head trace_pages; atomic_t disabled; - spinlock_t lock; + raw_spinlock_t lock; struct lock_class_key lock_key; /* these fields get copied into max-trace: */ -- cgit v0.10.2 From 9d0a420b737f72d84fabebf29634d800cbf54538 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: remove function tracing from spinlock debug The debug functions in spin_lock debugging pollute the output of the function tracer. This patch adds the debug files in the lib directory to those that should not be compiled with mcount tracing.
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/lib/Makefile b/lib/Makefile index 6ca9e6e..d97ad11 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -10,8 +10,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ ifdef CONFIG_FTRACE # Do not profile string.o, since it may be used in early boot or vdso +# Also do not profile any debug utilities ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(if $(subst string,,$(basename $(notdir $@))), \ +KBUILD_CFLAGS = $(if $(filter-out %debug debug% string%,$(basename $(notdir $@))), \ $(ORIG_CFLAGS), \ $(subst -pg,,$(ORIG_CFLAGS))) endif -- cgit v0.10.2 From 1d09daa55d2e9bab7e7d30f0d05e5a7bc60b2a4a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: use Makefile to remove tracing from lockdep This patch removes the "notrace" annotation from lockdep and adds the debugging files in the kernel directory to those that should not be compiled with "-pg" mcount tracing. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/Makefile b/kernel/Makefile index 7e344e7..d2f80ea 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,14 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o +ifdef CONFIG_FTRACE +# Do not profile debug utilities +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(if $(filter-out lockdep% %debug,$(basename $(notdir $@))), \ + $(ORIG_CFLAGS), \ + $(subst -pg,,$(ORIG_CFLAGS))) +endif + obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ac46847..90a440c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -271,14 +271,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE]; ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) -notrace void lockdep_off(void) +void lockdep_off(void) { current->lockdep_recursion++; } EXPORT_SYMBOL(lockdep_off); -notrace void lockdep_on(void) +void lockdep_on(void) { current->lockdep_recursion--; } @@ -1041,7 +1041,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) * Return 1 otherwise and keep unchanged. * Return 0 on error. 
*/ -static noinline notrace int +static noinline int find_usage_backwards(struct lock_class *source, unsigned int depth) { struct lock_list *entry; @@ -1591,7 +1591,7 @@ static inline int validate_chain(struct task_struct *curr, * We are building curr_chain_key incrementally, so double-check * it from scratch, to make sure that it's done correctly: */ -static notrace void check_chain_key(struct task_struct *curr) +static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; @@ -1967,7 +1967,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, /* * Mark all held locks with a usage bit: */ -static notrace int +static int mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; @@ -2014,7 +2014,7 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void notrace trace_hardirqs_on_caller(unsigned long a0) +void trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; @@ -2060,7 +2060,7 @@ void notrace trace_hardirqs_on_caller(unsigned long a0) } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void notrace trace_hardirqs_on(void) +void trace_hardirqs_on(void) { trace_hardirqs_on_caller(CALLER_ADDR0); } @@ -2069,7 +2069,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void notrace trace_hardirqs_off_caller(unsigned long a0) +void trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; @@ -2094,7 +2094,7 @@ void notrace trace_hardirqs_off_caller(unsigned long a0) } EXPORT_SYMBOL(trace_hardirqs_off_caller); -void notrace trace_hardirqs_off(void) +void trace_hardirqs_off(void) { trace_hardirqs_off_caller(CALLER_ADDR0); } @@ -2260,7 +2260,7 @@ static inline int separate_irq_context(struct task_struct *curr, /* * Mark a lock with a usage bit, and validate the state transition: */ -static notrace int mark_lock(struct task_struct *curr, struct held_lock *this, +static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -2663,7 +2663,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) /* * Check whether we follow the irq-flags state precisely: */ -static notrace void check_flags(unsigned long flags) +static void check_flags(unsigned long flags) { #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) if (!debug_locks) @@ -2700,7 +2700,7 @@ static notrace void check_flags(unsigned long flags) * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ -notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass, +void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, unsigned long ip) { unsigned long flags; @@ -2723,7 +2723,7 @@ notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass, EXPORT_SYMBOL_GPL(lock_acquire); -notrace void lock_release(struct lockdep_map *lock, int nested, +void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; -- cgit v0.10.2 From c1d2327b36f2261ffa8ff7227321ba900c7eee7f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: restrict tracing to HAVE_FTRACE architectures Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ebc158e..f300571 100644 --- 
a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -32,6 +32,7 @@ config IRQSOFF_TRACER default n depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME + depends on HAVE_FTRACE select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -54,6 +55,7 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT + depends on HAVE_FTRACE select TRACING select TRACER_MAX_TRACE help @@ -72,6 +74,7 @@ config PREEMPT_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" + depends on HAVE_FTRACE select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -81,6 +84,7 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" + depends on HAVE_FTRACE select TRACING select MARKERS help -- cgit v0.10.2 From 07a267cdd2fd7d1de9455b1e36a1635ace7276c7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:55 +0200 Subject: ftrace: set INTERRUPTIBLE state for kftraced on disable When dynamic ftrace fails and sets itself disabled, the ftraced daemon will go back to sleep every time it wakes up. The setting of the ftraced state to TASK_INTERRUPTIBLE is skipped in this process, and the daemon takes up 100% of the CPU. This patch makes sure the ftraced daemon sets itself to TASK_INTERRUPTIBLE in that loop. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 281d97a..40f64f7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -630,10 +630,10 @@ static int ftraced(void *ignore) { unsigned long usecs; - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + /* check once a second */ schedule_timeout(HZ); @@ -667,8 +667,6 @@ static int ftraced(void *ignore) wake_up_interruptible(&ftraced_waiters); ftrace_shutdown_replenish(); - - set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return 0; -- cgit v0.10.2 From d15f57f23eaba975309a153b23699cd0c0236974 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:56 +0200 Subject: ftrace: fix mutex unlock in trace output If the trace output changes on reading the trace files, there is a chance that the start function will return NULL. If the start function of a sequence returns NULL the stop equivalent is not called. In this case, all locks that are taken must be released even if they are released in the stop function. This patch fixes a case that a mutex was not released on return of NULL in the start sequence function. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3009aaf..ea11f4e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -964,8 +964,10 @@ static void *s_start(struct seq_file *m, loff_t *pos) mutex_lock(&trace_types_lock); - if (!current_trace || current_trace != iter->trace) + if (!current_trace || current_trace != iter->trace) { + mutex_unlock(&trace_types_lock); return NULL; + } atomic_inc(&trace_record_cmdline_disabled); -- cgit v0.10.2 From 30afdcb1de0a37a2086145a82ca3febebe47d019 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:56 +0200 Subject: ftrace: selftest protect against max flip There is a slight race condition in the selftest: the wakeup and irqs/preemption-off tests can be doing a max update while the buffers are being tested. If this happens the system can crash with a GPF.
This patch adds the max update spinlock around the checking of the buffers to prevent such a race. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 83e55a2..395274e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -82,10 +82,12 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) */ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) { - unsigned long cnt = 0; - int cpu; - int ret = 0; + unsigned long flags, cnt = 0; + int cpu, ret = 0; + /* Don't allow flipping of max traces now */ + raw_local_irq_save(flags); + __raw_spin_lock(&ftrace_max_lock); for_each_possible_cpu(cpu) { if (!head_page(tr->data[cpu])) continue; @@ -96,6 +98,8 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) if (ret) break; } + __raw_spin_unlock(&ftrace_max_lock); + raw_local_irq_restore(flags); if (count) *count = cnt; -- cgit v0.10.2 From a56be3fe2f65f9f776e727bfd382e35db75911d6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:56 +0200 Subject: ftrace: fix the fault label in updating code The fault label to jump to on a fault while updating the code was misplaced, preventing the fault from being recorded. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9f44623..498608c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -93,8 +93,8 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, " movb %b4, 4(%2)\n" "2:\n" ".section .fixup, \"ax\"\n" - " movl $1, %0\n" - "3: jmp 2b\n" + "3: movl $1, %0\n" + " jmp 2b\n" ".previous\n" _ASM_EXTABLE(1b, 3b) : "=r"(faulted), "=a"(replaced) -- cgit v0.10.2 From 8f0f996e80b980fba07d11961d96a5fefb60976a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:56 +0200 Subject: ftrace: don't write protect kernel text Dynamic ftrace can't work when the kernel has its text write protected. This patch keeps the kernel from being write protected when dynamic ftrace is in place.
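The conflict is easy to see in miniature: dynamic ftrace rewrites every mcount call site in the live kernel image, so a write-protected text section makes the update fault. A deliberately simplified sketch (the real update is the fixup-protected ftrace_modify_code() shown in the preceding patch; the nop bytes and size here are x86 values, and the helper name is hypothetical):

        #include <linux/string.h>

        #define MCOUNT_INSN_SIZE 5      /* x86: size of a "call mcount" */

        static void nop_out_mcount_callsite(unsigned long ip)
        {
                /* 5-byte nop used to overwrite the call instruction */
                static const unsigned char nop[MCOUNT_INSN_SIZE] =
                        { 0x0f, 0x1f, 0x44, 0x00, 0x00 };

                /* faults if mark_rodata_ro() write-protected this page */
                memcpy((void *)ip, nop, MCOUNT_INSN_SIZE);
        }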
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ec30d10..f96eca2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -710,6 +710,8 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; +#ifndef CONFIG_DYNAMIC_FTRACE + /* Dynamic tracing modifies the kernel text section */ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel text: %luk\n", size >> 10); @@ -722,6 +724,8 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif +#endif /* CONFIG_DYNAMIC_FTRACE */ + start += size; size = (unsigned long)__end_rodata - start; set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 32ba13b..41824e7 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -766,6 +766,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); + unsigned long rodata_start = + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + +#ifdef CONFIG_DYNAMIC_FTRACE + /* Dynamic tracing modifies the kernel text section */ + start = rodata_start; +#endif printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -775,8 +782,7 @@ void mark_rodata_ro(void) * The rodata section (but not the kernel text!) should also be * not-executable. */ - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; - set_memory_nx(start, (end - start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (end - start) >> PAGE_SHIFT); rodata_test(); -- cgit v0.10.2 From 86069782d62e731b4835a0cf8eb7d1d0e17cf306 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:56 +0200 Subject: x86: add a list for custom page fault handlers. Provides kernel modules a way to register custom page fault handlers. On every page fault this will call a list of registered functions. The functions may handle the fault and force do_page_fault() to return immediately. This functionality is similar to the now removed page fault notifiers. Custom page fault handlers are used by debugging and reverse engineering tools. Mmiotrace is one such tool and a patch to add it into the tree will follow. The custom page fault handlers are called earlier in do_page_fault() than the page fault notifiers were. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ac1e31b..9431a83 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -168,6 +168,14 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config PAGE_FAULT_HANDLERS + bool "Custom page fault handlers" + depends on DEBUG_KERNEL + help + Allow the use of custom page fault handlers. A kernel module may + register a function that is called on every page fault. Custom + handlers are used by some debugging and reverse engineering tools. 
+ # # IO delay types: # diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd7e179..343f5c1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -49,6 +49,60 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) +#ifdef CONFIG_PAGE_FAULT_HANDLERS +static HLIST_HEAD(pf_handlers); /* protected by RCU */ +static DEFINE_SPINLOCK(pf_handlers_writer); + +void register_page_fault_handler(struct pf_handler *new_pfh) +{ + unsigned long flags; + spin_lock_irqsave(&pf_handlers_writer, flags); + hlist_add_head_rcu(&new_pfh->hlist, &pf_handlers); + spin_unlock_irqrestore(&pf_handlers_writer, flags); +} +EXPORT_SYMBOL_GPL(register_page_fault_handler); + +/** + * unregister_page_fault_handler: + * The caller must ensure @old_pfh is not in use anymore before freeing it. + * This function does not guarantee it. The list of handlers is protected by + * RCU, so you can do this by e.g. calling synchronize_rcu(). + */ +void unregister_page_fault_handler(struct pf_handler *old_pfh) +{ + unsigned long flags; + spin_lock_irqsave(&pf_handlers_writer, flags); + hlist_del_rcu(&old_pfh->hlist); + spin_unlock_irqrestore(&pf_handlers_writer, flags); +} +EXPORT_SYMBOL_GPL(unregister_page_fault_handler); +#endif + +/* returns non-zero if do_page_fault() should return */ +static int handle_custom_pf(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ +#ifdef CONFIG_PAGE_FAULT_HANDLERS + int ret = 0; + struct pf_handler *cur; + struct hlist_node *ncur; + + if (hlist_empty(&pf_handlers)) + return 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(cur, ncur, &pf_handlers, hlist) { + ret = cur->handler(regs, error_code, address); + if (ret) + break; + } + rcu_read_unlock(); + return ret; +#else + return 0; +#endif +} + static inline int notify_page_fault(struct pt_regs *regs) { #ifdef CONFIG_KPROBES @@ -601,6 +655,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (notify_page_fault(regs)) return; + if (handle_custom_pf(regs, error_code, address)) + return; /* * We fault-in kernel-space virtual memory on-demand. The diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index 96651bb..a80f2d6 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -35,4 +35,13 @@ extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); +struct pf_handler { + struct hlist_node hlist; + int (*handler)(struct pt_regs *regs, unsigned long error_code, + unsigned long address); +}; + +extern void register_page_fault_handler(struct pf_handler *new_pfh); +extern void unregister_page_fault_handler(struct pf_handler *old_pfh); + #endif -- cgit v0.10.2 From 72829bc3d63cdc592d8f7dd86ad3b3fe8900fb74 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 May 2008 21:37:28 +0200 Subject: ftrace: move enums to ftrace.h and make helper function global picked from the mmiotracer patches to disentangle the patch queues.
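As an aside on the custom page fault handler API introduced in the previous patch: a module would plug into it roughly as follows (a sketch against the declarations in kdebug.h above; the handler and module names are hypothetical):

        #include <linux/module.h>
        #include <linux/rcupdate.h>
        #include <asm/kdebug.h>

        /* hypothetical handler: observe faults, never swallow them */
        static int example_pf(struct pt_regs *regs, unsigned long error_code,
                              unsigned long address)
        {
                /* returning non-zero would make do_page_fault() return at once */
                return 0;
        }

        static struct pf_handler example_pfh = {
                .handler = example_pf,
        };

        static int __init example_init(void)
        {
                register_page_fault_handler(&example_pfh);
                return 0;
        }

        static void __exit example_exit(void)
        {
                unregister_page_fault_handler(&example_pfh);
                /* list is RCU-protected: wait before the handler can be freed */
                synchronize_rcu();
        }

        module_init(example_init);
        module_exit(example_exit);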
Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ea11f4e..0eef050 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -37,7 +37,7 @@ unsigned long __read_mostly tracing_thresh; static int tracing_disabled = 1; -static long +long ns2usecs(cycle_t nsec) { nsec += 500; @@ -96,18 +96,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } -enum trace_type { - __TRACE_FIRST_TYPE = 0, - - TRACE_FN, - TRACE_CTX, - TRACE_WAKE, - TRACE_STACK, - TRACE_SPECIAL, - - __TRACE_LAST_TYPE -}; - enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, TRACE_FLAG_NEED_RESCHED = 0x02, @@ -190,7 +178,7 @@ void *head_page(struct trace_array_cpu *data) return page_address(page); } -static int +int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { int len = (PAGE_SIZE - 1) - s->len; @@ -205,7 +193,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_end(ap); /* If we can't write it all, don't bother writing anything */ - if (ret > len) + if (ret >= len) return 0; s->len += ret; @@ -638,7 +626,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) pc = preempt_count(); entry->preempt_count = pc & 0xff; - entry->pid = tsk->pid; + entry->pid = (tsk) ? tsk->pid : 0; entry->t = ftrace_now(raw_smp_processor_id()); entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | @@ -1541,6 +1529,9 @@ static int trace_empty(struct trace_iterator *iter) static int print_trace_line(struct trace_iterator *iter) { + if (iter->trace && iter->trace->print_line) + return iter->trace->print_line(iter); + if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); @@ -2162,6 +2153,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return -ENOMEM; iter->tr = &global_trace; + iter->trace = current_trace; filp->private_data = iter; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 25cba28..b0ca747 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -6,6 +6,18 @@ #include #include +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + TRACE_WAKE, + TRACE_STACK, + TRACE_SPECIAL, + + __TRACE_LAST_TYPE +}; + /* * Function trace entry - function address and parent function addres: */ @@ -130,6 +142,7 @@ struct tracer { int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif + int (*print_line)(struct trace_iterator *iter); struct tracer *next; int print_max; }; @@ -276,6 +289,8 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern long ns2usecs(cycle_t nsec); extern unsigned long trace_flags; -- cgit v0.10.2 From d17d969160c18b631a19c2b34d260691402650f8 Mon Sep 17 00:00:00 2001 From: Ankita Garg Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: fix conversion of task state to char in latency tracer The conversion of task states to a character in the sched_switch tracer (part of the latency tracer infrastructure) is incorrect. We currently do it by indexing into the state_to_char array using the state value, but the state values are bit masks, not array indices, so they do not map directly into the array. The following patch addresses this issue. This matches what the show_task() routine in kernel/sched.c already does. The patch has been compile- and run-tested.
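The subtlety: task state values are bit flags (1, 2, 4, 8, ...), so they must first be converted to a bit position with __ffs(). Condensed from the fix below (the wrapper function is illustrative):

        #include <linux/sched.h>
        #include <linux/bitops.h>

        static char state_char(unsigned long state)
        {
                static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; /* "RSDTtZX" */
                unsigned bit = state ? __ffs(state) + 1 : 0;

                return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : 'X';
        }

        /*
         * e.g. a stopped task has state 0x04: indexing by the raw value reads
         * state_to_char[4] == 't' (traced), while __ffs(4) + 1 == 3 correctly
         * yields 'T'.
         */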
Signed-off-by: Ankita Garg Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0eef050..9197782 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1208,6 +1208,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) char *comm; int S, T; int i; + unsigned state; if (!next_entry) next_entry = entry; @@ -1238,11 +1239,11 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) break; case TRACE_CTX: case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; T = entry->ctx.next_state < sizeof(state_to_char) ? state_to_char[entry->ctx.next_state] : 'X'; + state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; + S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; comm = trace_find_cmdline(entry->ctx.next_pid); trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", entry->ctx.prev_pid, -- cgit v0.10.2 From 8487c23765b6e0444ec6b5f1530766d63fe68e35 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: allow trace_pipe to block on all reads We expect things like "cat" to block on reads to trace_pipe. That's what trace_pipe is for. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9197782..fd4ecc2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2231,8 +2231,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, start = 0; while (trace_empty(iter)) { - if (!(trace_flags & TRACE_ITER_BLOCK)) - return -EWOULDBLOCK; /* * This is a make-shift waitqueue. The reason we don't use * an actual wait queue is because: -- cgit v0.10.2 From b5685aede3b7b65e72ddc73b951aa1f70798a614 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: restore iterator trace in pipe read The trace iterator is reset in the read. We still need to restore the tracer that the trace_pipe was opened with. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd4ecc2..d141fc9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2202,6 +2202,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, { struct trace_iterator *iter = filp->private_data; struct trace_array_cpu *data; + struct trace_array *tr = iter->tr; + struct tracer *tracer = iter->trace; static cpumask_t mask; static int start; unsigned long flags; @@ -2274,7 +2276,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cnt = PAGE_SIZE - 1; memset(iter, 0, sizeof(*iter)); - iter->tr = &global_trace; + iter->tr = tr; + iter->trace = tracer; iter->pos = -1; /* -- cgit v0.10.2 From 845279972f1736c3463c9cebd1bad92a0a347176 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: return EOF in trace_pipe on change of tracer Break out of while loop with EOF when the current_trace changes. 
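Taken together with the two patches above, trace_pipe now behaves like a pipe to user space; a minimal reader, as a sketch (assuming debugfs is mounted at /debug, as elsewhere in this series):

        #include <stdio.h>
        #include <fcntl.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[4096];
                ssize_t n;
                int fd = open("/debug/tracing/trace_pipe", O_RDONLY);

                if (fd < 0)
                        return 1;
                /* read() blocks until trace entries arrive ... */
                while ((n = read(fd, buf, sizeof(buf))) > 0)
                        fwrite(buf, 1, n, stdout);
                /* ... and returns 0 (EOF) if current_tracer changes meanwhile */
                close(fd);
                return 0;
        }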
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d141fc9..2af9404 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2253,6 +2253,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (signal_pending(current)) return -EINTR; + if (iter->trace != current_trace) + return 0; + /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never -- cgit v0.10.2 From 2dc8f09571a61d9cb3dc47bba6608689dfe15d16 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: trace_pipe implement NONBLOCK This patch implements "NONBLOCK" for trace_pipe. If trace_pipe is opened with O_NONBLOCK, then the trace_pipe read will not block when the buffer is empty. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2af9404..3b4eaf3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2233,6 +2233,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, start = 0; while (trace_empty(iter)) { + + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + /* * This is a make-shift waitqueue. The reason we don't use * an actual wait queue is because: -- cgit v0.10.2 From 05bd68c514579e007b46e4fa0461b78416a3f4c2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:59 +0200 Subject: ftrace: use proper API for setting RT prios in selftest The wakeup selftest used an internal API for setting the test task priority. This patch fixes it to use the proper API for performing such a task. Thanks go to Randy Dunlap for pointing out this build failure. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 395274e..a5f6001 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -410,11 +410,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { - struct completion *x = data; - /* Make this a RT thread, doesn't need to be too high */ + struct sched_param param = { .sched_priority = 5 }; + struct completion *x = data; - rt_mutex_setprio(current, MAX_RT_PRIO - 5); + sched_setscheduler(current, SCHED_FIFO, &param); /* Make it know we have a new prio */ complete(x); -- cgit v0.10.2 From a98a3c3fde3ae7614f19758a043691b6f59dac53 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:59 +0200 Subject: ftrace: trace_entries to dynamically change trace buffer size This patch adds /debug/tracing/trace_entries, which allows users to see as well as modify the number of trace entries the buffers hold. The number of entries only changes in multiples of ENTRIES_PER_PAGE, which is calculated from the size of an entry and the number of entries that fit in a page. The user does not need to use an exact size; the entries will be rounded to one of the increments. Trying to set the entries to 0 will return with -EINVAL. To avoid race conditions, the modification of the buffer size can only be done when tracing is completely disabled (current_tracer == none). An info message will be printed if a user tries to modify the buffer size when not set to none.
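A usage sketch from user space (hypothetical entry count; paths assume debugfs on /debug as above):

        #include <fcntl.h>
        #include <unistd.h>

        int resize_trace_buffer(void)
        {
                int fd;

                /* resizing is refused while a tracer is active */
                fd = open("/debug/tracing/current_tracer", O_WRONLY);
                if (fd < 0)
                        return -1;
                write(fd, "none", 4);
                close(fd);

                fd = open("/debug/tracing/trace_entries", O_WRONLY);
                if (fd < 0)
                        return -1;
                /* the kernel rounds this to a multiple of ENTRIES_PER_PAGE */
                write(fd, "65536", 5);
                close(fd);
                return 0;
        }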
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3b4eaf3..4723e01 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -35,6 +35,15 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = +{ + .name = "none", +}; + +static int trace_alloc_page(void); +static int trace_free_page(void); + static int tracing_disabled = 1; long @@ -2364,6 +2373,70 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, return read; } +static ssize_t +tracing_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = sprintf(buf, "%lu\n", tr->entries); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_entries_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + + /* must have at least 1 entry */ + if (!val) + return -EINVAL; + + mutex_lock(&trace_types_lock); + + if (current_trace != &no_tracer) { + cnt = -EBUSY; + pr_info("ftrace: set current_tracer to none" + " before modifying buffer size\n"); + goto out; + } + + if (val > global_trace.entries) { + while (global_trace.entries < val) { + if (trace_alloc_page()) { + cnt = -ENOMEM; + goto out; + } + } + } else { + /* include the number of entries in val (inc of page entries) */ + while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) + trace_free_page(); + } + + filp->f_pos += cnt; + + out: + max_tr.entries = global_trace.entries; + mutex_unlock(&trace_types_lock); + + return cnt; +} + static struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -2389,6 +2462,12 @@ static struct file_operations tracing_pipe_fops = { .release = tracing_release_pipe, }; +static struct file_operations tracing_entries_fops = { + .open = tracing_open_generic, + .read = tracing_entries_read, + .write = tracing_entries_write, +}; + #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t @@ -2500,6 +2579,12 @@ static __init void tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'tracing_threash' entry\n"); + entry = debugfs_create_file("trace_entries", 0644, d_tracer, + &global_trace, &tracing_entries_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'tracing_threash' entry\n"); + #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, @@ -2510,12 +2595,6 @@ static __init void tracer_init_debugfs(void) #endif } -/* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = -{ - .name = "none", -}; - static int trace_alloc_page(void) { struct trace_array_cpu *data; @@ -2552,7 +2631,6 @@ static int trace_alloc_page(void) /* Now that we successfully allocate a page per CPU, add them */ for_each_possible_cpu(i) { data = global_trace.data[i]; - data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2560,7 +2638,6 @@ static int trace_alloc_page(void) #ifdef CONFIG_TRACER_MAX_TRACE data = 
max_tr.data[i]; - data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); @@ -2579,6 +2656,55 @@ static int trace_alloc_page(void) return -ENOMEM; } +static int trace_free_page(void) +{ + struct trace_array_cpu *data; + struct page *page; + struct list_head *p; + int i; + int ret = 0; + + /* free one page from each buffer */ + for_each_possible_cpu(i) { + data = global_trace.data[i]; + p = data->trace_pages.next; + if (p == &data->trace_pages) { + /* should never happen */ + WARN_ON(1); + tracing_disabled = 1; + ret = -1; + break; + } + page = list_entry(p, struct page, lru); + ClearPageLRU(page); + list_del(&page->lru); + __free_page(page); + + tracing_reset(data); + +#ifdef CONFIG_TRACER_MAX_TRACE + data = max_tr.data[i]; + p = data->trace_pages.next; + if (p == &data->trace_pages) { + /* should never happen */ + WARN_ON(1); + tracing_disabled = 1; + ret = -1; + break; + } + page = list_entry(p, struct page, lru); + ClearPageLRU(page); + list_del(&page->lru); + __free_page(page); + + tracing_reset(data); +#endif + } + global_trace.entries -= ENTRIES_PER_PAGE; + + return ret; +} + __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; @@ -2609,6 +2735,9 @@ __init static int tracer_alloc_buffers(void) /* use the LRU flag to differentiate the two buffers */ ClearPageLRU(page); + data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + /* Only allocate if we are actually using the max trace */ #ifdef CONFIG_TRACER_MAX_TRACE array = (void *)__get_free_page(GFP_KERNEL); -- cgit v0.10.2 From bb065afb8ebd07a03155502dba29ebf0f6fe67e8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: lockdep: update lockdep_recursion on graph_lock With the introduction of ftrace, it is possible to recurse into the lockdep functions via the mcount call. To prevent possible lockups, updating the lockdep_recursion counter on grabbing the internal lockdep_lock should prevent deadlocks. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 90a440c..65548ef 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -82,6 +82,8 @@ static int graph_lock(void) __raw_spin_unlock(&lockdep_lock); return 0; } + /* prevent any recursions within lockdep from causing deadlocks */ + current->lockdep_recursion++; return 1; } @@ -90,6 +92,7 @@ static inline int graph_unlock(void) if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); + current->lockdep_recursion--; __raw_spin_unlock(&lockdep_lock); return 0; } -- cgit v0.10.2 From 93dcc6ea096c51cc30ad0f6647ed05fead2439bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: simplify hexprint simplify hex to ascii conversion. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4723e01..627e399 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -248,19 +248,18 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) } #define HEX_CHARS 17 +static const char hex2asc[] = "0123456789abcdef"; static int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) { unsigned char hex[HEX_CHARS]; - unsigned char *data; + unsigned char *data = mem; unsigned char byte; int i, j; BUG_ON(len >= HEX_CHARS); - data = mem; - #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else @@ -268,22 +267,10 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) #endif byte = data[i]; - hex[j] = byte & 0x0f; - if (hex[j] >= 10) - hex[j] += 'a' - 10; - else - hex[j] += '0'; - j++; - - hex[j] = byte >> 4; - if (hex[j] >= 10) - hex[j] += 'a' - 10; - else - hex[j] += '0'; - j++; + hex[j++] = hex2asc[byte & 0x0f]; + hex[j++] = hex2asc[byte >> 4]; } - hex[j] = ' '; - j++; + hex[j++] = ' '; return trace_seq_putmem(s, hex, j); } -- cgit v0.10.2 From afc2abc0ae4265730a0fc48618012193a09a1d10 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: cleanups no code changed. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 627e399..d6b6057 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1155,12 +1155,12 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (hardirq && softirq) + if (hardirq && softirq) { trace_seq_putc(s, 'H'); - else { - if (hardirq) + } else { + if (hardirq) { trace_seq_putc(s, 'h'); - else { + } else { if (softirq) trace_seq_putc(s, 's'); else @@ -2177,8 +2177,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) * Always select as readable when in blocking mode */ return POLLIN | POLLRDNORM; - } - else { + } else { if (!trace_empty(iter)) return POLLIN | POLLRDNORM; poll_wait(filp, &trace_wait, poll_table); -- cgit v0.10.2 From cffae437cdfbc8a5370d38cefbff1dfe34dad6ca Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: simple clean ups Andrew Morton mentioned some clean ups that should be done to ftrace. This patch does some of the simple clean ups. 
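Most of the hunks below replace the silent clamp (cnt = 63) with an explicit -EINVAL when a write exceeds the local buffer: accepting only the first 63 bytes of a longer write could silently parse a different value than the user intended. A user-space sketch of the pattern, with parse_ulong_write() as a hypothetical stand-in for the debugfs write handlers:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Reject oversize input outright instead of clamping it: a truncated
 * number would still parse "successfully", just to the wrong value. */
static int parse_ulong_write(const char *ubuf, size_t cnt, unsigned long *val)
{
    char buf[64];

    if (cnt >= sizeof(buf))
        return -EINVAL;
    memcpy(buf, ubuf, cnt);     /* copy_from_user() in the kernel */
    buf[cnt] = '\0';
    *val = strtoul(buf, NULL, 10);
    return 0;
}

int main(void)
{
    const char *input = "65536\n";
    unsigned long val;

    if (parse_ulong_write(input, strlen(input), &val) == 0)
        printf("parsed %lu\n", val);
    return 0;
}
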
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d6b6057..0a36736 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -36,8 +36,7 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; /* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = -{ +static struct tracer no_tracer __read_mostly = { .name = "none", }; @@ -1906,8 +1905,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, int neg = 0; int i; - if (cnt > 63) - cnt = 63; + if (cnt >= sizeof(buf)) + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -1999,8 +1998,8 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, long val; char buf[64]; - if (cnt > 63) - cnt = 63; + if (cnt >= sizeof(buf)) + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2099,10 +2098,10 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf, char buf[64]; int r; - r = snprintf(buf, 64, "%ld\n", + r = snprintf(buf, sizeof(buf), "%ld\n", *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr)); - if (r > 64) - r = 64; + if (r > sizeof(buf)) + r = sizeof(buf); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -2114,8 +2113,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, long val; char buf[64]; - if (cnt > 63) - cnt = 63; + if (cnt >= sizeof(buf)) + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2378,8 +2377,8 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, unsigned long val; char buf[64]; - if (cnt > 63) - cnt = 63; + if (cnt >= sizeof(buf)) + return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; -- cgit v0.10.2 From c6caeeb142cd3a03c46107aac45c8effc02bbadb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: replace simple_strtoul with strict_strtoul Andrew Morton suggested using strict_strtoul over simple_strtoul. This patch replaces them in ftrace. 
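The practical difference: simple_strtoul("123abc", ...) quietly returns 123, so a garbled echo into a control file is accepted, while strict_strtoul fails unless the whole string is numeric. A rough user-space equivalent of the strict behaviour; strict_parse_ulong() is an illustrative name, and the kernel helper likewise accepts one trailing newline:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Fail unless the entire string, bar an optional trailing newline,
 * is a valid number -- the behaviour ftrace wants from its files. */
static int strict_parse_ulong(const char *s, int base, unsigned long *res)
{
    char *end;

    errno = 0;
    *res = strtoul(s, &end, base);
    if (errno || end == s)
        return -EINVAL;
    if (*end == '\n')
        end++;
    if (*end != '\0')
        return -EINVAL;
    return 0;
}

int main(void)
{
    unsigned long val;

    printf("\"123\"    -> %d\n", strict_parse_ulong("123", 10, &val));
    printf("\"123abc\" -> %d\n", strict_parse_ulong("123abc", 10, &val));
    return 0;
}
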
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0a36736..290e9da 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -92,9 +92,16 @@ void trace_wake_up(void) static int __init set_nr_entries(char *str) { + unsigned long nr_entries; + int ret; + if (!str) return 0; - trace_nr_entries = simple_strtoul(str, &str, 0); + ret = strict_strtoul(str, 0, &nr_entries); + /* nr_entries can not be zero */ + if (ret < 0 || nr_entries == 0) + return 0; + trace_nr_entries = nr_entries; return 1; } __setup("trace_entries=", set_nr_entries); @@ -1995,8 +2002,9 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - long val; char buf[64]; + long val; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2006,7 +2014,9 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - val = simple_strtoul(buf, NULL, 10); + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; val = !!val; @@ -2110,8 +2120,9 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { long *ptr = filp->private_data; - long val; char buf[64]; + long val; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2121,7 +2132,9 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - val = simple_strtoul(buf, NULL, 10); + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; *ptr = val * 1000; @@ -2376,6 +2389,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2385,7 +2399,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - val = simple_strtoul(buf, NULL, 10); + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; /* must have at least 1 entry */ if (!val) -- cgit v0.10.2 From ab46428c6969d50ecf6f6e97b7a84abba6274368 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: modulize the number of CPU buffers Currently ftrace allocates a trace buffer for every possible CPU. Work is being done to change it to only online CPUs and add hooks to hotplug CPUS. This patch lays out the infrastructure for such a change. 
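The mechanism is a private iterator over a cpumask: every for_each_possible_cpu() loop in the tracer becomes for_each_tracing_cpu(), so restricting buffers to online CPUs later only means changing tracing_buffer_mask and adding the hotplug hooks. A toy sketch of the idea over a plain bitmask (for_each_set_cpu and buffer_mask are illustrative, not kernel API):

#include <stdio.h>

/* Walk only the CPUs whose bit is set in the mask, the way
 * for_each_tracing_cpu() restricts for_each_cpu_mask(). */
#define for_each_set_cpu(cpu, mask)            \
    for ((cpu) = 0; (cpu) < 32; (cpu)++)       \
        if ((mask) & (1u << (cpu)))

int main(void)
{
    unsigned int buffer_mask = 0x0000000f;  /* CPUs 0-3 have buffers */
    int cpu;

    for_each_set_cpu(cpu, buffer_mask)
        printf("allocating trace page for cpu %d\n", cpu);
    return 0;
}
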
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 290e9da..5da391c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -35,6 +35,12 @@ unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; +static unsigned long __read_mostly tracing_nr_buffers; +static cpumask_t __read_mostly tracing_buffer_mask; + +#define for_each_tracing_cpu(cpu) \ + for_each_cpu_mask(cpu, tracing_buffer_mask) + /* dummy trace to disable tracing */ static struct tracer no_tracer __read_mostly = { .name = "none", @@ -328,7 +334,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); /* clear out all the previous traces */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = tr->data[i]; flip_trace(max_tr.data[i], data); tracing_reset(data); @@ -352,7 +358,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); - for_each_possible_cpu(i) + for_each_tracing_cpu(i) tracing_reset(max_tr.data[i]); flip_trace(max_tr.data[cpu], data); @@ -398,7 +404,7 @@ int register_tracer(struct tracer *type) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = tr->data[i]; if (!head_page(data)) continue; @@ -417,7 +423,7 @@ int register_tracer(struct tracer *type) goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = tr->data[i]; if (!head_page(data)) continue; @@ -847,7 +853,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) int next_cpu = -1; int cpu; - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { if (!head_page(tr->data[cpu])) continue; ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); @@ -972,7 +978,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->prev_ent = NULL; iter->prev_cpu = -1; - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { iter->next_idx[i] = 0; iter->next_page[i] = NULL; } @@ -1089,7 +1095,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) if (type) name = type->name; - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { if (head_page(tr->data[cpu])) { total += tr->data[cpu]->trace_idx; if (tr->data[cpu]->trace_idx > tr->entries) @@ -1519,7 +1525,7 @@ static int trace_empty(struct trace_iterator *iter) struct trace_array_cpu *data; int cpu; - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { data = iter->tr->data[cpu]; if (head_page(data) && data->trace_idx && @@ -1831,7 +1837,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, raw_local_irq_disable(); __raw_spin_lock(&ftrace_max_lock); - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: @@ -2308,7 +2314,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, ftrace_enabled = 0; #endif smp_wmb(); - for_each_possible_cpu(cpu) { + for_each_tracing_cpu(cpu) { data = iter->tr->data[cpu]; if (!head_page(data) || !data->trace_idx) @@ -2605,7 +2611,7 @@ static int trace_alloc_page(void) int i; /* first allocate a page for each CPU */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { array = (void 
*)__get_free_page(GFP_KERNEL); if (array == NULL) { printk(KERN_ERR "tracer: failed to allocate page" @@ -2630,7 +2636,7 @@ static int trace_alloc_page(void) } /* Now that we successfully allocate a page per CPU, add them */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = global_trace.data[i]; page = list_entry(pages.next, struct page, lru); list_del_init(&page->lru); @@ -2666,7 +2672,7 @@ static int trace_free_page(void) int ret = 0; /* free one page from each buffer */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = global_trace.data[i]; p = data->trace_pages.next; if (p == &data->trace_pages) { @@ -2717,8 +2723,12 @@ __init static int tracer_alloc_buffers(void) global_trace.ctrl = tracer_enabled; + /* TODO: make the number of buffers hot pluggable with CPUS */ + tracing_nr_buffers = num_possible_cpus(); + tracing_buffer_mask = cpu_possible_map; + /* Allocate the first page for all buffers */ - for_each_possible_cpu(i) { + for_each_tracing_cpu(i) { data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); max_tr.data[i] = &per_cpu(max_data, i); -- cgit v0.10.2 From 4fcdae83cebda24b519a89d3dd976081fff1ca80 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: comment code This is first installment of adding documentation to the ftrace. Expect many more patches of this kind in the near future. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5da391c..a102b11 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -64,26 +64,79 @@ cycle_t ftrace_now(int cpu) return cpu_clock(cpu); } +/* + * The global_trace is the descriptor that holds the tracing + * buffers for the live tracing. For each CPU, it contains + * a link list of pages that will store trace entries. The + * page descriptor of the pages in the memory is used to hold + * the link list by linking the lru item in the page descriptor + * to each of the pages in the buffer per CPU. + * + * For each active CPU there is a data field that holds the + * pages for the buffer for that CPU. Each CPU has the same number + * of pages allocated for its buffer. + */ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +/* + * The max_tr is used to snapshot the global_trace when a maximum + * latency is reached. Some tracers will use this to store a maximum + * trace while it continues examining live traces. + * + * The buffers for the max_tr are set up the same as the global_trace. + * When a snapshot is taken, the link list of the max_tr is swapped + * with the link list of the global_trace and the buffers are reset for + * the global_trace so the tracing can continue. + */ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_data); +/* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; + +/* + * trace_nr_entries is the number of entries that is allocated + * for a buffer. Note, the number of entries is always rounded + * to ENTRIES_PER_PAGE. + */ static unsigned long trace_nr_entries = 65536UL; +/* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; + +/* current_trace points to the tracer that is currently active */ static struct tracer *current_trace __read_mostly; + +/* + * max_tracer_type_len is used to simplify the allocating of + * buffers to read userspace tracer names. 
We keep track of + * the longest tracer name registered. + */ static int max_tracer_type_len; +/* + * trace_types_lock is used to protect the trace_types list. + * This lock is also used to keep user access serialized. + * Accesses from userspace will grab this lock while userspace + * activities happen inside the kernel. + */ static DEFINE_MUTEX(trace_types_lock); + +/* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); +/* trace_flags holds iter_ctrl options */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +/** + * trace_wake_up - wake up tasks waiting for trace input + * + * Simply wakes up any task that is blocked on the trace_wait + * queue. These is used with trace_poll for tasks polling the trace. + */ void trace_wake_up(void) { /* @@ -117,6 +170,14 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + * IRQS_OFF - interrupts were disabled + * NEED_RESCED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, TRACE_FLAG_NEED_RESCHED = 0x02, @@ -124,10 +185,14 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x08, }; +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. + */ #define TRACE_ITER_SYM_MASK \ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) -/* These must match the bit postions above */ +/* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { "print-parent", "sym-offset", @@ -142,6 +207,15 @@ static const char *trace_options[] = { NULL }; +/* + * ftrace_max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a raw_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + */ static raw_spinlock_t ftrace_max_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -172,6 +246,13 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } +/** + * check_pages - integrity check of trace buffers + * + * As a safty measure we check to make sure the data pages have not + * been corrupted. TODO: configure to disable this because it adds + * a bit of overhead. + */ void check_pages(struct trace_array_cpu *data) { struct page *page, *tmp; @@ -185,6 +266,13 @@ void check_pages(struct trace_array_cpu *data) } } +/** + * head_page - page address of the first page in per_cpu buffer. + * + * head_page returns the page address of the first page in + * a per_cpu buffer. This also preforms various consistency + * checks to make sure the buffer has not been corrupted. + */ void *head_page(struct trace_array_cpu *data) { struct page *page; @@ -199,6 +287,17 @@ void *head_page(struct trace_array_cpu *data) return page_address(page); } +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). 
Then the output may be either used by + * the sequencer or pulled into another buffer. + */ int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { @@ -222,6 +321,16 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ static int trace_seq_puts(struct trace_seq *s, const char *str) { @@ -304,6 +413,13 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s) trace_seq_reset(s); } +/* + * flip the trace buffers between two trace descriptors. + * This usually is the buffers between the global_trace and + * the max_tr to record a snapshot of a current trace. + * + * The ftrace_max_lock must be held. + */ static void flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) { @@ -325,6 +441,15 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) check_pages(tr2); } +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { @@ -349,6 +474,8 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) * @tr - tracer * @tsk - task with the latency * @cpu - the cpu of the buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) @@ -368,6 +495,12 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) __raw_spin_unlock(&ftrace_max_lock); } +/** + * register_tracer - register a tracer with the ftrace system. + * @type - the plugin for the tracer + * + * Register a new plugin tracer. + */ int register_tracer(struct tracer *type) { struct tracer *t; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b0ca747..21c29ee 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -294,6 +294,13 @@ extern long ns2usecs(cycle_t nsec); extern unsigned long trace_flags; +/* + * trace_iterator_flags is an enumeration that defines bit + * positions into trace_flags that controls the output. + * + * NOTE: These bits must match the trace_options array in + * trace.c. + */ enum trace_iterator_flags { TRACE_ITER_PRINT_PARENT = 0x01, TRACE_ITER_SYM_OFFSET = 0x02, -- cgit v0.10.2 From 25b0b44a1c732ccfc58095cdd8438955a0a19fff Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:00 +0200 Subject: ftrace: fix comm on function trace output In cleaning up of the sched_switch code, the function trace recording of task comms was removed. This patch adds back the recording of comms for function trace. The output of ftrace now has the task comm instead of <...>. 
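For context, the comm shows up because ftrace keeps a small pid-to-comm cache (the saved_cmdlines table earlier in trace.c) that is filled by tracing_record_cmdline() at trace time and consulted when entries are printed; a pid that was never recorded renders as <...>. A simplified user-space sketch of such a cache, where record_cmdline() and lookup_comm() are illustrative names and the real table handles slot reuse more carefully:

#include <stdio.h>
#include <string.h>

#define SAVED_CMDLINES 128
#define COMM_LEN       16

static int  map_pid_to_idx[32768];   /* 0 means "unknown" */
static char saved_comms[SAVED_CMDLINES][COMM_LEN];
static int  next_idx;

/* Remember the task name for a pid; old slots are recycled. */
static void record_cmdline(int pid, const char *comm)
{
    int idx = next_idx++ % SAVED_CMDLINES;

    map_pid_to_idx[pid] = idx + 1;
    snprintf(saved_comms[idx], COMM_LEN, "%s", comm);
}

static const char *lookup_comm(int pid)
{
    int idx = map_pid_to_idx[pid];

    return idx ? saved_comms[idx - 1] : "<...>";
}

int main(void)
{
    record_cmdline(1234, "bash");
    printf("%d -> %s\n", 1234, lookup_comm(1234));
    printf("%d -> %s\n", 42,   lookup_comm(42));
    return 0;
}
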
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a102b11..1281969 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -620,7 +620,12 @@ static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; static DEFINE_SPINLOCK(trace_cmdline_lock); -atomic_t trace_record_cmdline_disabled; + +/* trace in all context switches */ +atomic_t trace_record_cmdline_enabled __read_mostly; + +/* temporary disable recording */ +atomic_t trace_record_cmdline_disabled __read_mostly; static void trace_init_cmdlines(void) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 21c29ee..8991c5e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -216,6 +216,8 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_max_latency; extern unsigned long tracing_thresh; +extern atomic_t trace_record_cmdline_enabled; + void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 4165d34..0a08465 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -29,12 +29,14 @@ static void function_reset(struct trace_array *tr) static void start_function_trace(struct trace_array *tr) { function_reset(tr); + atomic_inc(&trace_record_cmdline_enabled); tracing_start_function_trace(); } static void stop_function_trace(struct trace_array *tr) { tracing_stop_function_trace(); + atomic_dec(&trace_record_cmdline_enabled); } static void function_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5671db0..a337647 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,8 +29,6 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) if (!tracer_enabled) return; - tracing_record_cmdline(prev); - local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -73,6 +71,9 @@ void ftrace_ctx_switch(void *__rq, struct task_struct *prev, struct task_struct *next) { + if (unlikely(atomic_read(&trace_record_cmdline_enabled))) + tracing_record_cmdline(prev); + /* * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. @@ -134,11 +135,13 @@ static void sched_switch_reset(struct trace_array *tr) static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); + atomic_inc(&trace_record_cmdline_enabled); tracer_enabled = 1; } static void stop_sched_trace(struct trace_array *tr) { + atomic_dec(&trace_record_cmdline_enabled); tracer_enabled = 0; } -- cgit v0.10.2 From 72b59d67f80983f7bb587b086fb4cb1bc95263a4 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:01 +0200 Subject: x86_64: fix kernel rodata NX setting Without CONFIG_DYNAMIC_FTRACE, mark_rodata_ro() would mark a wrong number of pages as no-execute. The bug was introduced in the patch "ftrace: dont write protect kernel text". The symptom was machine reboot after a CPU hotplug. 
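The arithmetic behind the one-line fix, with made-up addresses: set_memory_nx() begins at rodata_start, so the page count must be measured from there too; measuring from the text start makes the NX range overshoot 'end' by (rodata_start - start) bytes worth of pages, which lands on executable text:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
    /* Hypothetical layout: text at 2 MB, rodata at 6 MB, end at 8 MB. */
    unsigned long start        = 0x200000;  /* kernel text */
    unsigned long rodata_start = 0x600000;
    unsigned long end          = 0x800000;

    unsigned long wrong = (end - start) >> PAGE_SHIFT;        /* 1536 pages */
    unsigned long right = (end - rodata_start) >> PAGE_SHIFT; /*  512 pages */

    /* Starting at rodata_start, the wrong count reaches 4 MB past 'end',
     * flipping NX on pages that still hold executable code. */
    printf("wrong: %lu pages, right: %lu pages\n", wrong, right);
    return 0;
}
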
Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 41824e7..295be1d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -782,7 +782,7 @@ void mark_rodata_ro(void) * The rodata section (but not the kernel text!) should also be * not-executable. */ - set_memory_nx(rodata_start, (end - start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); rodata_test(); -- cgit v0.10.2 From 53d0aa773053ab18287781e25d52c5faf9e0e09e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:01 +0200 Subject: ftrace: add logic to record overruns This patch sets up the infrastructure to record overruns of the tracing buffer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1281969..b9126ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -609,6 +609,7 @@ void unregister_tracer(struct tracer *type) void tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; + data->overrun = 0; data->trace_head = data->trace_tail = head_page(data); data->trace_head_idx = 0; data->trace_tail_idx = 0; @@ -750,6 +751,7 @@ tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) if (data->trace_head == data->trace_tail && idx_next == data->trace_tail_idx) { /* overrun */ + data->overrun++; data->trace_tail_idx++; if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { data->trace_tail = @@ -2353,8 +2355,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, { struct trace_iterator *iter = filp->private_data; struct trace_array_cpu *data; - struct trace_array *tr = iter->tr; - struct tracer *tracer = iter->trace; static cpumask_t mask; static int start; unsigned long flags; @@ -2433,10 +2433,11 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (cnt >= PAGE_SIZE) cnt = PAGE_SIZE - 1; - memset(iter, 0, sizeof(*iter)); - iter->tr = tr; - iter->trace = tracer; + /* reset all but tr, trace, and overruns */ iter->pos = -1; + memset(&iter->seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); /* * We need to stop all tracing on all CPUS to read the @@ -2465,6 +2466,11 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, for_each_cpu_mask(cpu, mask) { data = iter->tr->data[cpu]; __raw_spin_lock(&data->lock); + + if (data->overrun > iter->last_overrun[cpu]) + iter->overrun[cpu] += + data->overrun - iter->last_overrun[cpu]; + iter->last_overrun[cpu] = data->overrun; } while (find_next_entry_inc(iter) != NULL) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8991c5e..c1ec134 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -97,6 +97,7 @@ struct trace_array_cpu { void *trace_head; /* producer */ void *trace_tail; /* consumer */ unsigned long trace_idx; + unsigned long overrun; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -157,10 +158,13 @@ struct trace_seq { * results to users and which routines might sleep, etc: */ struct trace_iterator { - struct trace_seq seq; struct trace_array *tr; struct tracer *trace; + long last_overrun[NR_CPUS]; + long overrun[NR_CPUS]; + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; struct trace_entry *ent; int cpu; -- cgit v0.10.2 From 107bad8bef5ab2c3a3bff7648c18c9dc3abdc13b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 
21:21:01 +0200 Subject: ftrace: add trace pipe header pluggin This patch adds a method for open_pipe and open_read to the pluggins so that they can add a header to the trace pipe call. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b9126ef..32f9106 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2307,11 +2307,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (!iter) return -ENOMEM; + mutex_lock(&trace_types_lock); iter->tr = &global_trace; iter->trace = current_trace; - filp->private_data = iter; + if (iter->trace->pipe_open) + iter->trace->pipe_open(iter); + mutex_unlock(&trace_types_lock); + return 0; } @@ -2380,13 +2384,24 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, return cnt; } + mutex_lock(&trace_types_lock); + if (iter->trace->read) { + ret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (ret) { + read = ret; + goto out; + } + } + trace_seq_reset(&iter->seq); start = 0; while (trace_empty(iter)) { - if ((filp->f_flags & O_NONBLOCK)) - return -EAGAIN; + if ((filp->f_flags & O_NONBLOCK)) { + read = -EAGAIN; + goto out; + } /* * This is a make-shift waitqueue. The reason we don't use @@ -2400,16 +2415,22 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, set_current_state(TASK_INTERRUPTIBLE); iter->tr->waiter = current; + mutex_unlock(&trace_types_lock); + /* sleep for one second, and try again. */ schedule_timeout(HZ); + mutex_lock(&trace_types_lock); + iter->tr->waiter = NULL; - if (signal_pending(current)) - return -EINTR; + if (signal_pending(current)) { + read = -EINTR; + goto out; + } if (iter->trace != current_trace) - return 0; + goto out; /* * We block until we read something and tracing is disabled. @@ -2428,7 +2449,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* stop when tracing is finished */ if (trace_empty(iter)) - return 0; + goto out; if (cnt >= PAGE_SIZE) cnt = PAGE_SIZE - 1; @@ -2518,6 +2539,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (ret) read = -EFAULT; +out: + mutex_unlock(&trace_types_lock); + return read; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c1ec134..ee53d70 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -135,9 +135,13 @@ struct tracer { void (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); + void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); void (*start)(struct trace_iterator *iter); void (*stop)(struct trace_iterator *iter); + ssize_t (*read)(struct trace_iterator *iter, + struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); void (*ctrl_update)(struct trace_array *tr); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, @@ -160,6 +164,7 @@ struct trace_seq { struct trace_iterator { struct trace_array *tr; struct tracer *trace; + void *private; long last_overrun[NR_CPUS]; long overrun[NR_CPUS]; -- cgit v0.10.2 From 2f1dafe50cc4e58a239fd81bd47f87f32042a1ee Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:01 +0200 Subject: x86: fix SMP alternatives: use mutex instead of spinlock, text_poke is sleepable text_poke is sleepable. The original fix by Mathieu Desnoyers . 
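The underlying rule: a critical section that can sleep (text_poke() may block) must be guarded by a sleeping lock; under a spinlock the sleep would stall other CPUs or, with preemption disabled, deadlock. A user-space analogy using pthreads, where do_slow_update() is an illustrative stand-in for the alternatives_smp_* paths (build with -lpthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* A mutex lets waiters sleep while the holder blocks; a spinlock
 * would burn a CPU for the whole second -- the same reasoning that
 * turned smp_alt from a spinlock into a mutex. */
static pthread_mutex_t smp_alt = PTHREAD_MUTEX_INITIALIZER;

static void do_slow_update(const char *name)
{
    pthread_mutex_lock(&smp_alt);
    printf("%s: patching...\n", name);
    sleep(1);           /* stands in for the sleepable text_poke() */
    pthread_mutex_unlock(&smp_alt);
}

static void *worker(void *arg)
{
    do_slow_update(arg);
    return NULL;
}

int main(void)
{
    pthread_t t1, t2;

    pthread_create(&t1, NULL, worker, "module add");
    pthread_create(&t2, NULL, worker, "module del");
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return 0;
}
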
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de240ba..2763cb3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -279,7 +279,7 @@ struct smp_alt_module { struct list_head next; }; static LIST_HEAD(smp_alt_modules); -static DEFINE_SPINLOCK(smp_alt); +static DEFINE_MUTEX(smp_alt); static int smp_mode = 1; /* protected by smp_alt */ void alternatives_smp_module_add(struct module *mod, char *name, @@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name, __func__, smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_add_tail(&smp->next, &smp_alt_modules); if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(smp->locks, smp->locks_end, smp->text, smp->text_end); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_module_del(struct module *mod) @@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod) if (smp_alt_once || noreplace_smp) return; - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); DPRINTK("%s: %s\n", __func__, item->name); kfree(item); return; } - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_switch(int smp) @@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp) return; BUG_ON(!smp && (num_online_cpus() > 1)); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); /* * Avoid unnecessary switches because it forces JIT based VMs to @@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp) mod->text, mod->text_end); } smp_mode = smp; - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } #endif -- cgit v0.10.2 From 4823ed7eadf35e4b57ce581327e21d39585f1f32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:01 +0200 Subject: ftrace: fix setting of pos in read_pipe In resetting the iterator in read_pipe, the reset of pos was postitioned in the wrong location with respect to the memset operation. The current code sets pos, incorrectly, to zero. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 32f9106..49e1663 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2455,10 +2455,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ - iter->pos = -1; memset(&iter->seq, 0, sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); + iter->pos = -1; /* * We need to stop all tracing on all CPUS to read the -- cgit v0.10.2 From 9fe068e92f6290e89e19adc521441661a1229f00 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:02 +0200 Subject: ftrace: trace faster Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 49e1663..ca0d6ff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2417,8 +2417,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, mutex_unlock(&trace_types_lock); - /* sleep for one second, and try again. */ - schedule_timeout(HZ); + /* sleep for 100 msecs, and try again. 
*/ + schedule_timeout(HZ/10); mutex_lock(&trace_types_lock); -- cgit v0.10.2 From a4feb8348b62fe76a63cdb5569f5c920f5283c06 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:02 +0200 Subject: ftrace: special stacktrace Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ca0d6ff..c232d82 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -808,29 +808,6 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array_cpu *data = __data; - struct trace_array *tr = __tr; - struct trace_entry *entry; - unsigned long irq_flags; - - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_SPECIAL; - entry->special.arg1 = arg1; - entry->special.arg2 = arg2; - entry->special.arg3 = arg3; - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} - void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, @@ -857,6 +834,30 @@ void __trace_stack(struct trace_array *tr, } void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; + struct trace_entry *entry; + unsigned long irq_flags; + + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_SPECIAL; + entry->special.arg1 = arg1; + entry->special.arg2 = arg2; + entry->special.arg3 = arg3; + __trace_stack(tr, data, irq_flags, 4); + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); + + trace_wake_up(); +} + +void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, -- cgit v0.10.2 From 2bb6f8d6389cbfadd657e7dc069f6986abf35e4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:02 +0200 Subject: ftrace: use raw_smp_processor_id for mcount functions Due to debug hooks in the kernel that can change the way smp_processor_id works, use raw_smp_processor_id in mcount called functions (namely ftrace_record_ip). Currently we annotate most debug functions from calling mcount, but we should not rely on that to prevent kernel lockups. This patch uses the raw_smp_processor_id to prevent a recusive crash that can happen if a debug hook in smp_processor_id calls mcount. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 40f64f7..af5ad89 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -267,6 +267,7 @@ ftrace_record_ip(unsigned long ip) unsigned long key; int resched; int atomic; + int cpu; if (!ftrace_enabled || ftrace_disabled) return; @@ -274,9 +275,15 @@ ftrace_record_ip(unsigned long ip) resched = need_resched(); preempt_disable_notrace(); - /* We simply need to protect against recursion */ - __get_cpu_var(ftrace_shutdown_disable_cpu)++; - if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1) + /* + * We simply need to protect against recursion. 
+ * Use the the raw version of smp_processor_id and not + * __get_cpu_var which can call debug hooks that can + * cause a recursive crash here. + */ + cpu = raw_smp_processor_id(); + per_cpu(ftrace_shutdown_disable_cpu, cpu)++; + if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1) goto out; if (unlikely(ftrace_record_suspend)) @@ -317,7 +324,7 @@ ftrace_record_ip(unsigned long ip) out_unlock: spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); out: - __get_cpu_var(ftrace_shutdown_disable_cpu)--; + per_cpu(ftrace_shutdown_disable_cpu, cpu)--; /* prevent recursion with scheduler */ if (resched) -- cgit v0.10.2 From 6c6c27969a4c6024e6c8838829546c02aaddca18 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: ftrace: add readpos to struct trace_seq; add trace_seq_to_user() Refactor code from tracing_read_pipe() and create trace_seq_to_user(). Moved trace_seq_reset() call before iter->trace->read() call so that when all leftover data is returned, trace_seq is reset automatically. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c232d82..82ced40 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -400,6 +400,26 @@ static void trace_seq_reset(struct trace_seq *s) { s->len = 0; + s->readpos = 0; +} + +ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) +{ + int len; + int ret; + + if (s->len <= s->readpos) + return -EBUSY; + + len = s->len - s->readpos; + if (cnt > len) + cnt = len; + ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); + if (ret) + return -EFAULT; + + s->readpos += len; + return cnt; } static void @@ -2361,46 +2381,32 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; struct trace_array_cpu *data; static cpumask_t mask; - static int start; unsigned long flags; #ifdef CONFIG_FTRACE int ftrace_save; #endif - int read = 0; int cpu; - int len; - int ret; + ssize_t sret; /* return any leftover data */ - if (iter->seq.len > start) { - len = iter->seq.len - start; - if (cnt > len) - cnt = len; - ret = copy_to_user(ubuf, iter->seq.buffer + start, cnt); - if (ret) - cnt = -EFAULT; - - start += len; + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + return sret; + sret = 0; - return cnt; - } + trace_seq_reset(&iter->seq); mutex_lock(&trace_types_lock); if (iter->trace->read) { - ret = iter->trace->read(iter, filp, ubuf, cnt, ppos); - if (ret) { - read = ret; + sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (sret) goto out; - } } - trace_seq_reset(&iter->seq); - start = 0; - while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { - read = -EAGAIN; + sret = -EAGAIN; goto out; } @@ -2426,7 +2432,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, iter->tr->waiter = NULL; if (signal_pending(current)) { - read = -EINTR; + sret = -EINTR; goto out; } @@ -2496,6 +2502,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } while (find_next_entry_inc(iter) != NULL) { + int ret; int len = iter->seq.len; ret = print_trace_line(iter); @@ -2526,24 +2533,16 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, local_irq_restore(flags); /* Now copy what we have to the user */ - read = iter->seq.len; - if (read > cnt) - read = cnt; - - ret = copy_to_user(ubuf, iter->seq.buffer, read); - - if (read < iter->seq.len) - start = read; - else + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if 
(iter->seq.readpos >= iter->seq.len) trace_seq_reset(&iter->seq); - - if (ret) - read = -EFAULT; + if (sret == -EBUSY) + sret = 0; out: mutex_unlock(&trace_types_lock); - return read; + return sret; } static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ee53d70..8845033 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -155,6 +155,7 @@ struct tracer { struct trace_seq { unsigned char buffer[PAGE_SIZE]; unsigned int len; + unsigned int readpos; }; /* @@ -301,6 +302,8 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); extern long ns2usecs(cycle_t nsec); extern unsigned long trace_flags; -- cgit v0.10.2 From 3eefae994d9224fb7771a3ddb683868363c23510 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:04 +0200 Subject: ftrace: limit trace entries Currently there is no protection from the root user to use up all of memory for trace buffers. If the root user allocates too many entries, the OOM killer might start kill off all tasks. This patch adds an algorith to check the following condition: pages_requested > (freeable_memory + current_trace_buffer_pages) / 4 If the above is met then the allocation fails. The above prevents more than 1/4th of freeable memory from being used by trace buffers. To determine the freeable_memory, I made determine_dirtyable_memory in mm/page-writeback.c global. Special thanks goes to Peter Zijlstra for suggesting the above calculation. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f462439..bd91987 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -105,6 +105,8 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; +extern unsigned long determine_dirtyable_memory(void); + extern int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82ced40..2824cf48 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -51,6 +52,8 @@ static int trace_free_page(void); static int tracing_disabled = 1; +static unsigned long tracing_pages_allocated; + long ns2usecs(cycle_t nsec) { @@ -2591,12 +2594,41 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } if (val > global_trace.entries) { + long pages_requested; + unsigned long freeable_pages; + + /* make sure we have enough memory before mapping */ + pages_requested = + (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; + + /* account for each buffer (and max_tr) */ + pages_requested *= tracing_nr_buffers * 2; + + /* Check for overflow */ + if (pages_requested < 0) { + cnt = -ENOMEM; + goto out; + } + + freeable_pages = determine_dirtyable_memory(); + + /* we only allow to request 1/4 of useable memory */ + if (pages_requested > + ((freeable_pages + tracing_pages_allocated) / 4)) { + cnt = -ENOMEM; + goto out; + } + while (global_trace.entries < val) { if (trace_alloc_page()) { cnt = -ENOMEM; goto out; } + /* double check that we don't go over the known pages */ + if (tracing_pages_allocated > pages_requested) + break; } + } else { /* include the number of 
entries in val (inc of page entries) */ while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) @@ -2776,6 +2808,7 @@ static int trace_alloc_page(void) struct page *page, *tmp; LIST_HEAD(pages); void *array; + unsigned pages_allocated = 0; int i; /* first allocate a page for each CPU */ @@ -2787,6 +2820,7 @@ static int trace_alloc_page(void) goto free_pages; } + pages_allocated++; page = virt_to_page(array); list_add(&page->lru, &pages); @@ -2798,6 +2832,7 @@ static int trace_alloc_page(void) "for trace buffer!\n"); goto free_pages; } + pages_allocated++; page = virt_to_page(array); list_add(&page->lru, &pages); #endif @@ -2819,6 +2854,7 @@ static int trace_alloc_page(void) SetPageLRU(page); #endif } + tracing_pages_allocated += pages_allocated; global_trace.entries += ENTRIES_PER_PAGE; return 0; @@ -2853,6 +2889,8 @@ static int trace_free_page(void) page = list_entry(p, struct page, lru); ClearPageLRU(page); list_del(&page->lru); + tracing_pages_allocated--; + tracing_pages_allocated--; __free_page(page); tracing_reset(data); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 789b6ad..b38f700 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages); static struct prop_descriptor vm_completions; static struct prop_descriptor vm_dirties; -static unsigned long determine_dirtyable_memory(void); - /* * couple the period to the dirty_ratio: * @@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) #endif } -static unsigned long determine_dirtyable_memory(void) +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the numebr of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +unsigned long determine_dirtyable_memory(void) { unsigned long x; -- cgit v0.10.2 From dc102a8fae2d0d6bf5223fc549247f2e23959ae6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:09 +0200 Subject: Markers - remove extra format argument Denys Vlasenko : > Not in this patch, but I noticed: > > #define __trace_mark(name, call_private, format, args...) \ > do { \ > static const char __mstrtab_##name[] \ > __attribute__((section("__markers_strings"))) \ > = #name "\0" format; \ > static struct marker __mark_##name \ > __attribute__((section("__markers"), aligned(8))) = \ > { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ > 0, 0, marker_probe_cb, \ > { __mark_empty_function, NULL}, NULL }; \ > __mark_check_format(format, ## args); \ > if (unlikely(__mark_##name.state)) { \ > (*__mark_##name.call) \ > (&__mark_##name, call_private, \ > format, ## args); \ > } \ > } while (0) > > In this call: > > (*__mark_##name.call) \ > (&__mark_##name, call_private, \ > format, ## args); \ > > you make gcc allocate duplicate format string. You can use > &__mstrtab_##name[sizeof(#name)] instead since it holds the same string, > or drop ", format," above and "const char *fmt" from here: > > void (*call)(const struct marker *mdata, /* Probe wrapper */ > void *call_private, const char *fmt, ...); > > since mdata->format is the same and all callees which need it can take it there. Very good point. I actually thought about dropping it, since it would remove an unnecessary argument from the stack. And actually, since I now have the marker_probe_cb sitting between the marker site and the callbacks, there is no API change required. 
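In short: the format string already lives in the marker descriptor, so passing it again as a named argument made gcc materialize the literal twice and wasted an argument slot. A minimal sketch of the resulting convention, where va_start() anchors on the last remaining named parameter and the callee reads the format from the descriptor (struct and function names are illustrative):

#include <stdarg.h>
#include <stdio.h>

struct marker_sketch {
    const char *format;     /* the single copy of the format string */
};

/* The wrapper no longer takes fmt: va_start() anchors on the last
 * named argument, and the format comes from the descriptor. */
static void probe_cb(const struct marker_sketch *mdata, void *call_private, ...)
{
    va_list args;

    va_start(args, call_private);
    vprintf(mdata->format, args);
    va_end(args);
}

int main(void)
{
    struct marker_sketch m = { .format = "pid %d state %ld\n" };

    probe_cb(&m, NULL, 1234, 0L);
    return 0;
}
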
Thanks :) Mathieu Signed-off-by: Mathieu Desnoyers CC: Denys Vlasenko Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/marker.h b/include/linux/marker.h index 430f6ad..338533a 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -44,8 +44,8 @@ struct marker { */ char state; /* Marker state. */ char ptype; /* probe type : 0 : single, 1 : multi */ - void (*call)(const struct marker *mdata, /* Probe wrapper */ - void *call_private, const char *fmt, ...); + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; } __attribute__((aligned(8))); @@ -72,8 +72,7 @@ struct marker { __mark_check_format(format, ## args); \ if (unlikely(__mark_##name.state)) { \ (*__mark_##name.call) \ - (&__mark_##name, call_private, \ - format, ## args); \ + (&__mark_##name, call_private, ## args);\ } \ } while (0) @@ -117,9 +116,9 @@ static inline void __printf(1, 2) ___mark_check_format(const char *fmt, ...) extern marker_probe_func __mark_empty_function; extern void marker_probe_cb(const struct marker *mdata, - void *call_private, const char *fmt, ...); + void *call_private, ...); extern void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, const char *fmt, ...); + void *call_private, ...); /* * Connect a probe to a marker. diff --git a/kernel/marker.c b/kernel/marker.c index b5a9fe1..1abfb92 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex); struct marker_entry { struct hlist_node hlist; char *format; - void (*call)(const struct marker *mdata, /* Probe wrapper */ - void *call_private, const char *fmt, ...); + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ @@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function); * marker_probe_cb Callback that prepares the variable argument list for probes. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Since we do not use "typical" pointer based RCU in the 1 argument case, we * need to put a full smp_rmb() in this branch. This is why we do not use * rcu_dereference() for the pointer read. */ -void marker_probe_cb(const struct marker *mdata, void *call_private, - const char *fmt, ...) +void marker_probe_cb(const struct marker *mdata, void *call_private, ...) { va_list args; char ptype; @@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. 
*/ smp_rmb(); - va_start(args, fmt); - func(mdata->single.probe_private, call_private, fmt, &args); + va_start(args, call_private); + func(mdata->single.probe_private, call_private, mdata->format, + &args); va_end(args); } else { struct marker_probe_closure *multi; @@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) { - va_start(args, fmt); - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + va_start(args, call_private); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); va_end(args); } } @@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * marker_probe_cb Callback that does not prepare the variable argument list. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, const char *fmt, ...) +void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) { va_list args; /* not initialized */ char ptype; @@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); - func(mdata->single.probe_private, call_private, fmt, &args); + func(mdata->single.probe_private, call_private, mdata->format, + &args); } else { struct marker_probe_closure *multi; int i; @@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); } preempt_enable(); } -- cgit v0.10.2 From 0aa977f592f17004f9d1d545f2e1bb9ea71896c3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:10 +0200 Subject: Markers - define non optimized marker To support the forthcoming "immediate values" marker optimization, we must have a way to declare markers in few code paths that does not use instruction modification based enable. This will be the case of printk(), some traps and eventually lockdep instrumentation. Changelog : - Fix reversed boolean logic of "generic". Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/marker.h b/include/linux/marker.h index 338533a..1290653 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -58,8 +58,12 @@ struct marker { * Make sure the alignment of the structure in the __markers section will * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. + * + * The "generic" argument controls which marker enabling mechanism must be used. + * If generic is true, a variable read is used. + * If generic is false, immediate values are used. */ -#define __trace_mark(name, call_private, format, args...) \ +#define __trace_mark(generic, name, call_private, format, args...) 
\ do { \ static const char __mstrtab_##name[] \ __attribute__((section("__markers_strings"))) \ @@ -79,7 +83,7 @@ struct marker { extern void marker_update_probe_range(struct marker *begin, struct marker *end); #else /* !CONFIG_MARKERS */ -#define __trace_mark(name, call_private, format, args...) \ +#define __trace_mark(generic, name, call_private, format, args...) \ __mark_check_format(format, ## args) static inline void marker_update_probe_range(struct marker *begin, struct marker *end) @@ -87,15 +91,30 @@ static inline void marker_update_probe_range(struct marker *begin, #endif /* CONFIG_MARKERS */ /** - * trace_mark - Marker + * trace_mark - Marker using code patching * @name: marker name, not quoted. * @format: format string * @args...: variable argument list * - * Places a marker. + * Places a marker using optimized code patching technique (imv_read()) + * to be enabled when immediate values are present. */ #define trace_mark(name, format, args...) \ - __trace_mark(name, NULL, format, ## args) + __trace_mark(0, name, NULL, format, ## args) + +/** + * _trace_mark - Marker using variable read + * @name: marker name, not quoted. + * @format: format string + * @args...: variable argument list + * + * Places a marker using a standard memory read (_imv_read()) to be + * enabled. Should be used for markers in code paths where instruction + * modification based enabling is not welcome. (__init and __exit functions, + * lockdep, some traps, printk). + */ +#define _trace_mark(name, format, args...) \ + __trace_mark(1, name, NULL, format, ## args) /** * MARK_NOARGS - Format string for a marker with no argument. -- cgit v0.10.2 From 5b82a1b08a00b2adca3d9dd9777efff40b7aaaa1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:10 +0200 Subject: Port ftrace to markers Porting ftrace to the marker infrastructure. Don't need to chain to the wakeup tracer from the sched tracer, because markers support multiple probes connected. 
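The probe callbacks introduced below recover typed arguments by walking the va_list in exactly the order the marker's format string declares them, discarding the leading scalars they do not need. A compact user-space sketch of that unpacking discipline; fire_marker() plays the role of the marker call site, and all names here are illustrative:

#include <stdarg.h>
#include <stdio.h>

struct task_sketch { int pid; const char *comm; };

/* Mirror sched_switch_callback(): consume the va_list in the order
 * of "prev_pid %d next_pid %d prev_state %ld ## rq %p prev %p next %p",
 * skipping the scalars to reach the pointers. */
static void switch_probe(void *probe_data, void *call_data,
                         const char *format, va_list *args)
{
    struct task_sketch *prev, *next;

    (void)probe_data; (void)call_data; (void)format;
    (void)va_arg(*args, int);       /* prev_pid */
    (void)va_arg(*args, int);       /* next_pid */
    (void)va_arg(*args, long);      /* prev_state */
    (void)va_arg(*args, void *);    /* rq */
    prev = va_arg(*args, struct task_sketch *);
    next = va_arg(*args, struct task_sketch *);
    printf("switch %s -> %s\n", prev->comm, next->comm);
}

static void fire_marker(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    switch_probe(NULL, NULL, format, &args);
    va_end(args);
}

int main(void)
{
    struct task_sketch a = { 1, "bash" }, b = { 2, "ftraced" };

    fire_marker("prev_pid %d next_pid %d prev_state %ld ## rq %p prev %p next %p",
                a.pid, b.pid, 0L, (void *)0, &a, &b);
    return 0;
}
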
Signed-off-by: Mathieu Desnoyers CC: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/sched.h b/include/linux/sched.h index 360ca99..c0b1c69 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2131,38 +2131,6 @@ __trace_special(void *__tr, void *__data, } #endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -extern void -ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next); -extern void -ftrace_wake_up_task(void *rq, struct task_struct *wakee, - struct task_struct *curr); -extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); -extern void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); -#else -static inline void -ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) -{ -} -static inline void -sched_trace_special(unsigned long p1, unsigned long p2, unsigned long p3) -{ -} -static inline void -ftrace_wake_up_task(void *rq, struct task_struct *wakee, - struct task_struct *curr) -{ -} -static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) -{ -} -static inline void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ -} -#endif - extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); diff --git a/kernel/sched.c b/kernel/sched.c index ad95cca..e2e985e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2500,7 +2500,9 @@ out_activate: success = 1; out_running: - ftrace_wake_up_task(rq, p, rq->curr); + trace_mark(kernel_sched_wakeup, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); p->state = TASK_RUNNING; @@ -2631,7 +2633,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_task(rq, p, rq->curr); + trace_mark(kernel_sched_wakeup_new, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2804,7 +2808,11 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - ftrace_ctx_switch(rq, prev, next); + trace_mark(kernel_sched_schedule, + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + prev->pid, next->pid, prev->state, + rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8845033..f5de060 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -234,25 +234,10 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_SCHED_TRACER -extern void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); -extern void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr); -#else -static inline void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) -{ -} -static inline void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) -{ -} -#endif - #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, + void *__rq, struct task_struct *prev, struct task_struct *next); @@ -262,9 +247,6 @@ struct tracer_switch_ops { struct tracer_switch_ops *next; }; -extern int register_tracer_switch(struct tracer_switch_ops *ops); -extern int 
unregister_tracer_switch(struct tracer_switch_ops *ops); - #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a337647..d25ffa5 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,11 +16,14 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; +static atomic_t sched_ref; static void -ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) +sched_switch_func(void *private, void *__rq, struct task_struct *prev, + struct task_struct *next) { - struct trace_array *tr = ctx_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; @@ -41,10 +44,40 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) local_irq_restore(flags); } +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + if (!atomic_read(&sched_ref)) + return; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + sched_switch_func(probe_data, __rq, prev, next); +} + static void -wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) +wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct + task_struct *curr) { - struct trace_array *tr = ctx_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; @@ -67,35 +100,29 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) local_irq_restore(flags); } -void -ftrace_ctx_switch(void *__rq, struct task_struct *prev, - struct task_struct *next) +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) { - if (unlikely(atomic_read(&trace_record_cmdline_enabled))) - tracing_record_cmdline(prev); + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. 
- */ - ctx_switch_func(__rq, prev, next); + if (likely(!tracer_enabled)) + return; - /* - * Chain to the wakeup tracer (this is a NOP if disabled): - */ - wakeup_sched_switch(prev, next); -} + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); -void -ftrace_wake_up_task(void *__rq, struct task_struct *wakee, - struct task_struct *curr) -{ - wakeup_func(__rq, wakee, curr); + tracing_record_cmdline(task); + tracing_record_cmdline(curr); - /* - * Chain to the wakeup tracer (this is a NOP if disabled): - */ - wakeup_sched_wakeup(wakee, curr); + wakeup_func(probe_data, __rq, task, curr); } void @@ -132,15 +159,95 @@ static void sched_switch_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } +static int tracing_sched_register(void) +{ + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return ret; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &ctx_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + return ret; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &ctx_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &ctx_trace); + return ret; +} + +static void tracing_sched_unregister(void) +{ + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &ctx_trace); +} + +void tracing_start_sched_switch(void) +{ + long ref; + + ref = atomic_inc_return(&sched_ref); + if (ref == 1) + tracing_sched_register(); +} + +void tracing_stop_sched_switch(void) +{ + long ref; + + ref = atomic_dec_and_test(&sched_ref); + if (ref) + tracing_sched_unregister(); +} + static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); atomic_inc(&trace_record_cmdline_enabled); tracer_enabled = 1; + tracing_start_sched_switch(); } static void stop_sched_trace(struct trace_array *tr) { + tracing_stop_sched_switch(); atomic_dec(&trace_record_cmdline_enabled); tracer_enabled = 0; } @@ -181,6 +288,14 @@ static struct tracer sched_switch_trace __read_mostly = __init static int init_sched_switch_trace(void) { + int ret = 0; + + if (atomic_read(&sched_ref)) + ret = tracing_sched_register(); + if (ret) { + pr_info("error registering scheduler trace\n"); + return ret; + } return register_tracer(&sched_switch_trace); } device_initcall(init_sched_switch_trace); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5948011..5d2fb48 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,6 +15,7 
@@ #include #include #include +#include #include "trace.h" @@ -44,11 +45,13 @@ static int report_latency(cycle_t delta) return 1; } -void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +static void notrace +wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, + struct task_struct *next) { unsigned long latency = 0, t0 = 0, t1 = 0; - struct trace_array *tr = wakeup_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -113,6 +116,31 @@ out: atomic_dec(&tr->data[cpu]->disabled); } +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + wakeup_sched_switch(probe_data, __rq, prev, next); +} + static void __wakeup_reset(struct trace_array *tr) { struct trace_array_cpu *data; @@ -188,19 +216,68 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) { + struct trace_array **ptr = probe_data; + struct trace_array *tr = *ptr; + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + if (likely(!tracer_enabled)) return; + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_record_cmdline(task); tracing_record_cmdline(curr); - tracing_record_cmdline(wakee); - wakeup_check_start(wakeup_trace, wakee, curr); + wakeup_check_start(tr, task, curr); } static void start_wakeup_tracer(struct trace_array *tr) { + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &wakeup_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + wakeup_reset(tr); /* @@ -215,11 +292,28 @@ static void start_wakeup_tracer(struct trace_array *tr) tracer_enabled = 1; return; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); } static void stop_wakeup_tracer(struct 
trace_array *tr) { tracer_enabled = 0; + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); } static void wakeup_tracer_init(struct trace_array *tr) -- cgit v0.10.2 From 74f4e369fc5b52433ad824cef32d3bf1304549be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:15 +0200 Subject: ftrace: stacktrace fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0d3714e..017ab44 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -120,4 +120,12 @@ static inline void tracer_disable(void) # define trace_preempt_off(a0, a1) do { } while (0) #endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +#else +static inline void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 5c2942e..1a064ad 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -31,6 +31,7 @@ #include #include #include +#include static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -53,6 +54,7 @@ void down(struct semaphore *sem) { unsigned long flags; + ftrace_special(sem->count, 0, __LINE__); spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2824cf48..3271916 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -901,7 +901,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; entry->ctx.next_state = next->state; - __trace_stack(tr, data, flags, 4); + __trace_stack(tr, data, flags, 5); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); } @@ -927,7 +927,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = wakee->prio; entry->ctx.next_state = wakee->state; - __trace_stack(tr, data, flags, 5); + __trace_stack(tr, data, flags, 6); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f5de060..c460e85 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -51,7 +51,7 @@ struct special_entry { * Stack-trace entry: */ -#define FTRACE_STACK_ENTRIES 5 +#define FTRACE_STACK_ENTRIES 8 struct stack_entry { unsigned long caller[FTRACE_STACK_ENTRIES]; -- cgit v0.10.2 From aa5e5ceaf52a882a29d9b86531a20733f5116066 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 13 May 2008 22:06:56 -0700 Subject: ftrace: remove packed attribute on ftrace_page. It causes unaligned access traps on platforms like sparc (ftrace_page may be marked packed, but once we return a dyn_ftrace sub-object from this array to another piece of code, the "packed" part of the typing information doesn't propagate). But also, it didn't serve any purpose either. Even if packed, on 64-bit or 32-bit, it didn't give us any more dyn_ftrace entries per-page. Signed-off-by: David S. 
Miller Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index af5ad89..07b2a14 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -177,9 +177,9 @@ static DEFINE_MUTEX(ftrace_filter_lock); struct ftrace_page { struct ftrace_page *next; - int index; + unsigned long index; struct dyn_ftrace records[]; -} __attribute__((packed)); +}; #define ENTRIES_PER_PAGE \ ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) -- cgit v0.10.2 From d05f5f9906740474eb768823004ffcd775b12ca6 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 13 May 2008 22:06:59 -0700 Subject: sparc64: add ftrace support. Signed-off-by: David S. Miller Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index eb36f3b..a480df6 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -11,6 +11,7 @@ config SPARC config SPARC64 bool default y + select HAVE_FTRACE select HAVE_IDE select HAVE_LMB select HAVE_ARCH_KGDB diff --git a/arch/sparc64/Kconfig.debug b/arch/sparc64/Kconfig.debug index 6a4d28a..d6d32d1 100644 --- a/arch/sparc64/Kconfig.debug +++ b/arch/sparc64/Kconfig.debug @@ -33,7 +33,7 @@ config DEBUG_PAGEALLOC config MCOUNT bool - depends on STACK_DEBUG + depends on STACK_DEBUG || FTRACE default y config FRAME_POINTER diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile index ec4f5eb..418b578 100644 --- a/arch/sparc64/kernel/Makefile +++ b/arch/sparc64/kernel/Makefile @@ -14,6 +14,7 @@ obj-y := process.o setup.o cpu.o idprom.o \ power.o sbus.o sparc64_ksyms.o chmc.o \ visemul.o prom.o of_device.o hvapi.o sstate.o mdesc.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_PCI) += ebus.o pci_common.o \ pci_psycho.o pci_sabre.o pci_schizo.o \ diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c new file mode 100644 index 0000000..f449e6d --- /dev/null +++ b/arch/sparc64/kernel/ftrace.c @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include + +static const u32 ftrace_nop = 0x01000000; + +notrace int ftrace_ip_converted(unsigned long ip) +{ + u32 insn = *(u32 *) ip; + + return (insn == ftrace_nop); +} + +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)&ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static u32 call; + s32 off; + + off = ((s32)addr - (s32)ip); + call = 0x40000000 | ((u32)off >> 2); + + return (unsigned char *) &call; +} + +notrace int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + u32 old = *(u32 *)old_code; + u32 new = *(u32 *)new_code; + u32 replaced; + int faulted; + + __asm__ __volatile__( + "1: cas [%[ip]], %[old], %[new]\n" + " flush %[ip]\n" + " mov 0, %[faulted]\n" + "2:\n" + " .section .fixup,#alloc,#execinstr\n" + " .align 4\n" + "3: sethi %%hi(2b), %[faulted]\n" + " jmpl %[faulted] + %%lo(2b), %%g0\n" + " mov 1, %[faulted]\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .align 4\n" + " .word 1b, 3b\n" + " .previous\n" + : "=r" (replaced), [faulted] "=r" (faulted) + : [new] "0" (new), [old] "r" (old), [ip] "r" (ip) + : "memory"); + + if (replaced != old && replaced != new) + faulted = 2; + + return faulted; +} + +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[4], *new; + + memcpy(old, &ftrace_call, 4); + new = 
ftrace_call_replace(ip, (unsigned long)func); + return ftrace_modify_code(ip, old, new); +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[4], *new; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. + */ + memcpy(old, &mcount_call, 4); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + + +int __init ftrace_dyn_arch_init(void *data) +{ + ftrace_mcount_set(data); + return 0; +} diff --git a/arch/sparc64/lib/mcount.S b/arch/sparc64/lib/mcount.S index 9e4534b..7735a7a 100644 --- a/arch/sparc64/lib/mcount.S +++ b/arch/sparc64/lib/mcount.S @@ -28,10 +28,13 @@ ovstack: .skip OVSTACKSIZE #endif .text - .align 32 - .globl mcount, _mcount -mcount: + .align 32 + .globl _mcount + .type _mcount,#function + .globl mcount + .type mcount,#function _mcount: +mcount: #ifdef CONFIG_STACK_DEBUG /* * Check whether %sp is dangerously low. @@ -55,6 +58,53 @@ _mcount: or %g3, %lo(panicstring), %o0 call prom_halt nop +1: +#endif +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + mov %o7, %o0 + .globl mcount_call +mcount_call: + call ftrace_stub + mov %o0, %o7 +#else + sethi %hi(ftrace_trace_function), %g1 + sethi %hi(ftrace_stub), %g2 + ldx [%g1 + %lo(ftrace_trace_function)], %g1 + or %g2, %lo(ftrace_stub), %g2 + cmp %g1, %g2 + be,pn %icc, 1f + mov %i7, %o1 + jmpl %g1, %g0 + mov %o7, %o0 + /* not reached */ +1: #endif -1: retl +#endif + retl nop + .size _mcount,.-_mcount + .size mcount,.-mcount + +#ifdef CONFIG_FTRACE + .globl ftrace_stub + .type ftrace_stub,#function +ftrace_stub: + retl + nop + .size ftrace_stub,.-ftrace_stub +#ifdef CONFIG_DYNAMIC_FTRACE + .globl ftrace_caller + .type ftrace_caller,#function +ftrace_caller: + mov %i7, %o1 + mov %o7, %o0 + .globl ftrace_call +ftrace_call: + call ftrace_stub + mov %o0, %o7 + retl + nop + .size ftrace_caller,.-ftrace_caller +#endif +#endif -- cgit v0.10.2 From 37135677e653537ffc6e7def679443272a1c03c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 May 2008 08:10:31 +0200 Subject: ftrace: fix mcount export bug David S. Miller noticed the following bug: the -pg instrumentation function callback is named differently on each platform. On x86 it is mcount, on sparc it is _mcount. So the export does not make sense in kernel/trace/ftrace.c - move it to x86. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index deb4378..29999db 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,7 +1,14 @@ +#include #include + #include -#include #include +#include + +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index f6c05d0..122885b 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,15 +1,22 @@ /* Exports for assembly files. All C exports should go in the respective C files. 
*/ +#include #include -#include #include +#include + #include -#include #include +#include #include +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif + EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(__get_user_1); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 07b2a14..a3e47f4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -50,9 +50,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -/* mcount is defined per arch in assembly */ -EXPORT_SYMBOL(mcount); - void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; -- cgit v0.10.2 From 2d8b820b2e81954754277723379ae9ed5de316fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Feb 2008 16:55:50 +0100 Subject: ftrace: cleanups factor out code and clean it up. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a3e47f4..89bd9a6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -18,13 +18,13 @@ #include #include #include -#include #include -#include +#include #include +#include #include -#include #include +#include #include #include "trace.h" -- cgit v0.10.2 From d49dbf33f0bf8748ee3662b973eb57e60525d622 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 May 2008 10:41:53 +0200 Subject: ftrace: fix include file dependency Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 017ab44..911d5d8 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -4,6 +4,7 @@ #ifdef CONFIG_FTRACE #include +#include extern int ftrace_enabled; extern int -- cgit v0.10.2 From 489f139614596cbc956a06f5e4bb41288e276fe3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 Feb 2008 13:38:05 +0100 Subject: ftrace: fix build bug Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 911d5d8..922e23d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,16 +106,16 @@ static inline void tracer_disable(void) #endif #ifdef CONFIG_IRQSOFF_TRACER - extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); - extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); + extern void time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void time_hardirqs_off(unsigned long a0, unsigned long a1); #else # define time_hardirqs_on(a0, a1) do { } while (0) # define time_hardirqs_off(a0, a1) do { } while (0) #endif #ifdef CONFIG_PREEMPT_TRACER - extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1); - extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1); + extern void trace_preempt_on(unsigned long a0, unsigned long a1); + extern void trace_preempt_off(unsigned long a0, unsigned long a1); #else # define trace_preempt_on(a0, a1) do { } while (0) # define trace_preempt_off(a0, a1) do { } while (0) -- cgit v0.10.2 From e0eca07badc023a675a61906020b397da20f07c3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 23:49:43 -0400 Subject: ftrace, POWERPC: add irqs_disabled_flags to ppc PPC doesn't have the irqs_disabled_flags needed by ftrace. This patch adds it. 
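The helper answers, given a previously saved flags word, whether interrupts were disabled at the time the flags were saved. A latency tracer would consume it roughly like this (illustrative sketch only, not from this patch):

	unsigned long flags;

	local_save_flags(flags);
	if (irqs_disabled_flags(flags)) {
		/* interrupts were already off; note it in the trace */
	}

On 64-bit the saved word is the paca soft-enable state (0 means disabled); on 32-bit it is the MSR image, hence the MSR_EE test in the hunk below.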
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/asm-powerpc/hw_irq.h b/include/asm-powerpc/hw_irq.h index ad8c9f7..f75a5fc 100644 --- a/include/asm-powerpc/hw_irq.h +++ b/include/asm-powerpc/hw_irq.h @@ -59,6 +59,11 @@ extern void iseries_handle_interrupts(void); get_paca()->hard_enabled = 0; \ } while(0) +static inline int irqs_disabled_flags(unsigned long flags) +{ + return flags == 0; +} + #else #if defined(CONFIG_BOOKE) @@ -113,6 +118,11 @@ static inline void local_irq_save_ptr(unsigned long *flags) #define hard_irq_enable() local_irq_enable() #define hard_irq_disable() local_irq_disable() +static inline int irqs_disabled_flags(unsigned long flags) +{ + return (flags & MSR_EE) == 0; +} + #endif /* CONFIG_PPC64 */ /* -- cgit v0.10.2 From 4e491d14f2506b218d678935c25a7027b79178b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 23:49:44 -0400 Subject: ftrace: support for PowerPC This patch adds full support for ftrace for PowerPC (both 64 and 32 bit). This includes dynamic tracing and function filtering. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3934e26..62d034a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -105,11 +105,12 @@ config ARCH_NO_VIRT_TO_BUS config PPC bool default y + select HAVE_FTRACE select HAVE_IDE - select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_LMB + select HAVE_OPROFILE config EARLY_PRINTK bool diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 2346d27..f3f5e26 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -12,6 +12,18 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif +ifdef CONFIG_FTRACE +# Do not trace early boot code +CFLAGS_REMOVE_cputable.o = -pg +CFLAGS_REMOVE_prom_init.o = -pg + +ifdef CONFIG_DYNAMIC_FTRACE +# dynamic ftrace setup. 
+CFLAGS_REMOVE_ftrace.o = -pg +endif + +endif + obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o \ @@ -78,6 +90,8 @@ obj-$(CONFIG_KEXEC) += machine_kexec.o crash.o \ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o + obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o ifneq ($(CONFIG_PPC_INDIRECT_IO),y) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0c8614d..0e62218 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1035,3 +1035,133 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +_GLOBAL(mcount) +_GLOBAL(_mcount) + stwu r1,-48(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 + stw r7, 28(r1) + mfcr r5 + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + stw r3, 44(r1) + stw r5, 8(r1) + .globl mcount_call +mcount_call: + bl ftrace_stub + nop + lwz r6, 8(r1) + lwz r0, 44(r1) + lwz r3, 12(r1) + mtctr r0 + lwz r4, 16(r1) + mtcr r6 + lwz r5, 20(r1) + lwz r6, 24(r1) + lwz r0, 52(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + mtlr r0 + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1, r1, 48 + bctr + +_GLOBAL(ftrace_caller) + /* Based off of objdump optput from glibc */ + stwu r1,-48(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 + lwz r4, 52(r1) + mfcr r5 + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + stw r3, 44(r1) + stw r5, 8(r1) +.globl ftrace_call +ftrace_call: + bl ftrace_stub + nop + lwz r6, 8(r1) + lwz r0, 44(r1) + lwz r3, 12(r1) + mtctr r0 + lwz r4, 16(r1) + mtcr r6 + lwz r5, 20(r1) + lwz r6, 24(r1) + lwz r0, 52(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + mtlr r0 + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1, r1, 48 + bctr +#else +_GLOBAL(mcount) +_GLOBAL(_mcount) + stwu r1,-48(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 + lwz r4, 52(r1) + mfcr r5 + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + stw r3, 44(r1) + stw r5, 8(r1) + + LOAD_REG_ADDR(r5, ftrace_trace_function) +#if 0 + mtctr r3 + mr r1, r5 + bctrl +#endif + lwz r5,0(r5) +#if 1 + mtctr r5 + bctrl +#else + bl ftrace_stub +#endif + nop + + lwz r6, 8(r1) + lwz r0, 44(r1) + lwz r3, 12(r1) + mtctr r0 + lwz r4, 16(r1) + mtcr r6 + lwz r5, 20(r1) + lwz r6, 24(r1) + lwz r0, 52(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + mtlr r0 + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1, r1, 48 + bctr +#endif + +_GLOBAL(ftrace_stub) + blr + +#endif /* CONFIG_MCOUNT */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index c0db5b7..2c4d9e0 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -870,3 +870,65 @@ _GLOBAL(enter_prom) ld r0,16(r1) mtlr r0 blr + +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +_GLOBAL(mcount) +_GLOBAL(_mcount) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + stdu r1, -112(r1) + std r3, 128(r1) + .globl mcount_call +mcount_call: + bl ftrace_stub + nop + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 + blr + +_GLOBAL(ftrace_caller) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + ld r11, 0(r1) + stdu r1, -112(r1) + std r3, 128(r1) + ld r4, 16(r11) +.globl ftrace_call +ftrace_call: + bl ftrace_stub + nop + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 +_GLOBAL(ftrace_stub) + blr +#else +_GLOBAL(mcount) + blr + 
+_GLOBAL(_mcount) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + ld r11, 0(r1) + stdu r1, -112(r1) + std r3, 128(r1) + ld r4, 16(r11) + + + LOAD_REG_ADDR(r5,ftrace_trace_function) + ld r5,0(r5) + ld r5,0(r5) + mtctr r5 + bctrl + + nop + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 +_GLOBAL(ftrace_stub) + blr + +#endif +#endif diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c new file mode 100644 index 0000000..5a4993f --- /dev/null +++ b/arch/powerpc/kernel/ftrace.c @@ -0,0 +1,165 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes out to P.A. Semi, Inc for supplying me with a PPC64 box. + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +#define CALL_BACK 4 + +static unsigned int ftrace_nop = 0x60000000; + +#ifdef CONFIG_PPC32 +# define GET_ADDR(addr) addr +#else +/* PowerPC64's functions are data that points to the functions */ +# define GET_ADDR(addr) *(unsigned long *)addr +#endif + +notrace int ftrace_ip_converted(unsigned long ip) +{ + unsigned int save; + + ip -= CALL_BACK; + save = *(unsigned int *)ip; + + return save == ftrace_nop; +} + +static unsigned int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)((addr + CALL_BACK) - ip); +} + +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)&ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static unsigned int op; + + addr = GET_ADDR(addr); + + /* Set to "bl addr" */ + op = 0x48000001 | (ftrace_calc_offset(ip, addr) & 0x03fffffe); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. + */ + return (unsigned char *)&op; +} + +#ifdef CONFIG_PPC64 +# define _ASM_ALIGN " .align 3 " +# define _ASM_PTR " .llong " +#else +# define _ASM_ALIGN " .align 2 " +# define _ASM_PTR " .long " +#endif + +notrace int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned replaced; + unsigned old = *(unsigned *)old_code; + unsigned new = *(unsigned *)new_code; + int faulted = 0; + + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. 
+ */ + asm volatile ( + "1: lwz %1, 0(%2)\n" + " cmpw %1, %5\n" + " bne 2f\n" + " stwu %3, 0(%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + "3: li %0, 1\n" + " b 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + _ASM_ALIGN "\n" + _ASM_PTR "1b, 3b\n" + ".previous" + : "=r"(faulted), "=r"(replaced) + : "r"(ip), "r"(new), + "0"(faulted), "r"(old) + : "memory"); + + if (replaced != old && replaced != new) + faulted = 2; + + if (!faulted) + flush_icache_range(ip, ip + 8); + + return faulted; +} + +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[4], *new; + int ret; + + ip += CALL_BACK; + + memcpy(old, &ftrace_call, 4); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[4], *new; + + /* ip is at the location, but modify code will subtact this */ + ip += CALL_BACK; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. + */ + memcpy(old, &mcount_call, 4); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) +{ + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + + return 0; +} + diff --git a/arch/powerpc/kernel/io.c b/arch/powerpc/kernel/io.c index e31aca92..1882bf4 100644 --- a/arch/powerpc/kernel/io.c +++ b/arch/powerpc/kernel/io.c @@ -120,7 +120,8 @@ EXPORT_SYMBOL(_outsl_ns); #define IO_CHECK_ALIGN(v,a) ((((unsigned long)(v)) & ((a) - 1)) == 0) -void _memset_io(volatile void __iomem *addr, int c, unsigned long n) +notrace void +_memset_io(volatile void __iomem *addr, int c, unsigned long n) { void *p = (void __force *)addr; u32 lc = c; diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 2f73f70..6e01eb0 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_desc); int distribute_irqs = 1; -static inline unsigned long get_hard_enabled(void) +static inline notrace unsigned long get_hard_enabled(void) { unsigned long enabled; @@ -108,13 +108,13 @@ static inline unsigned long get_hard_enabled(void) return enabled; } -static inline void set_soft_enabled(unsigned long enable) +static inline notrace void set_soft_enabled(unsigned long enable) { __asm__ __volatile__("stb %0,%1(13)" : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -void raw_local_irq_restore(unsigned long en) +notrace void raw_local_irq_restore(unsigned long en) { /* * get_paca()->soft_enabled = en; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 5112a4a..22f8e2b 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -47,6 +47,11 @@ #include #endif +#ifdef CONFIG_FTRACE +extern void _mcount(void); +EXPORT_SYMBOL(_mcount); +#endif + extern void bootx_init(unsigned long r4, unsigned long phys); int boot_cpuid; @@ -81,7 +86,7 @@ int ucache_bsize; * from the address that it was linked at, so we must use RELOC/PTRRELOC * to access static data (including strings). 
-- paulus */ -unsigned long __init early_init(unsigned long dt_ptr) +notrace unsigned long __init early_init(unsigned long dt_ptr) { unsigned long offset = reloc_offset(); struct cpu_spec *spec; @@ -111,7 +116,7 @@ unsigned long __init early_init(unsigned long dt_ptr) * This is called very early on the boot process, after a minimal * MMU environment has been set up but before MMU_init is called. */ -void __init machine_init(unsigned long dt_ptr, unsigned long phys) +notrace void __init machine_init(unsigned long dt_ptr, unsigned long phys) { /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); @@ -133,7 +138,7 @@ void __init machine_init(unsigned long dt_ptr, unsigned long phys) #ifdef CONFIG_BOOKE_WDT /* Checks wdt=x and wdt_period=xx command-line option */ -int __init early_parse_wdt(char *p) +notrace int __init early_parse_wdt(char *p) { if (p && strncmp(p, "0", 1) != 0) booke_wdt_enabled = 1; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 098fd96..277bf18 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -85,6 +85,11 @@ struct ppc64_caches ppc64_caches = { }; EXPORT_SYMBOL_GPL(ppc64_caches); +#ifdef CONFIG_FTRACE +extern void _mcount(void); +EXPORT_SYMBOL(_mcount); +#endif + /* * These are used in binfmt_elf.c to put aux entries on the stack * for each elf executable being started. diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index 4d72c8f..8977417 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -1,5 +1,10 @@ CFLAGS_bootx_init.o += -fPIC +ifdef CONFIG_FTRACE +# Do not trace early boot code +CFLAGS_REMOVE_bootx_init.o = -pg +endif + obj-y += pic.o setup.o time.o feature.o pci.o \ sleep.o low_i2c.o cache.o pfunc_core.o \ pfunc_base.o diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a5f6001..3877dd9 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -123,6 +123,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, int ret; int save_ftrace_enabled = ftrace_enabled; int save_tracer_enabled = tracer_enabled; + char *func_name; /* The ftrace test PASSED */ printk(KERN_CONT "PASSED\n"); @@ -142,9 +143,15 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, return ret; } + /* + * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. We simply put a '*' to + * accommodate them. + */ + func_name = "*" STR(DYN_FTRACE_TEST_NAME); + /* filter only on our function */ - ftrace_set_filter(STR(DYN_FTRACE_TEST_NAME), - sizeof(STR(DYN_FTRACE_TEST_NAME)), 1); + ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ tr->ctrl = 1; -- cgit v0.10.2 From 656ee82cc855027b2e994ad218519b09fa652cc1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 21:30:29 -0400 Subject: kbuild: create new CFLAGS_REMOVE_(basename).o option We currently have a way to add special CFLAGS to code, but we do not have a way to remove them if needed. In the case of ftrace, some files should simply not be profiled. Adding the -pg flag to these files is simply a waste, and adding "notrace" to each and every function is ugly. Currently we put in a "Makefile turd" [1] to stop the compiler from adding -pg to certain files. This was clumsy and awkward. This patch now adds the reverse of CFLAGS_(basename).o with CFLAGS_REMOVE_(basename).o.
This allows developers to prevent certain CFLAGS from being used to compile files. For example, we can now do CFLAGS_REMOVE_string.o = -pg to remove the -pg option from the string.o file in the lib directory. Note: a space-delimited list of options may be added to the REMOVE macro. [1] - what David Miller called the workaround to remove -pg Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 8e44023..ea48b82 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -96,7 +96,8 @@ basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))" modname_flags = $(if $(filter 1,$(words $(modname))),\ -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))") -_c_flags = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o) +orig_c_flags = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o) +_c_flags = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags)) _a_flags = $(KBUILD_AFLAGS) $(asflags-y) $(AFLAGS_$(basetarget).o) _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F)) -- cgit v0.10.2 From 6ec562328fda585be2d7f472cfac99d3b44d362a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 21:30:30 -0400 Subject: ftrace: use the new kbuild CFLAGS_REMOVE for kernel directory This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro in the kernel directory. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/Makefile b/kernel/Makefile index d2f80ea..ca2433e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,12 +11,16 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o +CFLAGS_REMOVE_sched.o = -pg -mno-spe + ifdef CONFIG_FTRACE -# Do not profile debug utilities -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(if $(filter-out lockdep% %debug,$(basename $(notdir $@))), \ - $(ORIG_CFLAGS), \ - $(subst -pg,,$(ORIG_CFLAGS))) +# Do not trace debug files and internal ftrace files +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = -pg +CFLAGS_REMOVE_sched_clock.o = -pg endif obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o -- cgit v0.10.2 From 654e4787689faffdd2137fe91f59fd3ef3363ad2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 21:30:31 -0400 Subject: ftrace: use the new kbuild CFLAGS_REMOVE for lib directory This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro in the lib directory.
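The mechanics are in the Makefile.lib hunk above: for lib/string.o, basetarget expands to string, so the final compile flags become $(filter-out $(CFLAGS_REMOVE_string.o), $(orig_c_flags)); exactly the listed options are dropped for that one object. Compared with the old KBUILD_CFLAGS pattern-matching hack, the removal is per-object, sits right next to the obj-y lists it affects, and cannot accidentally match unrelated files.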
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/lib/Makefile b/lib/Makefile index d97ad11..4b836a5 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -10,11 +10,11 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ ifdef CONFIG_FTRACE # Do not profile string.o, since it may be used in early boot or vdso +CFLAGS_REMOVE_string.o = -pg # Also do not profile any debug utilities -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(if $(filter-out %debug debug% string%,$(basename $(notdir $@))), \ - $(ORIG_CFLAGS), \ - $(subst -pg,,$(ORIG_CFLAGS))) +CFLAGS_REMOVE_spinlock_debug.o = -pg +CFLAGS_REMOVE_list_debug.o = -pg +CFLAGS_REMOVE_debugobjects.o = -pg endif lib-$(CONFIG_MMU) += ioremap.o -- cgit v0.10.2 From 7fa09f24b477ad41b821713eba757b3aa7a2864a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 21:30:32 -0400 Subject: ftrace: use the new kbuild CFLAGS_REMOVE for x86/kernel directory This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro in the x86/kernel directory. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e142091..739d49a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -6,6 +6,13 @@ extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) +ifdef CONFIG_FTRACE +# Do not profile debug utilities +CFLAGS_REMOVE_tsc_64.o = -pg +CFLAGS_REMOVE_tsc_32.o = -pg +CFLAGS_REMOVE_rtc.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: -- cgit v0.10.2 From 677aa9f77e8de3791b481a0cec6c8b84d1eec626 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 17 May 2008 00:01:36 -0400 Subject: ftrace: add have dynamic ftrace config for archs Now that ftrace is being ported to other architectures, it has become apparent that DYNAMIC_FTRACE is dependent on whether or not that architecture implements dynamic ftrace. FTRACE itself may be ported to an architecture without porting dynamic ftrace. This patch adds HAVE_DYNAMIC_FTRACE to allow architectures to port ftrace without having to also port the dynamic aspect as well. 
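In practice, selecting HAVE_DYNAMIC_FTRACE is a promise to provide the code-patching hooks that the sparc64 and powerpc patches above implement. Their shapes, collected here for reference from those patches (not new API):

	notrace int ftrace_ip_converted(unsigned long ip);
	notrace unsigned char *ftrace_nop_replace(void);
	notrace unsigned char *ftrace_call_replace(unsigned long ip,
						   unsigned long addr);
	notrace int ftrace_modify_code(unsigned long ip,
				       unsigned char *old_code,
				       unsigned char *new_code);
	notrace int ftrace_update_ftrace_func(ftrace_func_t func);
	notrace int ftrace_mcount_set(unsigned long *data);
	int __init ftrace_dyn_arch_init(void *data);

An architecture that only wires up the mcount entry point selects HAVE_FTRACE alone and leaves DYNAMIC_FTRACE unavailable.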
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 62d034a..a5e9912 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -105,6 +105,7 @@ config ARCH_NO_VIRT_TO_BUS config PPC bool default y + select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_IDE select HAVE_KPROBES diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index a480df6..fca9246 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -11,6 +11,7 @@ config SPARC config SPARC64 bool default y + select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_IDE select HAVE_LMB diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c742dfe..fc86c54 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f300571..5c2295b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -4,6 +4,9 @@ config HAVE_FTRACE bool +config HAVE_DYNAMIC_FTRACE + bool + config TRACER_MAX_TRACE bool @@ -94,6 +97,7 @@ config CONTEXT_SWITCH_TRACER config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FTRACE + depends on HAVE_DYNAMIC_FTRACE default y help This option will modify all the calls to ftrace dynamically -- cgit v0.10.2 From f06c38103ea9dbca27c3f4d77f444ddefb5477cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: add sysprof plugin very first baby version. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5c2295b..e101c9a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -75,6 +75,14 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) +config SYSPROF_TRACER + bool "Sysprof Tracer" + depends on DEBUG_KERNEL + select TRACING + help + This tracer provides the trace needed by the 'Sysprof' userspace + tool. 
+ config SCHED_TRACER bool "Scheduling Latency Tracer" depends on HAVE_FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d9efbbf..7aec123 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o +obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c new file mode 100644 index 0000000..6c139bc --- /dev/null +++ b/kernel/trace/trace_sysprof.c @@ -0,0 +1,80 @@ +/* + * trace stack traces + * + * Copyright (C) 2007 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *ctx_trace; +static int __read_mostly tracer_enabled; + +static notrace void stack_reset(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} + +static notrace void start_stack_trace(struct trace_array *tr) +{ + stack_reset(tr); + tracer_enabled = 1; +} + +static notrace void stop_stack_trace(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static notrace void stack_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + + if (tr->ctrl) + start_stack_trace(tr); +} + +static notrace void stack_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_stack_trace(tr); +} + +static void stack_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_stack_trace(tr); + else + stop_stack_trace(tr); +} + +static struct tracer stack_trace __read_mostly = +{ + .name = "sysprof", + .init = stack_trace_init, + .reset = stack_trace_reset, + .ctrl_update = stack_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_stack, +#endif +}; + +__init static int init_stack_trace(void) +{ + return register_tracer(&stack_trace); +} +device_initcall(init_stack_trace); -- cgit v0.10.2 From 0075fa80305f3231a2d5df97b00d7f55a48ea27e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: extend sysprof plugin add per CPU hrtimers. 
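One subtlety in the patch below: an HRTIMER_CB_IRQSAFE_NO_SOFTIRQ hrtimer fires on the CPU that armed it, so start_stack_timers() temporarily pins the current thread to each online CPU in turn in order to arm that CPU's timer (pattern paraphrased from the diff):

	cpumask_t saved_mask = current->cpus_allowed;
	int cpu;

	for_each_online_cpu(cpu) {
		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
		start_stack_timer(cpu);		/* arms on this CPU */
	}
	set_cpus_allowed_ptr(current, &saved_mask);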
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 6c139bc..ba55b87 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -5,19 +5,76 @@ * Copyright (C) 2008 Ingo Molnar * */ -#include -#include -#include #include +#include +#include #include -#include #include +#include +#include #include "trace.h" static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; +static const unsigned long sample_period = 1000000; + +/* + * Per CPU hrtimers that do the profiling: + */ +static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); + +static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) +{ + /* trace here */ + panic_timeout++; + + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); + + return HRTIMER_RESTART; +} + +static void start_stack_timer(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = stack_trace_timer_fn; + hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + + hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); +} + +static void start_stack_timers(void) +{ + cpumask_t saved_mask = current->cpus_allowed; + int cpu; + + for_each_online_cpu(cpu) { + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + start_stack_timer(cpu); + printk("started timer on cpu%d\n", cpu); + } + set_cpus_allowed_ptr(current, &saved_mask); +} + +static void stop_stack_timer(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); + + hrtimer_cancel(hrtimer); + printk("cancelled timer on cpu%d\n", cpu); +} + +static void stop_stack_timers(void) +{ + int cpu; + + for_each_online_cpu(cpu) + stop_stack_timer(cpu); +} + static notrace void stack_reset(struct trace_array *tr) { int cpu; @@ -31,11 +88,13 @@ static notrace void stack_reset(struct trace_array *tr) static notrace void start_stack_trace(struct trace_array *tr) { stack_reset(tr); + start_stack_timers(); tracer_enabled = 1; } static notrace void stop_stack_trace(struct trace_array *tr) { + stop_stack_timers(); tracer_enabled = 0; } -- cgit v0.10.2 From 56a08bdcff20f0022bd9160c1093e56f763499aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: extend sysprof plugin some more Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index ba55b87..b1137c1 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -3,7 +3,7 @@ * * Copyright (C) 2007 Steven Rostedt * Copyright (C) 2008 Ingo Molnar - * + * Copyright (C) 2004, 2005, Soeren Sandmann */ #include #include @@ -11,13 +11,17 @@ #include #include #include +#include #include #include "trace.h" -static struct trace_array *ctx_trace; +static struct trace_array *sysprof_trace; static int __read_mostly tracer_enabled; +/* + * 10 msecs for now: + */ static const unsigned long sample_period = 1000000; /* @@ -25,10 +29,78 @@ static const unsigned long sample_period = 1000000; */ static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + if (__copy_from_user_inatomic(frame, frame_pointer, sizeof(*frame))) + return 0; + + return 1; +} + 
+#define SYSPROF_MAX_ADDRESSES 512 + +static void timer_notify(struct pt_regs *regs, int cpu) +{ + const void __user *frame_pointer; + struct trace_array_cpu *data; + struct stack_frame frame; + struct trace_array *tr; + int is_user; + int i; + + if (!regs) + return; + + tr = sysprof_trace; + data = tr->data[cpu]; + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) { + /* kernel */ + ftrace(tr, data, current->pid, 1, 0); + return; + + } + + trace_special(tr, data, 0, current->pid, regs->ip); + + frame_pointer = (void __user *)regs->bp; + + for (i = 0; i < SYSPROF_MAX_ADDRESSES; i++) { + if (!copy_stack_frame(frame_pointer, &frame)) + break; + if ((unsigned long)frame_pointer < regs->sp) + break; + + trace_special(tr, data, 1, frame.return_address, + (unsigned long)frame_pointer); + frame_pointer = frame.next_fp; + } + + trace_special(tr, data, 2, current->pid, i); + + if (i == SYSPROF_MAX_ADDRESSES) + trace_special(tr, data, -1, -1, -1); +} + static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) { /* trace here */ - panic_timeout++; + timer_notify(get_irq_regs(), smp_processor_id()); hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); @@ -100,7 +172,7 @@ static notrace void stop_stack_trace(struct trace_array *tr) static notrace void stack_trace_init(struct trace_array *tr) { - ctx_trace = tr; + sysprof_trace = tr; if (tr->ctrl) start_stack_trace(tr); -- cgit v0.10.2 From a6dd24f8d00cbccb560b19a723e6fb9bdfb20799 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: sysprof-plugin, add self-tests Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c460e85..b2198bc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -280,6 +280,10 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace, extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); #endif +#ifdef CONFIG_SYSPROF_TRACER +extern int trace_selftest_startup_sysprof(struct tracer *trace, + struct trace_array *tr); +#endif #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3877dd9..033a6fb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -537,3 +537,31 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr return ret; } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_SYSPROF_TRACER +int +trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SYSPROF_TRACER */ diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index b1137c1..b78f12f 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -126,7 +126,7 @@ static void start_stack_timers(void) for_each_online_cpu(cpu) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); start_stack_timer(cpu); - printk("started timer on cpu%d\n", cpu); + printk(KERN_INFO "started sysprof timer on cpu%d\n", cpu); } set_cpus_allowed_ptr(current, &saved_mask); } @@ -136,7 +136,7 @@ static void stop_stack_timer(int cpu) struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); hrtimer_cancel(hrtimer); - printk("cancelled timer on cpu%d\n", cpu); + printk(KERN_INFO "cancelled sysprof timer on cpu%d\n", cpu); } static void stop_stack_timers(void) @@ -200,7 +200,7 @@ static struct tracer stack_trace __read_mostly = .reset = stack_trace_reset, .ctrl_update = stack_trace_ctrl_update, #ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_stack, + .selftest = trace_selftest_startup_sysprof, #endif }; -- cgit v0.10.2 From 842af315e8b0adad58fc642eaa5e6f53525e0534 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:47 +0200 Subject: ftrace: sysprof plugin improvement add sample maximum depth. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index b78f12f..7f6fccc 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -23,6 +23,7 @@ static int __read_mostly tracer_enabled; * 10 msecs for now: */ static const unsigned long sample_period = 1000000; +static const unsigned int sample_max_depth = 512; /* * Per CPU hrtimers that do the profiling: @@ -45,8 +46,6 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) return 1; } -#define SYSPROF_MAX_ADDRESSES 512 - static void timer_notify(struct pt_regs *regs, int cpu) { const void __user *frame_pointer; @@ -80,7 +79,7 @@ static void timer_notify(struct pt_regs *regs, int cpu) frame_pointer = (void __user *)regs->bp; - for (i = 0; i < SYSPROF_MAX_ADDRESSES; i++) { + for (i = 0; i < sample_max_depth; i++) { if (!copy_stack_frame(frame_pointer, &frame)) break; if ((unsigned long)frame_pointer < regs->sp) @@ -93,7 +92,7 @@ static void timer_notify(struct pt_regs *regs, int cpu) trace_special(tr, data, 2, current->pid, i); - if (i == SYSPROF_MAX_ADDRESSES) + if (i == sample_max_depth) trace_special(tr, data, -1, -1, -1); } @@ -126,7 +125,6 @@ static void start_stack_timers(void) for_each_online_cpu(cpu) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); start_stack_timer(cpu); - printk(KERN_INFO "started sysprof timer on cpu%d\n", cpu); } set_cpus_allowed_ptr(current, &saved_mask); } @@ -136,7 +134,6 @@ static void stop_stack_timer(int cpu) struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); hrtimer_cancel(hrtimer); - printk(KERN_INFO "cancelled sysprof timer on cpu%d\n", cpu); } static void stop_stack_timers(void) -- cgit v0.10.2 From ef4ab15ff34fd9c65e92bee70f58e7179da881c5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: make sysprof dependent on x86 for now that's the only tested platform for now. If there's interest we can make it generic easily. 
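What actually ties the plugin to x86 is the user stack walk in timer_notify(): it pulls the frame pointer straight out of pt_regs and assumes the x86 stack frame layout. The arch-specific pieces, quoted from the code above:

	struct stack_frame {
		const void __user *next_fp;	/* saved frame pointer */
		unsigned long return_address;
	};

	frame_pointer = (void __user *)regs->bp;	/* regs->bp/sp/ip are x86 pt_regs fields */

Making it generic would mean abstracting the pt_regs accessors and the frame layout per architecture.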
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e101c9a..9b49526 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,7 +77,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && X86 select TRACING help This tracer provides the trace needed by the 'Sysprof' userspace diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 033a6fb..5588ecc 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -557,11 +557,6 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) ret = trace_test_buffer(tr, &count); trace->reset(tr); - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - return ret; } #endif /* CONFIG_SYSPROF_TRACER */ -- cgit v0.10.2 From 9f6b4e3f4a24f2590f1c96f117fc45fbea9b0fa4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: sysprof fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 7f6fccc..f9a09fe 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -37,21 +37,26 @@ struct stack_frame { static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) { + int ret; + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) return 0; - if (__copy_from_user_inatomic(frame, frame_pointer, sizeof(*frame))) - return 0; + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); - return 1; + return ret; } static void timer_notify(struct pt_regs *regs, int cpu) { - const void __user *frame_pointer; struct trace_array_cpu *data; struct stack_frame frame; struct trace_array *tr; + const void __user *fp; int is_user; int i; @@ -77,21 +82,26 @@ static void timer_notify(struct pt_regs *regs, int cpu) trace_special(tr, data, 0, current->pid, regs->ip); - frame_pointer = (void __user *)regs->bp; + fp = (void __user *)regs->bp; for (i = 0; i < sample_max_depth; i++) { - if (!copy_stack_frame(frame_pointer, &frame)) + frame.next_fp = 0; + frame.return_address = 0; + if (!copy_stack_frame(fp, &frame)) break; - if ((unsigned long)frame_pointer < regs->sp) + if ((unsigned long)fp < regs->sp) break; trace_special(tr, data, 1, frame.return_address, - (unsigned long)frame_pointer); - frame_pointer = frame.next_fp; + (unsigned long)fp); + fp = frame.next_fp; } trace_special(tr, data, 2, current->pid, i); + /* + * Special trace entry if we overflow the max depth: + */ if (i == sample_max_depth) trace_special(tr, data, -1, -1, -1); } -- cgit v0.10.2 From d618b3e6e50970a6248ac857653fdd49bcd3c045 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: sysprof updates make the sample period configurable. 
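As a rough usage sketch (not part of the patch): the diff below creates a debugfs file named "sysprof_sample_period" in the tracing directory, which reads and writes the period in microseconds and silently clamps written values below 100. The debugfs mount point in this userspace snippet is an assumption, not something the patch guarantees:

/*
 * Hypothetical userspace sketch: set the sysprof sample period to
 * 500 usecs. The "/debug/tracing" mount point is an assumption;
 * only the file name comes from the patch below.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/debug/tracing/sysprof_sample_period", "w");

	if (!f) {
		perror("sysprof_sample_period");
		return 1;
	}
	/* Value is in microseconds; anything below 100 is clamped to 100. */
	fprintf(f, "500\n");
	fclose(f);
	return 0;
}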
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3271916..95b7c48 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2800,6 +2800,9 @@ static __init void tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'dyn_ftrace_total_info' entry\n"); #endif +#ifdef CONFIG_SYSPROF_TRACER + init_tracer_sysprof_debugfs(d_tracer); +#endif } static int trace_alloc_page(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b2198bc..b7f85d9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -188,6 +188,8 @@ struct trace_iterator { void tracing_reset(struct trace_array_cpu *data); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); +void init_tracer_sysprof_debugfs(struct dentry *d_tracer); + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index f9a09fe..1940623 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -20,11 +20,12 @@ static struct trace_array *sysprof_trace; static int __read_mostly tracer_enabled; /* - * 10 msecs for now: + * 1 msec sample interval by default: */ -static const unsigned long sample_period = 1000000; +static unsigned long sample_period = 1000000; static const unsigned int sample_max_depth = 512; +static DEFINE_MUTEX(sample_timer_lock); /* * Per CPU hrtimers that do the profiling: */ @@ -166,15 +167,19 @@ static notrace void stack_reset(struct trace_array *tr) static notrace void start_stack_trace(struct trace_array *tr) { + mutex_lock(&sample_timer_lock); stack_reset(tr); start_stack_timers(); tracer_enabled = 1; + mutex_unlock(&sample_timer_lock); } static notrace void stop_stack_trace(struct trace_array *tr) { + mutex_lock(&sample_timer_lock); stop_stack_timers(); tracer_enabled = 0; + mutex_unlock(&sample_timer_lock); } static notrace void stack_trace_init(struct trace_array *tr) @@ -216,3 +221,64 @@ __init static int init_stack_trace(void) return register_tracer(&stack_trace); } device_initcall(init_stack_trace); + +#define MAX_LONG_DIGITS 22 + +static ssize_t +sysprof_sample_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_LONG_DIGITS]; + int r; + + r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +sysprof_sample_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_LONG_DIGITS]; + unsigned long val; + + if (cnt > MAX_LONG_DIGITS-1) + cnt = MAX_LONG_DIGITS-1; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + val = simple_strtoul(buf, NULL, 10); + /* + * Enforce a minimum sample period of 100 usecs: + */ + if (val < 100) + val = 100; + + mutex_lock(&sample_timer_lock); + stop_stack_timers(); + sample_period = val * 1000; + start_stack_timers(); + mutex_unlock(&sample_timer_lock); + + return cnt; +} + +static struct file_operations sysprof_sample_fops = { + .read = sysprof_sample_read, + .write = sysprof_sample_write, +}; + +void init_tracer_sysprof_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + + entry = debugfs_create_file("sysprof_sample_period", 0644, + d_tracer, NULL, &sysprof_sample_fops); + if (entry) + return; + pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); +} -- cgit v0.10.2 From 
ada6b835067dc022f11cdae1c313a3710d3d977c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 May 2008 23:50:41 +0200 Subject: ftrace: remove notrace Remove the notrace annotations. The build logic takes care of that. Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 1940623..3b1e4ba 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -155,7 +155,7 @@ static void stop_stack_timers(void) stop_stack_timer(cpu); } -static notrace void stack_reset(struct trace_array *tr) +static void stack_reset(struct trace_array *tr) { int cpu; @@ -165,7 +165,7 @@ static notrace void stack_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } -static notrace void start_stack_trace(struct trace_array *tr) +static void start_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); stack_reset(tr); @@ -174,7 +174,7 @@ static notrace void start_stack_trace(struct trace_array *tr) mutex_unlock(&sample_timer_lock); } -static notrace void stop_stack_trace(struct trace_array *tr) +static void stop_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); stop_stack_timers(); @@ -182,7 +182,7 @@ static notrace void stop_stack_trace(struct trace_array *tr) mutex_unlock(&sample_timer_lock); } -static notrace void stack_trace_init(struct trace_array *tr) +static void stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; @@ -190,7 +190,7 @@ static notrace void stack_trace_init(struct trace_array *tr) start_stack_trace(tr); } -static notrace void stack_trace_reset(struct trace_array *tr) +static void stack_trace_reset(struct trace_array *tr) { if (tr->ctrl) stop_stack_trace(tr); -- cgit v0.10.2 From 9caee613d3b860ae81b79370eeae9ac967c07536 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 May 2008 23:55:54 +0200 Subject: ftrace: fix __trace_special() Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 3b1e4ba..76dd953 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -81,7 +81,7 @@ static void timer_notify(struct pt_regs *regs, int cpu) } - trace_special(tr, data, 0, current->pid, regs->ip); + __trace_special(tr, data, 0, current->pid, regs->ip); fp = (void __user *)regs->bp; @@ -93,18 +93,18 @@ static void timer_notify(struct pt_regs *regs, int cpu) if ((unsigned long)fp < regs->sp) break; - trace_special(tr, data, 1, frame.return_address, + __trace_special(tr, data, 1, frame.return_address, (unsigned long)fp); fp = frame.next_fp; } - trace_special(tr, data, 2, current->pid, i); + __trace_special(tr, data, 2, current->pid, i); /* * Special trace entry if we overflow the max depth: */ if (i == sample_max_depth) - trace_special(tr, data, -1, -1, -1); + __trace_special(tr, data, -1, -1, -1); } static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -- cgit v0.10.2 From 5fc4511c756860149b81aead6eca5bdf5c438ea7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 May 2008 23:58:21 +0200 Subject: ftrace: make it more available in the Kconfig Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9b49526..e101c9a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,7 +77,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" - depends on DEBUG_KERNEL && X86 + depends on DEBUG_KERNEL select TRACING help This tracer provides the trace needed by the 'Sysprof' userspace -- cgit v0.10.2 From 
cd2134b1dda92fd450e6a1e12b1c7960dd6a2178 Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Pedersen Date: Mon, 12 May 2008 21:20:54 +0200 Subject: sysprof: kernel trace add kernel backtracing to the sysprof tracer. change the format of the data, so that type=0 means beginning of stack trace, 1 means kernel address, 2 means user address, and 3 means end of trace. EIP addresses are no longer distinguished from return addresses, mostly because sysprof userspace doesn't make use of it. It may be worthwhile adding this back in though, just in case it becomes interesting. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 76dd953..ebcb66d 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -14,6 +14,8 @@ #include #include +#include + #include "trace.h" static struct trace_array *sysprof_trace; @@ -52,6 +54,77 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) return ret; } +struct backtrace_info { + struct trace_array_cpu *data; + struct trace_array *tr; + int pos; +}; + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + /* Don't bother with IRQ stacks for now */ + return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct backtrace_info *info = data; + + if (info->pos < sample_max_depth && reliable) { + __trace_special(info->tr, info->data, 1, addr, 0); + + info->pos++; + } +} + +const static struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +static struct pt_regs * +trace_kernel(struct pt_regs *regs, struct trace_array *tr, + struct trace_array_cpu *data) +{ + struct backtrace_info info; + unsigned long bp; + char *user_stack; + char *stack; + + info.tr = tr; + info.data = data; + info.pos = 1; + + __trace_special(info.tr, info.data, 1, regs->ip, 0); + + stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER + bp = regs->bp; +#else + bp = 0; +#endif + + dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); + + /* Now trace the user stack */ + user_stack = ((char *)current->thread.sp0 - sizeof(struct pt_regs)); + + return (struct pt_regs *)user_stack; +} + static void timer_notify(struct pt_regs *regs, int cpu) { struct trace_array_cpu *data; @@ -74,17 +147,15 @@ static void timer_notify(struct pt_regs *regs, int cpu) if (is_user && current->state != TASK_RUNNING) return; - if (!is_user) { - /* kernel */ - ftrace(tr, data, current->pid, 1, 0); - return; + __trace_special(tr, data, 0, 0, current->pid); - } - - __trace_special(tr, data, 0, current->pid, regs->ip); + if (!is_user) + regs = trace_kernel(regs, tr, data); fp = (void __user *)regs->bp; + __trace_special(tr, data, 2, regs->ip, 0); + for (i = 0; i < sample_max_depth; i++) { frame.next_fp = 0; frame.return_address = 0; @@ -93,12 +164,12 @@ static void timer_notify(struct pt_regs *regs, int cpu) if ((unsigned long)fp < regs->sp) break; - __trace_special(tr, data, 1, frame.return_address, + __trace_special(tr, data, 2, frame.return_address, (unsigned long)fp); fp = frame.next_fp; } - __trace_special(tr, data, 2, current->pid, i); + __trace_special(tr, data, 3, current->pid, i); /* * 
Special trace entry if we overflow the max depth: -- cgit v0.10.2 From 8a9e94c1fbfdac45a3b6811b880777c4116aa309 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:54 +0200 Subject: sysprof: update copyrights Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index ebcb66d..fe23d6d 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -1,9 +1,9 @@ /* * trace stack traces * + * Copyright (C) 2004-2008, Soeren Sandmann * Copyright (C) 2007 Steven Rostedt * Copyright (C) 2008 Ingo Molnar - * Copyright (C) 2004, 2005, Soeren Sandmann */ #include #include -- cgit v0.10.2 From cf3271a73b612a03da00681ecd9bfefab37c74c9 Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Date: Mon, 12 May 2008 05:28:50 +0200 Subject: ftrace/sysprof: don't trace the user stack if we are a kernel thread. Check that current->mm is non-NULL before attempting to trace the user stack. Also take depth of the kernel stack into account when comparing against sample_max_depth. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index fe23d6d..2301e1e 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -95,13 +95,12 @@ const static struct stacktrace_ops backtrace_ops = { .address = backtrace_address, }; -static struct pt_regs * +static int trace_kernel(struct pt_regs *regs, struct trace_array *tr, struct trace_array_cpu *data) { struct backtrace_info info; unsigned long bp; - char *user_stack; char *stack; info.tr = tr; @@ -119,10 +118,7 @@ trace_kernel(struct pt_regs *regs, struct trace_array *tr, dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); - /* Now trace the user stack */ - user_stack = ((char *)current->thread.sp0 - sizeof(struct pt_regs)); - - return (struct pt_regs *)user_stack; + return info.pos; } static void timer_notify(struct pt_regs *regs, int cpu) @@ -150,32 +146,44 @@ static void timer_notify(struct pt_regs *regs, int cpu) __trace_special(tr, data, 0, 0, current->pid); if (!is_user) - regs = trace_kernel(regs, tr, data); + i = trace_kernel(regs, tr, data); + else + i = 0; - fp = (void __user *)regs->bp; + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm && i < sample_max_depth) { + regs = (struct pt_regs *)current->thread.sp0 - 1; - __trace_special(tr, data, 2, regs->ip, 0); + fp = (void __user *)regs->bp; - for (i = 0; i < sample_max_depth; i++) { - frame.next_fp = 0; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; + __trace_special(tr, data, 2, regs->ip, 0); - __trace_special(tr, data, 2, frame.return_address, - (unsigned long)fp); - fp = frame.next_fp; - } + while (i < sample_max_depth) { + frame.next_fp = 0; + frame.return_address = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; - __trace_special(tr, data, 3, current->pid, i); + __trace_special(tr, data, 2, frame.return_address, + (unsigned long)fp); + fp = frame.next_fp; + + i++; + } + + } /* * Special trace entry if we overflow the max depth: */ if (i == sample_max_depth) __trace_special(tr, data, -1, -1, -1); + + __trace_special(tr, data, 3, current->pid, i); } static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -- cgit v0.10.2 From 8b7d89d02ef3c6a7c73d6596f28cea7632850af4 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 
21:20:56 +0200
Subject: x86: mmiotrace - trace memory mapped IO

Mmiotrace is a tool for trapping memory mapped IO (MMIO) accesses within
the kernel. It is used for debugging and especially for reverse
engineering evil binary drivers.

Mmiotrace works by wrapping the ioremap family of kernel functions and
marking the returned pages as not present. Access to the IO memory
triggers a page fault, which will be handled by mmiotrace's custom page
fault handler. This will single-step the faulted instruction with the
MMIO page marked as present. Access logs are directed to user space via
relay and debugfs.

This page fault approach is necessary, because binary drivers have
readl/writel etc. calls inlined and therefore extremely difficult to
trap with e.g. kprobes.

This patch depends on the custom page fault handlers patch. A usage
sketch for the wrappers follows below, before the new files.

Signed-off-by: Pekka Paalanen
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 9431a83..7c6496e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -176,6 +176,33 @@ config PAGE_FAULT_HANDLERS
 	  register a function that is called on every page fault. Custom
 	  handlers are used by some debugging and reverse engineering tools.
 
+config MMIOTRACE
+	tristate "Memory mapped IO tracing"
+	depends on DEBUG_KERNEL && PAGE_FAULT_HANDLERS && RELAY && DEBUG_FS
+	default n
+	help
+	  This will build a kernel module called mmiotrace.
+
+	  Mmiotrace traces Memory Mapped I/O access and is meant for
+	  debugging and reverse engineering. The kernel module offers
+	  wrapped versions of the ioremap family of functions. The driver
+	  to be traced must be modified to call these wrappers. A user
+	  space program is required to collect the MMIO data.
+
+	  See http://nouveau.freedesktop.org/wiki/MmioTrace
+	  If you are not helping to develop drivers, say N.
+
+config MMIOTRACE_TEST
+	tristate "Test module for mmiotrace"
+	depends on MMIOTRACE && m
+	default n
+	help
+	  This is a dumb module for testing mmiotrace. It is very dangerous
+	  as it will write garbage to IO memory starting at a given address.
+	  However, it should be safe to use on e.g. an unused portion of VRAM.
+
+	  Say N, unless you absolutely know what you are doing.
+
 #
 # IO delay types:
 #

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 739d49a..a51ac15 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -79,6 +79,8 @@ obj-$(CONFIG_KGDB) += kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 
+obj-$(CONFIG_MMIOTRACE)		+= mmiotrace/
+
 obj-$(CONFIG_HPET_TIMER)	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o

diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index a4f93b4..027a5b6 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,6 +15,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
 EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
+EXPORT_SYMBOL_GPL(init_mm);
 
 /*
  * Initial thread structure.
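For context, a hedged driver-side sketch of how the wrappers added by this
patch are meant to be used. ioremap_nocache_trace() and iounmap_trace() are
the real symbols exported by mmio-mod.c below (testmmiotrace.c declares them
the same way); the function names, BAR variables, and probe/remove shape here
are illustrative assumptions, not part of the patch:

/*
 * Illustrative only: a driver under study swaps its ioremap calls for
 * the traced variants added by this patch. Every access through the
 * resulting mapping then faults into kmmio and is logged via relay.
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/io.h>

extern void __iomem *ioremap_nocache_trace(unsigned long offset,
					   unsigned long size);
extern void iounmap_trace(volatile void __iomem *addr);

static void __iomem *mmio_base;

static int example_map(unsigned long bar_phys, unsigned long bar_len)
{
	/* Instead of ioremap_nocache(): the mapping is now traced. */
	mmio_base = ioremap_nocache_trace(bar_phys, bar_len);
	return mmio_base ? 0 : -ENOMEM;
}

static void example_unmap(void)
{
	iounmap_trace(mmio_base);	/* also emits an MMIO_UNPROBE event */
}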
diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile
new file mode 100644
index 0000000..d6905f7
--- /dev/null
+++ b/arch/x86/kernel/mmiotrace/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
+mmiotrace-objs := pf_in.o kmmio.o mmio-mod.o
+
+obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o

diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c
new file mode 100644
index 0000000..8ba48f9
--- /dev/null
+++ b/arch/x86/kernel/mmiotrace/kmmio.c
@@ -0,0 +1,391 @@
+/* Support for MMIO probes.
+ * Borrows much code from kprobes.
+ * (C) 2002 Louis Zhuang .
+ * 2007 Alexander Eichner
+ * 2008 Pekka Paalanen
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "kmmio.h"
+
+#define KMMIO_HASH_BITS 6
+#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS)
+#define KMMIO_PAGE_HASH_BITS 4
+#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
+
+struct kmmio_context {
+	struct kmmio_fault_page *fpage;
+	struct kmmio_probe *probe;
+	unsigned long saved_flags;
+	int active;
+};
+
+static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code,
+						unsigned long address);
+static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
+								void *args);
+
+static DEFINE_SPINLOCK(kmmio_lock);
+
+/* These are protected by kmmio_lock */
+unsigned int kmmio_count;
+static unsigned int handler_registered;
+static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
+static LIST_HEAD(kmmio_probes);
+
+static struct kmmio_context kmmio_ctx[NR_CPUS];
+
+static struct pf_handler kmmio_pf_hook = {
+	.handler = kmmio_page_fault
+};
+
+static struct notifier_block nb_die = {
+	.notifier_call = kmmio_die_notifier
+};
+
+int init_kmmio(void)
+{
+	int i;
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+		INIT_LIST_HEAD(&kmmio_page_table[i]);
+
+	register_die_notifier(&nb_die);
+	return 0;
+}
+
+void cleanup_kmmio(void)
+{
+	/*
+	 * Assume the following have already been cleaned by calling
+	 * unregister_kmmio_probe() appropriately:
+	 * kmmio_page_table, kmmio_probes
+	 */
+	if (handler_registered) {
+		unregister_page_fault_handler(&kmmio_pf_hook);
+		synchronize_rcu();
+	}
+	unregister_die_notifier(&nb_die);
+}
+
+/*
+ * This is basically a dynamic stabbing problem:
+ * Could use the existing prio tree code or
+ * Possible better implementations:
+ * The Interval Skip List: A Data Structure for Finding All Intervals That
+ * Overlap a Point (might be simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+
+/* Get the kmmio probe at this addr (if any). You must be holding kmmio_lock. */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+	struct kmmio_probe *p;
+	list_for_each_entry(p, &kmmio_probes, list) {
+		if (addr >= p->addr && addr <= (p->addr + p->len))
+			return p;
+	}
+	return NULL;
+}
+
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
+{
+	struct list_head *head, *tmp;
+
+	page &= PAGE_MASK;
+	head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
+	list_for_each(tmp, head) {
+		struct kmmio_fault_page *p
+			= list_entry(tmp, struct kmmio_fault_page, list);
+		if (p->page == page)
+			return p;
+	}
+
+	return NULL;
+}
+
+static void arm_kmmio_fault_page(unsigned long page, int *large)
+{
+	unsigned long address = page & PAGE_MASK;
+	pgd_t *pgd = pgd_offset_k(address);
+	pud_t *pud = pud_offset(pgd, address);
+	pmd_t *pmd = pmd_offset(pud, address);
+	pte_t *pte = pte_offset_kernel(pmd, address);
+
+	if (pmd_large(*pmd)) {
+		set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT));
+		if (large)
+			*large = 1;
+	} else {
+		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+	}
+
+	__flush_tlb_one(page);
+}
+
+static void disarm_kmmio_fault_page(unsigned long page, int *large)
+{
+	unsigned long address = page & PAGE_MASK;
+	pgd_t *pgd = pgd_offset_k(address);
+	pud_t *pud = pud_offset(pgd, address);
+	pmd_t *pmd = pmd_offset(pud, address);
+	pte_t *pte = pte_offset_kernel(pmd, address);
+
+	if (large && *large) {
+		set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT));
+		*large = 0;
+	} else {
+		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+	}
+
+	__flush_tlb_one(page);
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate
+ * and they remain disabled throughout this function.
+ */
+static int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+	struct kmmio_context *ctx;
+	int cpu;
+
+	/*
+	 * Preemption is now disabled to prevent a process switch during
+	 * single-stepping. We can only handle one active kmmio trace
+	 * per cpu, so ensure that we finish it before something else
+	 * gets to run.
+	 *
+	 * XXX what if an interrupt occurs between returning from
+	 * do_page_fault() and entering the single-step exception handler?
+	 * And that interrupt triggers a kmmio trap?
+	 */
+	preempt_disable();
+	cpu = smp_processor_id();
+	ctx = &kmmio_ctx[cpu];
+
+	/* interrupts disabled and CPU-local data => atomicity guaranteed. */
+	if (ctx->active) {
+		/*
+		 * This avoids a deadlock with kmmio_lock.
+		 * If this page fault really was due to a kmmio trap,
+		 * all hell breaks loose.
+		 */
+		printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, "
+					"for address %lu. Ignoring.\n",
+					cpu, addr);
+		goto no_kmmio;
+	}
+	ctx->active++;
+
+	/*
+	 * Acquire the kmmio lock to prevent changes affecting
+	 * get_kmmio_fault_page() and get_kmmio_probe(), since we save their
+	 * returned pointers.
+	 * The lock is released in post_kmmio_handler().
+	 * XXX: could/should get_kmmio_*() be using RCU instead of spinlock?
+	 */
+	spin_lock(&kmmio_lock);
+
+	ctx->fpage = get_kmmio_fault_page(addr);
+	if (!ctx->fpage) {
+		/* this page fault is not caused by kmmio */
+		goto no_kmmio_locked;
+	}
+
+	ctx->probe = get_kmmio_probe(addr);
+	ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK));
+
+	if (ctx->probe && ctx->probe->pre_handler)
+		ctx->probe->pre_handler(ctx->probe, regs, addr);
+
+	regs->flags |= TF_MASK;
+	regs->flags &= ~IF_MASK;
+
+	/* We hold the lock; now set the present bit in the PTE and single-step. */
+	disarm_kmmio_fault_page(ctx->fpage->page, NULL);
+
+	return 1;
+
+no_kmmio_locked:
+	spin_unlock(&kmmio_lock);
+	ctx->active--;
+no_kmmio:
+	preempt_enable_no_resched();
+	/* page fault not handled by kmmio */
+	return 0;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate
+ * and they remain disabled throughout this function.
+ * And we hold the kmmio lock.
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+	struct kmmio_context *ctx = &kmmio_ctx[cpu];
+
+	if (!ctx->active)
+		return 0;
+
+	if (ctx->probe && ctx->probe->post_handler)
+		ctx->probe->post_handler(ctx->probe, condition, regs);
+
+	arm_kmmio_fault_page(ctx->fpage->page, NULL);
+
+	regs->flags &= ~TF_MASK;
+	regs->flags |= ctx->saved_flags;
+
+	/* These were acquired in kmmio_handler(). */
+	ctx->active--;
+	spin_unlock(&kmmio_lock);
+	preempt_enable_no_resched();
+
+	/*
+	 * if somebody else is single-stepping across a probe point, flags
+	 * will have TF set, in which case, continue the remaining processing
+	 * of do_debug, as if this is not a probe hit.
+	 */
+	if (regs->flags & TF_MASK)
+		return 0;
+
+	return 1;
+}
+
+static int add_kmmio_fault_page(unsigned long page)
+{
+	struct kmmio_fault_page *f;
+
+	page &= PAGE_MASK;
+	f = get_kmmio_fault_page(page);
+	if (f) {
+		f->count++;
+		return 0;
+	}
+
+	f = kmalloc(sizeof(*f), GFP_ATOMIC);
+	if (!f)
+		return -1;
+
+	f->count = 1;
+	f->page = page;
+	list_add(&f->list,
+		&kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]);
+
+	arm_kmmio_fault_page(f->page, NULL);
+
+	return 0;
+}
+
+static void release_kmmio_fault_page(unsigned long page)
+{
+	struct kmmio_fault_page *f;
+
+	page &= PAGE_MASK;
+	f = get_kmmio_fault_page(page);
+	if (!f)
+		return;
+
+	f->count--;
+	if (!f->count) {
+		disarm_kmmio_fault_page(f->page, NULL);
+		list_del(&f->list);
+	}
+}
+
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+	int ret = 0;
+	unsigned long size = 0;
+
+	spin_lock_irq(&kmmio_lock);
+	kmmio_count++;
+	if (get_kmmio_probe(p->addr)) {
+		ret = -EEXIST;
+		goto out;
+	}
+	list_add(&p->list, &kmmio_probes);
+	/*printk("adding fault pages...\n");*/
+	while (size < p->len) {
+		if (add_kmmio_fault_page(p->addr + size))
+			printk(KERN_ERR "mmio: Unable to set page fault.\n");
+		size += PAGE_SIZE;
+	}
+
+	if (!handler_registered) {
+		register_page_fault_handler(&kmmio_pf_hook);
+		handler_registered++;
+	}
+
+out:
+	spin_unlock_irq(&kmmio_lock);
+	/*
+	 * XXX: What should I do here?
+	 * Here was a call to global_flush_tlb(), but it does not exist
+	 * anymore.
+	 */
+	return ret;
+}
+
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+	unsigned long size = 0;
+
+	spin_lock_irq(&kmmio_lock);
+	while (size < p->len) {
+		release_kmmio_fault_page(p->addr + size);
+		size += PAGE_SIZE;
+	}
+	list_del(&p->list);
+	kmmio_count--;
+	spin_unlock_irq(&kmmio_lock);
+}
+
+/*
+ * According to 2.6.20, mainly x86_64 arch:
+ * This is being called from do_page_fault(), via the page fault notifier
+ * chain. The chain is called for both user space faults and kernel space
+ * faults (address >= TASK_SIZE64), except not on faults serviced by
+ * vmalloc_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefetching may
+ * trigger a page fault. We may be in the middle of a process switch.
+ * The page fault hook functionality has put us inside an RCU read lock.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */ +static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + if (is_kmmio_active()) + if (kmmio_handler(regs, address) == 1) + return -1; + return 0; +} + +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args) +{ + struct die_args *arg = args; + + if (val == DIE_DEBUG) + if (post_kmmio_handler(arg->err, arg->regs) == 1) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} diff --git a/arch/x86/kernel/mmiotrace/kmmio.h b/arch/x86/kernel/mmiotrace/kmmio.h new file mode 100644 index 0000000..85b7f68 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/kmmio.h @@ -0,0 +1,58 @@ +#ifndef _LINUX_KMMIO_H +#define _LINUX_KMMIO_H + +#include +#include +#include +#include +#include +#include +#include + +struct kmmio_probe; +struct kmmio_fault_page; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; + + /* start location of the probe point */ + unsigned long addr; + + /* length of the probe region */ + unsigned long len; + + /* Called before addr is executed. */ + kmmio_pre_handler_t pre_handler; + + /* Called after addr is executed, unless... */ + kmmio_post_handler_t post_handler; +}; + +struct kmmio_fault_page { + struct list_head list; + + /* location of the fault page */ + unsigned long page; + + int count; +}; + +/* kmmio is active by some kmmio_probes? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +int init_kmmio(void); +void cleanup_kmmio(void); +int register_kmmio_probe(struct kmmio_probe *p); +void unregister_kmmio_probe(struct kmmio_probe *p); + +#endif /* _LINUX_KMMIO_H */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c new file mode 100644 index 0000000..73561fe --- /dev/null +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -0,0 +1,527 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ISA_START_ADDRESS */ + +#include "kmmio.h" +#include "pf_in.h" + +/* This app's relay channel files will appear in /debug/mmio-trace */ +#define APP_DIR "mmio-trace" +/* the marker injection file in /proc */ +#define MARKER_FILE "mmio-marker" + +#define MODULE_NAME "mmiotrace" + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +static struct trap_reason pf_reason[NR_CPUS]; +static struct mm_io_header_rw cpu_trace[NR_CPUS]; + +static struct file_operations mmio_fops = { + .owner = THIS_MODULE, +}; + +static const size_t subbuf_size = 256*1024; +static struct rchan *chan; +static struct dentry *dir; +static int suspended; /* XXX should this be per cpu? */ +static struct proc_dir_entry *proc_marker_file; + +/* module parameters */ +static unsigned int n_subbufs = 32*4; +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; + +module_param(n_subbufs, uint, 0); +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(ISA_trace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static void record_timestamp(struct mm_io_header *header) +{ + struct timespec now; + + getnstimeofday(&now); + header->sec = now.tv_sec; + header->nsec = now.tv_nsec; +} + +/* + * Write callback for the /proc entry: + * Read a marker and write it to the mmio trace log + */ +static int write_marker(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char *event = NULL; + struct mm_io_header *headp; + int len = (count > 65535) ? 65535 : count; + + event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); + if (!event) + return -ENOMEM; + + headp = (struct mm_io_header *)event; + headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); + headp->data_len = len; + record_timestamp(headp); + + if (copy_from_user(event + sizeof(*headp), buffer, len)) { + kfree(event); + return -EFAULT; + } + + relay_write(chan, event, sizeof(*headp) + len); + kfree(event); + return len; +} + +static void print_pte(unsigned long address) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud = pud_offset(pgd, address); + pmd_t *pmd = pmd_offset(pud, address); + if (pmd_large(*pmd)) { + printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " + "currently supported: %lx\n", + address); + BUG(); + } + printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", + address, + pte_val(*pte_offset_kernel(pmd, address)), + pte_val(*pte_offset_kernel(pmd, address)) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. 
+ */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const unsigned long cpu = smp_processor_id(); + printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " + "last fault for address: %lx\n", + addr, pf_reason[cpu].addr); + print_pte(addr); +#ifdef __i386__ + print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting EIP was at %s\n", + pf_reason[cpu].ip); + printk(KERN_EMERG + "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + printk(KERN_EMERG + "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting RIP was at %s\n", + pf_reason[cpu].ip); + printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + printk(KERN_EMERG "rsi: %016lx rdi: %016lx " + "rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + const unsigned long cpu = smp_processor_id(); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + + /* it doesn't make sense to have more than one active trace per cpu */ + if (pf_reason[cpu].active_traces) + die_kmmio_nesting_error(regs, addr); + else + pf_reason[cpu].active_traces++; + + pf_reason[cpu].type = type; + pf_reason[cpu].addr = addr; + pf_reason[cpu].ip = instptr; + + cpu_trace[cpu].header.type = MMIO_MAGIC; + cpu_trace[cpu].header.pid = 0; + cpu_trace[cpu].header.data_len = sizeof(struct mm_io_rw); + cpu_trace[cpu].rw.address = addr; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. + */ + if (trace_pc) + cpu_trace[cpu].rw.pc = instptr; + else + cpu_trace[cpu].rw.pc = 0; + + record_timestamp(&cpu_trace[cpu].header); + + switch (type) { + case REG_READ: + cpu_trace[cpu].header.type |= + (MMIO_READ << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + break; + case REG_WRITE: + cpu_trace[cpu].header.type |= + (MMIO_WRITE << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + cpu_trace[cpu].rw.value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + cpu_trace[cpu].header.type |= + (MMIO_WRITE << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + cpu_trace[cpu].rw.value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + cpu_trace[cpu].header.type |= + (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT); + cpu_trace[cpu].rw.value = (*ip) << 16 | + *(ip + 1) << 8 | + *(ip + 2); + } + } +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + const unsigned long cpu = smp_processor_id(); + + /* this should always return the active_trace count to 0 */ + pf_reason[cpu].active_traces--; + if (pf_reason[cpu].active_traces) { + printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); + BUG(); + } + + switch (pf_reason[cpu].type) { + case REG_READ: + cpu_trace[cpu].rw.value = get_ins_reg_val(pf_reason[cpu].ip, + regs); + break; + default: + break; + } + relay_write(chan, &cpu_trace[cpu], sizeof(struct mm_io_header_rw)); +} + +/* + * subbuf_start() relay callback. + * + * Defined so that we know when events are dropped due to the buffer-full + * condition. 
+ */ +static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + if (relay_buf_full(buf)) { + if (!suspended) { + suspended = 1; + printk(KERN_ERR MODULE_NAME + ": cpu %d buffer full!!!\n", + smp_processor_id()); + } + return 0; + } else if (suspended) { + suspended = 0; + printk(KERN_ERR MODULE_NAME + ": cpu %d buffer no longer full.\n", + smp_processor_id()); + } + + return 1; +} + +/* file_create() callback. Creates relay file in debugfs. */ +static struct dentry *create_buf_file_handler(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + struct dentry *buf_file; + + mmio_fops.read = relay_file_operations.read; + mmio_fops.open = relay_file_operations.open; + mmio_fops.poll = relay_file_operations.poll; + mmio_fops.mmap = relay_file_operations.mmap; + mmio_fops.release = relay_file_operations.release; + mmio_fops.splice_read = relay_file_operations.splice_read; + + buf_file = debugfs_create_file(filename, mode, parent, buf, + &mmio_fops); + + return buf_file; +} + +/* file_remove() default callback. Removes relay file in debugfs. */ +static int remove_buf_file_handler(struct dentry *dentry) +{ + debugfs_remove(dentry); + return 0; +} + +static struct rchan_callbacks relay_callbacks = { + .subbuf_start = subbuf_start_handler, + .create_buf_file = create_buf_file_handler, + .remove_buf_file = remove_buf_file_handler, +}; + +/* + * create_channel - creates channel /debug/APP_DIR/cpuXXX + * Returns channel on success, NULL otherwise + */ +static struct rchan *create_channel(unsigned size, unsigned n) +{ + return relay_open("cpu", dir, size, n, &relay_callbacks, NULL); +} + +/* destroy_channel - destroys channel /debug/APP_DIR/cpuXXX */ +static void destroy_channel(void) +{ + if (chan) { + relay_close(chan); + chan = NULL; + } +} + +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; +}; +static LIST_HEAD(trace_list); +static DEFINE_SPINLOCK(trace_list_lock); + +static void do_ioremap_trace_core(unsigned long offset, unsigned long size, + void __iomem *addr) +{ + struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + struct mm_io_header_map event = { + .header = { + .type = MMIO_MAGIC | + (MMIO_PROBE << MMIO_OPCODE_SHIFT), + .sec = 0, + .nsec = 0, + .pid = 0, + .data_len = sizeof(struct mm_io_map) + }, + .map = { + .phys = offset, + .addr = (unsigned long)addr, + .len = size, + .pc = 0 + } + }; + record_timestamp(&event.header); + + *trace = (struct remap_trace) { + .probe = { + .addr = (unsigned long)addr, + .len = size, + .pre_handler = pre, + .post_handler = post, + } + }; + + relay_write(chan, &event, sizeof(event)); + spin_lock(&trace_list_lock); + list_add_tail(&trace->list, &trace_list); + spin_unlock(&trace_list_lock); + if (!nommiotrace) + register_kmmio_probe(&trace->probe); +} + +static void ioremap_trace_core(unsigned long offset, unsigned long size, + void __iomem *addr) +{ + if ((filter_offset) && (offset != filter_offset)) + return; + + /* Don't trace the low PCI/ISA area, it's always mapped.. 
*/ + if (!ISA_trace && (offset < ISA_END_ADDRESS) && + (offset + size > ISA_START_ADDRESS)) { + printk(KERN_NOTICE MODULE_NAME ": Ignoring map of low " + "PCI/ISA area (0x%lx-0x%lx)\n", + offset, offset + size); + return; + } + do_ioremap_trace_core(offset, size, addr); +} + +void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) +{ + void __iomem *p = ioremap_cache(offset, size); + printk(KERN_DEBUG MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", + offset, size, p); + ioremap_trace_core(offset, size, p); + return p; +} +EXPORT_SYMBOL(ioremap_cache_trace); + +void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) +{ + void __iomem *p = ioremap_nocache(offset, size); + printk(KERN_DEBUG MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", + offset, size, p); + ioremap_trace_core(offset, size, p); + return p; +} +EXPORT_SYMBOL(ioremap_nocache_trace); + +void iounmap_trace(volatile void __iomem *addr) +{ + struct mm_io_header_map event = { + .header = { + .type = MMIO_MAGIC | + (MMIO_UNPROBE << MMIO_OPCODE_SHIFT), + .sec = 0, + .nsec = 0, + .pid = 0, + .data_len = sizeof(struct mm_io_map) + }, + .map = { + .phys = 0, + .addr = (unsigned long)addr, + .len = 0, + .pc = 0 + } + }; + struct remap_trace *trace; + struct remap_trace *tmp; + printk(KERN_DEBUG MODULE_NAME ": Unmapping %p.\n", addr); + record_timestamp(&event.header); + + spin_lock(&trace_list_lock); + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + if ((unsigned long)addr == trace->probe.addr) { + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + kfree(trace); + break; + } + } + spin_unlock(&trace_list_lock); + relay_write(chan, &event, sizeof(event)); + iounmap(addr); +} +EXPORT_SYMBOL(iounmap_trace); + +static void clear_trace_list(void) +{ + struct remap_trace *trace; + struct remap_trace *tmp; + + spin_lock(&trace_list_lock); + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + printk(KERN_WARNING MODULE_NAME ": purging non-iounmapped " + "trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + kfree(trace); + break; + } + spin_unlock(&trace_list_lock); +} + +static int __init init(void) +{ + if (n_subbufs < 2) + return -EINVAL; + + dir = debugfs_create_dir(APP_DIR, NULL); + if (!dir) { + printk(KERN_ERR MODULE_NAME + ": Couldn't create relay app directory.\n"); + return -ENOMEM; + } + + chan = create_channel(subbuf_size, n_subbufs); + if (!chan) { + debugfs_remove(dir); + printk(KERN_ERR MODULE_NAME + ": relay app channel creation failed\n"); + return -ENOMEM; + } + + init_kmmio(); + + proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); + if (proc_marker_file) + proc_marker_file->write_proc = write_marker; + + printk(KERN_DEBUG MODULE_NAME ": loaded.\n"); + if (nommiotrace) + printk(KERN_DEBUG MODULE_NAME ": MMIO tracing disabled.\n"); + if (ISA_trace) + printk(KERN_WARNING MODULE_NAME + ": Warning! 
low ISA range will be traced.\n"); + return 0; +} + +static void __exit cleanup(void) +{ + printk(KERN_DEBUG MODULE_NAME ": unload...\n"); + clear_trace_list(); + cleanup_kmmio(); + remove_proc_entry(MARKER_FILE, NULL); + destroy_channel(); + if (dir) + debugfs_remove(dir); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c new file mode 100644 index 0000000..67ea520 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/pf_in.c @@ -0,0 +1,489 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +/* $Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp $ + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include +#include /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x2E, 0x3E, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B }; +#endif /* not __i386__ */ + +static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, + int *rexr) +{ + int i; + unsigned char *p = addr; + *shorted = 0; + *enlarged = 0; + *rexr = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == 
prefix_codes[i]) { + if (*p == 0x66) + *shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + *enlarged = 1; + if ((*p & 0xf4) == 0x44) + *rexr = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int shorted, enlarged, rexr; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return (shorted ? 2 : (enlarged ? 8 : 4)); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return shorted ? 2 : (enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. 
+ */ +enum { + arg_AL = 0, + arg_CL = 1, + arg_DL = 2, + arg_BL = 3, + arg_AH = 4, + arg_CH = 5, + arg_DH = 6, + arg_BH = 7, + + arg_AX = 0, + arg_CX = 1, + arg_DX = 2, + arg_BX = 3, + arg_SP = 4, + arg_BP = 5, + arg_SI = 6, + arg_DI = 7, +#ifdef __amd64__ + arg_R8 = 8, + arg_R9 = 9, + arg_R10 = 10, + arg_R11 = 11, + arg_R12 = 12, + arg_R13 = 13, + arg_R14 = 14, + arg_R15 = 15 +#endif +}; + +static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +{ + unsigned char *rv = NULL; + + switch (no) { + case arg_AL: + rv = (unsigned char *)®s->ax; + break; + case arg_BL: + rv = (unsigned char *)®s->bx; + break; + case arg_CL: + rv = (unsigned char *)®s->cx; + break; + case arg_DL: + rv = (unsigned char *)®s->dx; + break; + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; +#ifdef __amd64__ + case arg_R8: + rv = (unsigned char *)®s->r8; + break; + case arg_R9: + rv = (unsigned char *)®s->r9; + break; + case arg_R10: + rv = (unsigned char *)®s->r10; + break; + case arg_R11: + rv = (unsigned char *)®s->r11; + break; + case arg_R12: + rv = (unsigned char *)®s->r12; + break; + case arg_R13: + rv = (unsigned char *)®s->r13; + break; + case arg_R14: + rv = (unsigned char *)®s->r14; + break; + case arg_R15: + rv = (unsigned char *)®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + break; + } + return rv; +} + +static unsigned long *get_reg_w32(int no, struct pt_regs *regs) +{ + unsigned long *rv = NULL; + + switch (no) { + case arg_AX: + rv = ®s->ax; + break; + case arg_BX: + rv = ®s->bx; + break; + case arg_CX: + rv = ®s->cx; + break; + case arg_DX: + rv = ®s->dx; + break; + case arg_SP: + rv = ®s->sp; + break; + case arg_BP: + rv = ®s->bp; + break; + case arg_SI: + rv = ®s->si; + break; + case arg_DI: + rv = ®s->di; + break; +#ifdef __amd64__ + case arg_R8: + rv = ®s->r8; + break; + case arg_R9: + rv = ®s->r9; + break; + case arg_R10: + rv = ®s->r10; + break; + case arg_R11: + rv = ®s->r11; + break; + case arg_R12: + rv = ®s->r12; + break; + case arg_R13: + rv = ®s->r13; + break; + case arg_R14: + rv = ®s->r14; + break; + case arg_R15: + rv = ®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + } + + return rv; +} + +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) +{ + unsigned int opcode; + unsigned char mod_rm; + int reg; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(reg_rop); i++) + if (reg_rop[i] == opcode) { + rv = REG_READ; + goto do_work; + } + + for (i = 0; i < ARRAY_SIZE(reg_wop); i++) + if (reg_wop[i] == opcode) { + rv = REG_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *get_reg_w8(reg, regs); + + case 2: + return *(unsigned short *)get_reg_w32(reg, regs); + + case 4: + return *(unsigned int *)get_reg_w32(reg, regs); + +#ifdef __amd64__ + case 8: + return *(unsigned long *)get_reg_w32(reg, regs); +#endif + + default: + printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); + } + +err: + return 0; +} + 
+unsigned long get_ins_imm_val(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char mod_rm; + unsigned char mod; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(imm_wop); i++) + if (imm_wop[i] == opcode) { + rv = IMM_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + mod = mod_rm >> 6; + p++; + switch (mod) { + case 0: + /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ + /* AMD64: XXX Check for address size prefix? */ + if ((mod_rm & 0x7) == 0x5) + p += 4; + break; + + case 1: + p += 1; + break; + + case 2: + p += 4; + break; + + case 3: + default: + printk(KERN_ERR "mmiotrace: not a memory access instruction " + "at 0x%lx, rm_mod=0x%02x\n", + ins_addr, mod_rm); + } + + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *(unsigned char *)p; + + case 2: + return *(unsigned short *)p; + + case 4: + return *(unsigned int *)p; + +#ifdef __amd64__ + case 8: + return *(unsigned long *)p; +#endif + + default: + printk(KERN_ERR "mmiotrace: Error: width.\n"); + } + +err: + return 0; +} diff --git a/arch/x86/kernel/mmiotrace/pf_in.h b/arch/x86/kernel/mmiotrace/pf_in.h new file mode 100644 index 0000000..e05341a --- /dev/null +++ b/arch/x86/kernel/mmiotrace/pf_in.h @@ -0,0 +1,39 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. 
+ * + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c new file mode 100644 index 0000000..40e66b0 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -0,0 +1,77 @@ +/* + * Written by Pekka Paalanen, 2008 + */ +#include +#include + +extern void __iomem *ioremap_nocache_trace(unsigned long offset, + unsigned long size); +extern void iounmap_trace(volatile void __iomem *addr); + +#define MODULE_NAME "testmmiotrace" + +static unsigned long mmio_address; +module_param(mmio_address, ulong, 0); +MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); + +static void do_write_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(i * 12 + 7, p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(i * 212371 + 13, p + i); +} + +static void do_read_test(void __iomem *p) +{ + unsigned int i; + volatile unsigned int v; + for (i = 0; i < 256; i++) + v = ioread8(p + i); + for (i = 1024; i < (5 * 1024); i += 2) + v = ioread16(p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + v = ioread32(p + i); +} + +static void do_test(void) +{ + void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); + if (!p) { + printk(KERN_ERR MODULE_NAME ": could not ioremap IO memory, " + "aborting.\n"); + return; + } + do_write_test(p); + do_read_test(p); + iounmap_trace(p); +} + +static int __init init(void) +{ + if (mmio_address == 0) { + printk(KERN_ERR MODULE_NAME ": you have to use the module " + "argument mmio_address.\n"); + printk(KERN_ERR MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + printk(KERN_WARNING MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + "in PCI address space, and writing " + "rubbish in there.\n", mmio_address); + do_test(); + return 0; +} + +static void __exit cleanup(void) +{ + printk(KERN_DEBUG MODULE_NAME ": unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h new file mode 100644 index 0000000..cb24782 --- /dev/null +++ b/include/linux/mmiotrace.h @@ -0,0 +1,62 @@ +#ifndef MMIOTRACE_H +#define MMIOTRACE_H + +#include + +#define MMIO_VERSION 0x04 + +/* mm_io_header.type */ +#define MMIO_OPCODE_MASK 0xff +#define MMIO_OPCODE_SHIFT 0 +#define MMIO_WIDTH_MASK 0xff00 +#define MMIO_WIDTH_SHIFT 8 +#define MMIO_MAGIC (0x6f000000 | (MMIO_VERSION<<16)) +#define MMIO_MAGIC_MASK 0xffff0000 + +enum mm_io_opcode { /* payload type: */ + MMIO_READ = 0x1, /* struct mm_io_rw */ + MMIO_WRITE = 0x2, /* struct mm_io_rw */ + MMIO_PROBE = 0x3, /* struct mm_io_map */ + MMIO_UNPROBE = 0x4, /* struct mm_io_map */ + MMIO_MARKER = 0x5, /* raw char data */ + MMIO_UNKNOWN_OP = 0x6, /* struct mm_io_rw */ +}; + +struct mm_io_header { + __u32 type; + __u32 sec; /* 
timestamp */ + __u32 nsec; + __u32 pid; /* PID of the process, or 0 for kernel core */ + __u16 data_len; /* length of the following payload */ +}; + +struct mm_io_rw { + __u64 address; /* virtual address of register */ + __u64 value; + __u64 pc; /* optional program counter */ +}; + +struct mm_io_map { + __u64 phys; /* base address in PCI space */ + __u64 addr; /* base virtual address */ + __u64 len; /* mapping size */ + __u64 pc; /* optional program counter */ +}; + + +/* + * These structures are used to allow a single relay_write() + * call to write a full packet. + */ + +struct mm_io_header_rw { + struct mm_io_header header; + struct mm_io_rw rw; +} __attribute__((packed)); + +struct mm_io_header_map { + struct mm_io_header header; + struct mm_io_map map; +} __attribute__((packed)); + +#endif /* MMIOTRACE_H */ -- cgit v0.10.2 From 75bb88350e0501b3cf5ac096a1008757844414a9 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:56 +0200 Subject: x86 mmiotrace: use lookup_address() Use lookup_address() from pageattr.c instead of doing the same manually. Also had to EXPORT_SYMBOL_GPL(lookup_address) to make this work for modules. This also fixes "undefined symbol 'init_mm'" compile error for x86_32. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 8ba48f9..28411da 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "kmmio.h" @@ -117,40 +118,55 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void arm_kmmio_fault_page(unsigned long page, int *large) +static void arm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud = pud_offset(pgd, address); - pmd_t *pmd = pmd_offset(pud, address); - pte_t *pte = pte_offset_kernel(pmd, address); + int level; + pte_t *pte = lookup_address(address, &level); - if (pmd_large(*pmd)) { + if (!pte) { + printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", + __FUNCTION__, page); + return; + } + + if (level == PG_LEVEL_2M) { + pmd_t *pmd = (pmd_t *)pte; set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT)); - if (large) - *large = 1; } else { + /* PG_LEVEL_4K */ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); } + if (page_level) + *page_level = level; + __flush_tlb_one(page); } -static void disarm_kmmio_fault_page(unsigned long page, int *large) +static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud = pud_offset(pgd, address); - pmd_t *pmd = pmd_offset(pud, address); - pte_t *pte = pte_offset_kernel(pmd, address); + int level; + pte_t *pte = lookup_address(address, &level); - if (large && *large) { + if (!pte) { + printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", + __FUNCTION__, page); + return; + } + + if (level == PG_LEVEL_2M) { + pmd_t *pmd = (pmd_t *)pte; set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT)); - *large = 0; } else { + /* PG_LEVEL_4K */ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); } + if (page_level) + *page_level = level; + __flush_tlb_one(page); } diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 73561fe..e43947d 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ 
b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -120,19 +120,24 @@ static int write_marker(struct file *file, const char __user *buffer, static void print_pte(unsigned long address) { - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud = pud_offset(pgd, address); - pmd_t *pmd = pmd_offset(pud, address); - if (pmd_large(*pmd)) { + int level; + pte_t *pte = lookup_address(address, &level); + + if (!pte) { + printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", + __FUNCTION__, address); + return; + } + + if (level == PG_LEVEL_2M) { printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " "currently supported: %lx\n", address); BUG(); } printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", - address, - pte_val(*pte_offset_kernel(pmd, address)), - pte_val(*pte_offset_kernel(pmd, address)) & _PAGE_PRESENT); + address, pte_val(*pte), + pte_val(*pte) & _PAGE_PRESENT); } /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b..57970f2 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -227,6 +227,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) return pte_offset_kernel(pmd, address); } +EXPORT_SYMBOL_GPL(lookup_address); /* * Set the new pmd in all the pgds we know about: -- cgit v0.10.2 From fe1ffafa80f6673101c6560c2bacfe3df10372ee Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:56 +0200 Subject: x86 mmiotrace: fix relay-buffer-full flag for SMP Relay has per-cpu buffers, but mmiotrace was using only a single flag for detecting buffer full/not-full transitions. The new code makes this per-cpu and actually counts missed events. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index e43947d..0019dcd 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -29,6 +29,7 @@ #include #include #include /* for ISA_START_ADDRESS */ +#include #include "kmmio.h" #include "pf_in.h" @@ -47,9 +48,13 @@ struct trap_reason { int active_traces; }; +/* Accessed per-cpu. */ static struct trap_reason pf_reason[NR_CPUS]; static struct mm_io_header_rw cpu_trace[NR_CPUS]; +/* Access to this is not per-cpu. */ +static atomic_t dropped[NR_CPUS]; + static struct file_operations mmio_fops = { .owner = THIS_MODULE, }; @@ -57,7 +62,6 @@ static struct file_operations mmio_fops = { static const size_t subbuf_size = 256*1024; static struct rchan *chan; static struct dentry *dir; -static int suspended; /* XXX should this be per cpu? 
*/ static struct proc_dir_entry *proc_marker_file; /* module parameters */ @@ -269,19 +273,21 @@ static void post(struct kmmio_probe *p, unsigned long condition, static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, size_t prev_padding) { + unsigned int cpu = buf->cpu; + atomic_t *drop = &dropped[cpu]; + int count; if (relay_buf_full(buf)) { - if (!suspended) { - suspended = 1; - printk(KERN_ERR MODULE_NAME - ": cpu %d buffer full!!!\n", - smp_processor_id()); + if (atomic_inc_return(drop) == 1) { + printk(KERN_ERR MODULE_NAME ": cpu %d buffer full!\n", + cpu); } return 0; - } else if (suspended) { - suspended = 0; + } else if ((count = atomic_read(drop))) { printk(KERN_ERR MODULE_NAME - ": cpu %d buffer no longer full.\n", - smp_processor_id()); + ": cpu %d buffer no longer full, " + "missed %d events.\n", + cpu, count); + atomic_sub(count, drop); } return 1; -- cgit v0.10.2 From 63ffa3e456c1a9884a3ebac997d91e3fdae18d78 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86 mmiotrace: comment about user space ABI Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index cb24782..6ec288f 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -3,6 +3,10 @@ #include +/* + * If you change anything here, you must bump MMIO_VERSION. + * This is the relay data format for user space. + */ #define MMIO_VERSION 0x04 /* mm_io_header.type */ @@ -23,7 +27,7 @@ enum mm_io_opcode { /* payload type: */ }; struct mm_io_header { - __u32 type; + __u32 type; /* see MMIO_* macros above */ __u32 sec; /* timestamp */ __u32 nsec; __u32 pid; /* PID of the process, or 0 for kernel core */ -- cgit v0.10.2 From 10c43d2eb50c9a5ad60388b9d3c41c31150049e6 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: explicit call to mmiotrace in do_page_fault() The custom page fault handler list is replaced with a single function pointer. All related functions and variables are renamed for mmiotrace. Signed-off-by: Pekka Paalanen Cc: Christoph Hellwig Cc: Arjan van de Ven Cc: pq@iki.fi Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7c6496e..9491c0a 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -168,20 +168,18 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config PAGE_FAULT_HANDLERS - bool "Custom page fault handlers" - depends on DEBUG_KERNEL - help - Allow the use of custom page fault handlers. A kernel module may - register a function that is called on every page fault. Custom - handlers are used by some debugging and reverse engineering tools. +config MMIOTRACE_HOOKS + bool + default n config MMIOTRACE tristate "Memory mapped IO tracing" - depends on DEBUG_KERNEL && PAGE_FAULT_HANDLERS && RELAY && DEBUG_FS + depends on DEBUG_KERNEL && RELAY && DEBUG_FS + select MMIOTRACE_HOOKS default n help This will build a kernel module called mmiotrace. + Making this a built-in is heavily discouraged. Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. 
The kernel module offers wrapped diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 28411da..e759f7c 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -51,10 +51,6 @@ static LIST_HEAD(kmmio_probes); static struct kmmio_context kmmio_ctx[NR_CPUS]; -static struct pf_handler kmmio_pf_hook = { - .handler = kmmio_page_fault -}; - static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; @@ -77,7 +73,8 @@ void cleanup_kmmio(void) * kmmio_page_table, kmmio_probes */ if (handler_registered) { - unregister_page_fault_handler(&kmmio_pf_hook); + if (mmiotrace_unregister_pf(&kmmio_page_fault)) + BUG(); synchronize_rcu(); } unregister_die_notifier(&nb_die); @@ -343,8 +340,11 @@ int register_kmmio_probe(struct kmmio_probe *p) } if (!handler_registered) { - register_page_fault_handler(&kmmio_pf_hook); - handler_registered++; + if (mmiotrace_register_pf(&kmmio_page_fault)) + printk(KERN_ERR "mmiotrace: Cannot register page " + "fault handler.\n"); + else + handler_registered++; } out: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 343f5c1..e9a086a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -49,53 +49,55 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -#ifdef CONFIG_PAGE_FAULT_HANDLERS -static HLIST_HEAD(pf_handlers); /* protected by RCU */ -static DEFINE_SPINLOCK(pf_handlers_writer); +#ifdef CONFIG_MMIOTRACE_HOOKS +static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */ +static DEFINE_SPINLOCK(mmiotrace_handler_lock); -void register_page_fault_handler(struct pf_handler *new_pfh) +int mmiotrace_register_pf(pf_handler_func new_pfh) { + int ret = 0; unsigned long flags; - spin_lock_irqsave(&pf_handlers_writer, flags); - hlist_add_head_rcu(&new_pfh->hlist, &pf_handlers); - spin_unlock_irqrestore(&pf_handlers_writer, flags); + spin_lock_irqsave(&mmiotrace_handler_lock, flags); + if (mmiotrace_pf_handler) + ret = -EBUSY; + else + mmiotrace_pf_handler = new_pfh; + spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); + return ret; } -EXPORT_SYMBOL_GPL(register_page_fault_handler); +EXPORT_SYMBOL_GPL(mmiotrace_register_pf); /** - * unregister_page_fault_handler: + * mmiotrace_unregister_pf: * The caller must ensure @old_pfh is not in use anymore before freeing it. - * This function does not guarantee it. The list of handlers is protected by - * RCU, so you can do this by e.g. calling synchronize_rcu(). + * This function does not guarantee it. The handler function pointer is + * protected by RCU, so you can do this by e.g. calling synchronize_rcu(). 
*/ -void unregister_page_fault_handler(struct pf_handler *old_pfh) +int mmiotrace_unregister_pf(pf_handler_func old_pfh) { + int ret = 0; unsigned long flags; - spin_lock_irqsave(&pf_handlers_writer, flags); - hlist_del_rcu(&old_pfh->hlist); - spin_unlock_irqrestore(&pf_handlers_writer, flags); + spin_lock_irqsave(&mmiotrace_handler_lock, flags); + if (mmiotrace_pf_handler != old_pfh) + ret = -EPERM; + else + mmiotrace_pf_handler = NULL; + spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); + return ret; } -EXPORT_SYMBOL_GPL(unregister_page_fault_handler); -#endif +EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf); +#endif /* CONFIG_MMIOTRACE_HOOKS */ /* returns non-zero if do_page_fault() should return */ -static int handle_custom_pf(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static inline int call_mmiotrace(struct pt_regs *regs, + unsigned long error_code, + unsigned long address) { -#ifdef CONFIG_PAGE_FAULT_HANDLERS +#ifdef CONFIG_MMIOTRACE_HOOKS int ret = 0; - struct pf_handler *cur; - struct hlist_node *ncur; - - if (hlist_empty(&pf_handlers)) - return 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(cur, ncur, &pf_handlers, hlist) { - ret = cur->handler(regs, error_code, address); - if (ret) - break; - } + if (mmiotrace_pf_handler) + ret = mmiotrace_pf_handler(regs, error_code, address); rcu_read_unlock(); return ret; #else @@ -655,7 +657,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (notify_page_fault(regs)) return; - if (handle_custom_pf(regs, error_code, address)) + if (call_mmiotrace(regs, error_code, address)) return; /* diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index a80f2d6..7063281 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -35,13 +35,11 @@ extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -struct pf_handler { - struct hlist_node hlist; - int (*handler)(struct pt_regs *regs, unsigned long error_code, - unsigned long address); -}; +typedef int (*pf_handler_func)(struct pt_regs *regs, + unsigned long error_code, + unsigned long address); -extern void register_page_fault_handler(struct pf_handler *new_pfh); -extern void unregister_page_fault_handler(struct pf_handler *old_pfh); +extern int mmiotrace_register_pf(pf_handler_func new_pfh); +extern int mmiotrace_unregister_pf(pf_handler_func old_pfh); #endif -- cgit v0.10.2 From f513638030ca384b0bace4df64f0b82f6ae1e4c6 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86 mmiotrace: Use percpu instead of arrays. 
Signed-off-by: Pekka Paalanen Cc: Eric Dumazet Cc: pq@iki.fi Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index e759f7c..5e239d0 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,8 @@ static unsigned int handler_registered; static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); -static struct kmmio_context kmmio_ctx[NR_CPUS]; +/* Accessed per-cpu */ +static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier @@ -173,8 +175,7 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) */ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) { - struct kmmio_context *ctx; - int cpu; + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); /* * Preemption is now disabled to prevent process switch during @@ -187,8 +188,6 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) * And that interrupt triggers a kmmio trap? */ preempt_disable(); - cpu = smp_processor_id(); - ctx = &kmmio_ctx[cpu]; /* interrupts disabled and CPU-local data => atomicity guaranteed. */ if (ctx->active) { @@ -199,7 +198,7 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) */ printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, " "for address %lu. Ignoring.\n", - cpu, addr); + smp_processor_id(), addr); goto no_kmmio; } ctx->active++; @@ -231,6 +230,7 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) /* We hold lock, now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); + put_cpu_var(kmmio_ctx); return 1; no_kmmio_locked: @@ -238,6 +238,7 @@ no_kmmio_locked: ctx->active--; no_kmmio: preempt_enable_no_resched(); + put_cpu_var(kmmio_ctx); /* page fault not handled by kmmio */ return 0; } @@ -249,11 +250,11 @@ no_kmmio: */ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { - int cpu = smp_processor_id(); - struct kmmio_context *ctx = &kmmio_ctx[cpu]; + int ret = 0; + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) - return 0; + goto out; if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); @@ -273,10 +274,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * will have TF set, in which case, continue the remaining processing * of do_debug, as if this is not a probe hit. */ - if (regs->flags & TF_MASK) - return 0; + if (!(regs->flags & TF_MASK)) + ret = 1; - return 1; +out: + put_cpu_var(kmmio_ctx); + return ret; } static int add_kmmio_fault_page(unsigned long page) diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 0019dcd..f9c6092 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -30,6 +30,7 @@ #include #include /* for ISA_START_ADDRESS */ #include +#include #include "kmmio.h" #include "pf_in.h" @@ -49,11 +50,11 @@ struct trap_reason { }; /* Accessed per-cpu. */ -static struct trap_reason pf_reason[NR_CPUS]; -static struct mm_io_header_rw cpu_trace[NR_CPUS]; +static DEFINE_PER_CPU(struct trap_reason, pf_reason); +static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); /* Access to this is not per-cpu. 
*/ -static atomic_t dropped[NR_CPUS]; +static DEFINE_PER_CPU(atomic_t, dropped); static struct file_operations mmio_fops = { .owner = THIS_MODULE, @@ -150,15 +151,15 @@ static void print_pte(unsigned long address) */ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { - const unsigned long cpu = smp_processor_id(); + const struct trap_reason *my_reason = &get_cpu_var(pf_reason); printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " "last fault for address: %lx\n", - addr, pf_reason[cpu].addr); + addr, my_reason->addr); print_pte(addr); #ifdef __i386__ print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting EIP was at %s\n", - pf_reason[cpu].ip); + my_reason->ip); printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); @@ -168,100 +169,105 @@ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) #else print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting RIP was at %s\n", - pf_reason[cpu].ip); + my_reason->ip); printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); printk(KERN_EMERG "rsi: %016lx rdi: %016lx " "rbp: %016lx rsp: %016lx\n", regs->si, regs->di, regs->bp, regs->sp); #endif + put_cpu_var(pf_reason); BUG(); } static void pre(struct kmmio_probe *p, struct pt_regs *regs, unsigned long addr) { - const unsigned long cpu = smp_processor_id(); + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); /* it doesn't make sense to have more than one active trace per cpu */ - if (pf_reason[cpu].active_traces) + if (my_reason->active_traces) die_kmmio_nesting_error(regs, addr); else - pf_reason[cpu].active_traces++; + my_reason->active_traces++; - pf_reason[cpu].type = type; - pf_reason[cpu].addr = addr; - pf_reason[cpu].ip = instptr; + my_reason->type = type; + my_reason->addr = addr; + my_reason->ip = instptr; - cpu_trace[cpu].header.type = MMIO_MAGIC; - cpu_trace[cpu].header.pid = 0; - cpu_trace[cpu].header.data_len = sizeof(struct mm_io_rw); - cpu_trace[cpu].rw.address = addr; + my_trace->header.type = MMIO_MAGIC; + my_trace->header.pid = 0; + my_trace->header.data_len = sizeof(struct mm_io_rw); + my_trace->rw.address = addr; /* * Only record the program counter when requested. * It may taint clean-room reverse engineering. 
*/ if (trace_pc) - cpu_trace[cpu].rw.pc = instptr; + my_trace->rw.pc = instptr; else - cpu_trace[cpu].rw.pc = 0; + my_trace->rw.pc = 0; - record_timestamp(&cpu_trace[cpu].header); + record_timestamp(&my_trace->header); switch (type) { case REG_READ: - cpu_trace[cpu].header.type |= + my_trace->header.type |= (MMIO_READ << MMIO_OPCODE_SHIFT) | (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); break; case REG_WRITE: - cpu_trace[cpu].header.type |= + my_trace->header.type |= (MMIO_WRITE << MMIO_OPCODE_SHIFT) | (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - cpu_trace[cpu].rw.value = get_ins_reg_val(instptr, regs); + my_trace->rw.value = get_ins_reg_val(instptr, regs); break; case IMM_WRITE: - cpu_trace[cpu].header.type |= + my_trace->header.type |= (MMIO_WRITE << MMIO_OPCODE_SHIFT) | (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - cpu_trace[cpu].rw.value = get_ins_imm_val(instptr); + my_trace->rw.value = get_ins_imm_val(instptr); break; default: { unsigned char *ip = (unsigned char *)instptr; - cpu_trace[cpu].header.type |= + my_trace->header.type |= (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT); - cpu_trace[cpu].rw.value = (*ip) << 16 | - *(ip + 1) << 8 | - *(ip + 2); + my_trace->rw.value = (*ip) << 16 | *(ip + 1) << 8 | + *(ip + 2); } } + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); } static void post(struct kmmio_probe *p, unsigned long condition, struct pt_regs *regs) { - const unsigned long cpu = smp_processor_id(); + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); /* this should always return the active_trace count to 0 */ - pf_reason[cpu].active_traces--; - if (pf_reason[cpu].active_traces) { + my_reason->active_traces--; + if (my_reason->active_traces) { printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); BUG(); } - switch (pf_reason[cpu].type) { + switch (my_reason->type) { case REG_READ: - cpu_trace[cpu].rw.value = get_ins_reg_val(pf_reason[cpu].ip, - regs); + my_trace->rw.value = get_ins_reg_val(my_reason->ip, regs); break; default: break; } - relay_write(chan, &cpu_trace[cpu], sizeof(struct mm_io_header_rw)); + relay_write(chan, my_trace, sizeof(*my_trace)); + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); } /* @@ -274,7 +280,7 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, size_t prev_padding) { unsigned int cpu = buf->cpu; - atomic_t *drop = &dropped[cpu]; + atomic_t *drop = &per_cpu(dropped, cpu); int count; if (relay_buf_full(buf)) { if (atomic_inc_return(drop) == 1) { -- cgit v0.10.2 From 0fd0e3da4557c479b820b9a4a7afa25b4637ddf2 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: mmiotrace full patch, preview 1 kmmio.c handles the list of mmio probes with callbacks, list of traced pages, and attaching into the page fault handler and die notifier. It arms, traps and disarms the given pages, this is the core of mmiotrace. mmio-mod.c is a user interface, hooking into ioremap functions and registering the mmio probes. It also decodes the required information from trapped mmio accesses via the pre and post callbacks in each probe. Currently, hooking into ioremap functions works by redefining the symbols of the target (binary) kernel module, so that it calls the traced versions of the functions. 
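At this stage the redirection is purely a build-time arrangement. A hypothetical wrapper header sketches one way to read the mechanism described above; the #define names are an illustration and not a file in this series, though ioremap_cache_trace, ioremap_nocache_trace and iounmap_trace are the real exports of mmio-mod.c:

/*
 * mmiotrace-wrap.h -- illustration only. Included after the usual io
 * headers in the driver to be traced, it makes the driver's mapping
 * calls resolve to the traced wrappers exported by mmio-mod.c.
 */
extern void __iomem *ioremap_cache_trace(unsigned long offset,
						unsigned long size);
extern void __iomem *ioremap_nocache_trace(unsigned long offset,
						unsigned long size);
extern void iounmap_trace(volatile void __iomem *addr);

#define ioremap_cache(offset, size)	ioremap_cache_trace(offset, size)
#define ioremap_nocache(offset, size)	ioremap_nocache_trace(offset, size)
#define iounmap(addr)			iounmap_trace(addr)

The preview 2 patch further below removes the need for any such wrapping by calling the mmiotrace entry points directly from ioremap.c.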
The most notable changes done since the last discussion are:
- kmmio.c is a built-in, not part of the module
- direct call from fault.c to kmmio.c, removing all dynamic hooks
- prepare for unregistering probes at any time
- make kmmio re-initializable and accessible to more than one user
- rewrite kmmio locking to remove all spinlocks from page fault path

Can I abuse call_rcu() like I do in kmmio.c:unregister_kmmio_probe(),
or is there a better way? The function called via call_rcu() itself
calls call_rcu() again; will this work or break? There I need a second
RCU grace period after the first grace period for page faults.

Mmiotrace itself (mmio-mod.c) is still a module; I am going to attack
that next. At some point I will start looking into how to make
mmiotrace a tracer component of ftrace (thanks for the hint, Ingo).
Ftrace should make the user space part of mmiotracing as simple as
'cat /debug/trace/mmio > dump.txt'.

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner

diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 027a5b6..a4f93b4 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,7 +15,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
 EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
-EXPORT_SYMBOL_GPL(init_mm);

 /*
  * Initial thread structure.

diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile
index d6905f7..cf1e747 100644
--- a/arch/x86/kernel/mmiotrace/Makefile
+++ b/arch/x86/kernel/mmiotrace/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
-mmiotrace-objs := pf_in.o kmmio.o mmio-mod.o
-
-obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
+obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
+obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
+mmiotrace-objs := pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o

diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c
index 5e239d0..539a9b1 100644
--- a/arch/x86/kernel/mmiotrace/kmmio.c
+++ b/arch/x86/kernel/mmiotrace/kmmio.c
@@ -6,6 +6,7 @@
  */

 #include
+#include
 #include
 #include
 #include
@@ -17,70 +18,119 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
-#include "kmmio.h"
+#include

-#define KMMIO_HASH_BITS 6
-#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS)
 #define KMMIO_PAGE_HASH_BITS 4
 #define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)

+struct kmmio_fault_page {
+	struct list_head list;
+	struct kmmio_fault_page *release_next;
+	unsigned long page; /* location of the fault page */
+
+	/*
+	 * Number of times this page has been registered as a part
+	 * of a probe. If zero, page is disarmed and this may be freed.
+	 * Used only by writers (RCU).
+ */ + int count; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + struct kmmio_context { struct kmmio_fault_page *fpage; struct kmmio_probe *probe; unsigned long saved_flags; + unsigned long addr; int active; }; -static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address); static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args); +static DECLARE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); /* These are protected by kmmio_lock */ +static int kmmio_initialized; unsigned int kmmio_count; -static unsigned int handler_registered; + +/* Read-protected by RCU, write-protected by kmmio_lock. */ static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); +static struct list_head *kmmio_page_list(unsigned long page) +{ + return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; +} + /* Accessed per-cpu */ static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); +/* protected by kmmio_init_mutex */ static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; -int init_kmmio(void) +/** + * Makes sure kmmio is initialized and usable. + * This must be called before any other kmmio function defined here. + * May sleep. + */ +void reference_kmmio(void) { - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - - register_die_notifier(&nb_die); - return 0; + down(&kmmio_init_mutex); + spin_lock_irq(&kmmio_lock); + if (!kmmio_initialized) { + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + if (register_die_notifier(&nb_die)) + BUG(); + } + kmmio_initialized++; + spin_unlock_irq(&kmmio_lock); + up(&kmmio_init_mutex); } +EXPORT_SYMBOL_GPL(reference_kmmio); -void cleanup_kmmio(void) +/** + * Clean up kmmio after use. This must be called for every call to + * reference_kmmio(). All probes registered after the corresponding + * reference_kmmio() must have been unregistered when calling this. + * May sleep. + */ +void unreference_kmmio(void) { - /* - * Assume the following have been already cleaned by calling - * unregister_kmmio_probe() appropriately: - * kmmio_page_table, kmmio_probes - */ - if (handler_registered) { - if (mmiotrace_unregister_pf(&kmmio_page_fault)) - BUG(); - synchronize_rcu(); + bool unreg = false; + + down(&kmmio_init_mutex); + spin_lock_irq(&kmmio_lock); + + if (kmmio_initialized == 1) { + BUG_ON(is_kmmio_active()); + unreg = true; } - unregister_die_notifier(&nb_die); + kmmio_initialized--; + BUG_ON(kmmio_initialized < 0); + spin_unlock_irq(&kmmio_lock); + + if (unreg) + unregister_die_notifier(&nb_die); /* calls sync_rcu() */ + up(&kmmio_init_mutex); } +EXPORT_SYMBOL(unreference_kmmio); /* * this is basically a dynamic stabbing problem: @@ -90,33 +140,33 @@ void cleanup_kmmio(void) * Overlap a Point (might be simple) * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup */ -/* Get the kmmio at this addr (if any). You must be holding kmmio_lock. */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) { struct kmmio_probe *p; - list_for_each_entry(p, &kmmio_probes, list) { + list_for_each_entry_rcu(p, &kmmio_probes, list) { if (addr >= p->addr && addr <= (p->addr + p->len)) return p; } return NULL; } +/* You must be holding RCU read lock. 
*/ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) { - struct list_head *head, *tmp; + struct list_head *head; + struct kmmio_fault_page *p; page &= PAGE_MASK; - head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; - list_for_each(tmp, head) { - struct kmmio_fault_page *p - = list_entry(tmp, struct kmmio_fault_page, list); + head = kmmio_page_list(page); + list_for_each_entry_rcu(p, head, list) { if (p->page == page) return p; } - return NULL; } +/** Mark the given page as not present. Access to it will trigger a fault. */ static void arm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; @@ -124,8 +174,8 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, page); + pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", + __func__, page); return; } @@ -143,6 +193,7 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level) __flush_tlb_one(page); } +/** Mark the given page as present. */ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; @@ -150,8 +201,8 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, page); + pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", + __func__, page); return; } @@ -170,12 +221,24 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) } /* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefecthing may + * trigger a page fault. We may be in the middle of process switch. + * We cannot take any locks, because we could be executing especially + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ +/* * Interrupts are disabled on entry as trap3 is an interrupt gate * and they remain disabled thorough out this function. */ -static int kmmio_handler(struct pt_regs *regs, unsigned long addr) +int kmmio_handler(struct pt_regs *regs, unsigned long addr) { - struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; /* * Preemption is now disabled to prevent process switch during @@ -186,40 +249,40 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) * XXX what if an interrupt occurs between returning from * do_page_fault() and entering the single-step exception handler? * And that interrupt triggers a kmmio trap? + * XXX If we tracing an interrupt service routine or whatever, is + * this enough to keep it on the current cpu? */ preempt_disable(); - /* interrupts disabled and CPU-local data => atomicity guaranteed. */ + rcu_read_lock(); + faultpage = get_kmmio_fault_page(addr); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. In the latter case all hell breaks loose. + */ + goto no_kmmio; + } + + ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { /* - * This avoids a deadlock with kmmio_lock. + * Prevent overwriting already in-flight context. * If this page fault really was due to kmmio trap, * all hell breaks loose. 
*/ - printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, " - "for address %lu. Ignoring.\n", + pr_emerg("kmmio: recursive probe hit on CPU %d, " + "for address 0x%08lx. Ignoring.\n", smp_processor_id(), addr); - goto no_kmmio; + goto no_kmmio_ctx; } ctx->active++; - /* - * Acquire the kmmio lock to prevent changes affecting - * get_kmmio_fault_page() and get_kmmio_probe(), since we save their - * returned pointers. - * The lock is released in post_kmmio_handler(). - * XXX: could/should get_kmmio_*() be using RCU instead of spinlock? - */ - spin_lock(&kmmio_lock); - - ctx->fpage = get_kmmio_fault_page(addr); - if (!ctx->fpage) { - /* this page fault is not caused by kmmio */ - goto no_kmmio_locked; - } - + ctx->fpage = faultpage; ctx->probe = get_kmmio_probe(addr); ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK)); + ctx->addr = addr; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -227,46 +290,62 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) regs->flags |= TF_MASK; regs->flags &= ~IF_MASK; - /* We hold lock, now we set present bit in PTE and single step. */ + /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); put_cpu_var(kmmio_ctx); + rcu_read_unlock(); return 1; -no_kmmio_locked: - spin_unlock(&kmmio_lock); - ctx->active--; +no_kmmio_ctx: + put_cpu_var(kmmio_ctx); no_kmmio: + rcu_read_unlock(); preempt_enable_no_resched(); - put_cpu_var(kmmio_ctx); - /* page fault not handled by kmmio */ - return 0; + return 0; /* page fault not handled by kmmio */ } /* * Interrupts are disabled on entry as trap1 is an interrupt gate * and they remain disabled thorough out this function. - * And we hold kmmio lock. + * This must always get called as the pair to kmmio_handler(). */ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { int ret = 0; + struct kmmio_probe *probe; + struct kmmio_fault_page *faultpage; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) goto out; + rcu_read_lock(); + + faultpage = get_kmmio_fault_page(ctx->addr); + probe = get_kmmio_probe(ctx->addr); + if (faultpage != ctx->fpage || probe != ctx->probe) { + /* + * The trace setup changed after kmmio_handler() and before + * running this respective post handler. User does not want + * the result anymore. + */ + ctx->probe = NULL; + ctx->fpage = NULL; + } + if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - arm_kmmio_fault_page(ctx->fpage->page, NULL); + if (ctx->fpage) + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~TF_MASK; regs->flags |= ctx->saved_flags; /* These were acquired in kmmio_handler(). */ ctx->active--; - spin_unlock(&kmmio_lock); + BUG_ON(ctx->active); preempt_enable_no_resched(); /* @@ -277,11 +356,13 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (!(regs->flags & TF_MASK)) ret = 1; + rcu_read_unlock(); out: put_cpu_var(kmmio_ctx); return ret; } +/* You must be holding kmmio_lock. 
*/ static int add_kmmio_fault_page(unsigned long page) { struct kmmio_fault_page *f; @@ -289,6 +370,8 @@ static int add_kmmio_fault_page(unsigned long page) page &= PAGE_MASK; f = get_kmmio_fault_page(page); if (f) { + if (!f->count) + arm_kmmio_fault_page(f->page, NULL); f->count++; return 0; } @@ -299,15 +382,16 @@ static int add_kmmio_fault_page(unsigned long page) f->count = 1; f->page = page; - list_add(&f->list, - &kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]); + list_add_rcu(&f->list, kmmio_page_list(f->page)); arm_kmmio_fault_page(f->page, NULL); return 0; } -static void release_kmmio_fault_page(unsigned long page) +/* You must be holding kmmio_lock. */ +static void release_kmmio_fault_page(unsigned long page, + struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; @@ -317,9 +401,11 @@ static void release_kmmio_fault_page(unsigned long page) return; f->count--; + BUG_ON(f->count < 0); if (!f->count) { disarm_kmmio_fault_page(f->page, NULL); - list_del(&f->list); + f->release_next = *release_list; + *release_list = f; } } @@ -334,68 +420,113 @@ int register_kmmio_probe(struct kmmio_probe *p) ret = -EEXIST; goto out; } - list_add(&p->list, &kmmio_probes); - /*printk("adding fault pages...\n");*/ + list_add_rcu(&p->list, &kmmio_probes); while (size < p->len) { if (add_kmmio_fault_page(p->addr + size)) - printk(KERN_ERR "mmio: Unable to set page fault.\n"); + pr_err("kmmio: Unable to set page fault.\n"); size += PAGE_SIZE; } - - if (!handler_registered) { - if (mmiotrace_register_pf(&kmmio_page_fault)) - printk(KERN_ERR "mmiotrace: Cannot register page " - "fault handler.\n"); - else - handler_registered++; - } - out: spin_unlock_irq(&kmmio_lock); /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist - * anymore. + * anymore. It seems it's not needed after all. */ return ret; } +EXPORT_SYMBOL(register_kmmio_probe); +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + while (p) { + struct kmmio_fault_page *next = p->release_next; + BUG_ON(p->count); + kfree(p); + p = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + spin_lock_irqsave(&kmmio_lock, flags); + while (p) { + if (!p->count) + list_del_rcu(&p->list); + else + *prevp = p->release_next; + prevp = &p->release_next; + p = p->release_next; + } + spin_unlock_irqrestore(&kmmio_lock, flags); + /* This is the real RCU destroy call. */ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actally free the kmmio_fault_page structs as with RCU. 
+ */ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long size = 0; + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; spin_lock_irq(&kmmio_lock); while (size < p->len) { - release_kmmio_fault_page(p->addr + size); + release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } - list_del(&p->list); + list_del_rcu(&p->list); kmmio_count--; spin_unlock_irq(&kmmio_lock); -} -/* - * According to 2.6.20, mainly x86_64 arch: - * This is being called from do_page_fault(), via the page fault notifier - * chain. The chain is called for both user space faults and kernel space - * faults (address >= TASK_SIZE64), except not on faults serviced by - * vmalloc_fault(). - * - * We may be in an interrupt or a critical section. Also prefecthing may - * trigger a page fault. We may be in the middle of process switch. - * The page fault hook functionality has put us inside RCU read lock. - * - * Local interrupts are disabled, so preemption cannot happen. - * Do not enable interrupts, do not sleep, and watch out for other CPUs. - */ -static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) -{ - if (is_kmmio_active()) - if (kmmio_handler(regs, address) == 1) - return -1; - return 0; + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. RCU grace period sounds like a + * good choice. + * + * If we removed the pages too early, kmmio page fault handler might + * not find the respective kmmio_fault_page and determine it's not + * a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); } +EXPORT_SYMBOL(unregister_kmmio_probe); static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) diff --git a/arch/x86/kernel/mmiotrace/kmmio.h b/arch/x86/kernel/mmiotrace/kmmio.h deleted file mode 100644 index 85b7f68..0000000 --- a/arch/x86/kernel/mmiotrace/kmmio.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef _LINUX_KMMIO_H -#define _LINUX_KMMIO_H - -#include -#include -#include -#include -#include -#include -#include - -struct kmmio_probe; -struct kmmio_fault_page; -struct pt_regs; - -typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, - struct pt_regs *, unsigned long addr); -typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, - unsigned long condition, struct pt_regs *); - -struct kmmio_probe { - struct list_head list; - - /* start location of the probe point */ - unsigned long addr; - - /* length of the probe region */ - unsigned long len; - - /* Called before addr is executed. */ - kmmio_pre_handler_t pre_handler; - - /* Called after addr is executed, unless... */ - kmmio_post_handler_t post_handler; -}; - -struct kmmio_fault_page { - struct list_head list; - - /* location of the fault page */ - unsigned long page; - - int count; -}; - -/* kmmio is active by some kmmio_probes? 
*/ -static inline int is_kmmio_active(void) -{ - extern unsigned int kmmio_count; - return kmmio_count; -} - -int init_kmmio(void); -void cleanup_kmmio(void); -int register_kmmio_probe(struct kmmio_probe *p); -void unregister_kmmio_probe(struct kmmio_probe *p); - -#endif /* _LINUX_KMMIO_H */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index f9c6092..e1a5085 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -32,7 +32,6 @@ #include #include -#include "kmmio.h" #include "pf_in.h" /* This app's relay channel files will appear in /debug/mmio-trace */ @@ -129,18 +128,17 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, address); + pr_err(MODULE_NAME ": Error in %s: no pte for page 0x%08lx\n", + __func__, address); return; } if (level == PG_LEVEL_2M) { - printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " - "currently supported: %lx\n", - address); + pr_emerg(MODULE_NAME ": 4MB pages are not currently " + "supported: %lx\n", address); BUG(); } - printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", + pr_info(MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), pte_val(*pte) & _PAGE_PRESENT); } @@ -152,7 +150,7 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " + pr_emerg(MODULE_NAME ": unexpected fault for address: %lx, " "last fault for address: %lx\n", addr, my_reason->addr); print_pte(addr); @@ -160,20 +158,17 @@ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting EIP was at %s\n", my_reason->ip); - printk(KERN_EMERG - "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk(KERN_EMERG - "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->si, regs->di, regs->bp, regs->sp); #else print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting RIP was at %s\n", my_reason->ip); - printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); - printk(KERN_EMERG "rsi: %016lx rdi: %016lx " - "rbp: %016lx rsp: %016lx\n", + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", regs->si, regs->di, regs->bp, regs->sp); #endif put_cpu_var(pf_reason); @@ -251,10 +246,15 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct trap_reason *my_reason = &get_cpu_var(pf_reason); struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + /* + * XXX: This might not get called, if the probe is removed while + * trace hit is on flight. 
+ */ + /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); + pr_emerg(MODULE_NAME ": unexpected post handler"); BUG(); } @@ -283,16 +283,15 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, atomic_t *drop = &per_cpu(dropped, cpu); int count; if (relay_buf_full(buf)) { - if (atomic_inc_return(drop) == 1) { - printk(KERN_ERR MODULE_NAME ": cpu %d buffer full!\n", - cpu); - } + if (atomic_inc_return(drop) == 1) + pr_err(MODULE_NAME ": cpu %d buffer full!\n", cpu); return 0; - } else if ((count = atomic_read(drop))) { - printk(KERN_ERR MODULE_NAME - ": cpu %d buffer no longer full, " - "missed %d events.\n", - cpu, count); + } + count = atomic_read(drop); + if (count) { + pr_err(MODULE_NAME ": cpu %d buffer no longer full, " + "missed %d events.\n", + cpu, count); atomic_sub(count, drop); } @@ -407,8 +406,8 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, /* Don't trace the low PCI/ISA area, it's always mapped.. */ if (!ISA_trace && (offset < ISA_END_ADDRESS) && (offset + size > ISA_START_ADDRESS)) { - printk(KERN_NOTICE MODULE_NAME ": Ignoring map of low " - "PCI/ISA area (0x%lx-0x%lx)\n", + pr_notice(MODULE_NAME ": Ignoring map of low PCI/ISA area " + "(0x%lx-0x%lx)\n", offset, offset + size); return; } @@ -418,7 +417,7 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) { void __iomem *p = ioremap_cache(offset, size); - printk(KERN_DEBUG MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", + pr_debug(MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", offset, size, p); ioremap_trace_core(offset, size, p); return p; @@ -428,7 +427,7 @@ EXPORT_SYMBOL(ioremap_cache_trace); void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) { void __iomem *p = ioremap_nocache(offset, size); - printk(KERN_DEBUG MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", + pr_debug(MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", offset, size, p); ioremap_trace_core(offset, size, p); return p; @@ -455,7 +454,7 @@ void iounmap_trace(volatile void __iomem *addr) }; struct remap_trace *trace; struct remap_trace *tmp; - printk(KERN_DEBUG MODULE_NAME ": Unmapping %p.\n", addr); + pr_debug(MODULE_NAME ": Unmapping %p.\n", addr); record_timestamp(&event.header); spin_lock(&trace_list_lock); @@ -481,7 +480,7 @@ static void clear_trace_list(void) spin_lock(&trace_list_lock); list_for_each_entry_safe(trace, tmp, &trace_list, list) { - printk(KERN_WARNING MODULE_NAME ": purging non-iounmapped " + pr_warning(MODULE_NAME ": purging non-iounmapped " "trace @0x%08lx, size 0x%lx.\n", trace->probe.addr, trace->probe.len); if (!nommiotrace) @@ -500,39 +499,37 @@ static int __init init(void) dir = debugfs_create_dir(APP_DIR, NULL); if (!dir) { - printk(KERN_ERR MODULE_NAME - ": Couldn't create relay app directory.\n"); + pr_err(MODULE_NAME ": Couldn't create relay app directory.\n"); return -ENOMEM; } chan = create_channel(subbuf_size, n_subbufs); if (!chan) { debugfs_remove(dir); - printk(KERN_ERR MODULE_NAME - ": relay app channel creation failed\n"); + pr_err(MODULE_NAME ": relay app channel creation failed\n"); return -ENOMEM; } - init_kmmio(); + reference_kmmio(); proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); if (proc_marker_file) proc_marker_file->write_proc = write_marker; - printk(KERN_DEBUG MODULE_NAME ": 
loaded.\n"); + pr_debug(MODULE_NAME ": loaded.\n"); if (nommiotrace) - printk(KERN_DEBUG MODULE_NAME ": MMIO tracing disabled.\n"); + pr_info(MODULE_NAME ": MMIO tracing disabled.\n"); if (ISA_trace) - printk(KERN_WARNING MODULE_NAME - ": Warning! low ISA range will be traced.\n"); + pr_warning(MODULE_NAME ": Warning! low ISA range will be " + "traced.\n"); return 0; } static void __exit cleanup(void) { - printk(KERN_DEBUG MODULE_NAME ": unload...\n"); + pr_debug(MODULE_NAME ": unload...\n"); clear_trace_list(); - cleanup_kmmio(); + unreference_kmmio(); remove_proc_entry(MARKER_FILE, NULL); destroy_channel(); if (dir) diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c index 67ea520..efa1911 100644 --- a/arch/x86/kernel/mmiotrace/pf_in.c +++ b/arch/x86/kernel/mmiotrace/pf_in.c @@ -19,7 +19,7 @@ * */ -/* $Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp $ +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp * Copyright by Intel Crop., 2002 * Louis Zhuang (louis.zhuang@intel.com) * diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c index 40e66b0..5ecff57 100644 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -41,8 +41,7 @@ static void do_test(void) { void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); if (!p) { - printk(KERN_ERR MODULE_NAME ": could not ioremap IO memory, " - "aborting.\n"); + pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } do_write_test(p); @@ -53,14 +52,14 @@ static void do_test(void) static int __init init(void) { if (mmio_address == 0) { - printk(KERN_ERR MODULE_NAME ": you have to use the module " - "argument mmio_address.\n"); - printk(KERN_ERR MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + pr_err(MODULE_NAME ": you have to use the module argument " + "mmio_address.\n"); + pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); return -ENXIO; } - printk(KERN_WARNING MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " "in PCI address space, and writing " "rubbish in there.\n", mmio_address); do_test(); @@ -69,7 +68,7 @@ static int __init init(void) static void __exit cleanup(void) { - printk(KERN_DEBUG MODULE_NAME ": unloaded.\n"); + pr_debug(MODULE_NAME ": unloaded.\n"); } module_init(init); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e9a086a..8c828a6 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -49,60 +50,14 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -#ifdef CONFIG_MMIOTRACE_HOOKS -static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */ -static DEFINE_SPINLOCK(mmiotrace_handler_lock); - -int mmiotrace_register_pf(pf_handler_func new_pfh) -{ - int ret = 0; - unsigned long flags; - spin_lock_irqsave(&mmiotrace_handler_lock, flags); - if (mmiotrace_pf_handler) - ret = -EBUSY; - else - mmiotrace_pf_handler = new_pfh; - spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(mmiotrace_register_pf); - -/** - * mmiotrace_unregister_pf: - * The caller must ensure @old_pfh is not in use anymore before freeing it. - * This function does not guarantee it. The handler function pointer is - * protected by RCU, so you can do this by e.g. calling synchronize_rcu(). 
- */ -int mmiotrace_unregister_pf(pf_handler_func old_pfh) -{ - int ret = 0; - unsigned long flags; - spin_lock_irqsave(&mmiotrace_handler_lock, flags); - if (mmiotrace_pf_handler != old_pfh) - ret = -EPERM; - else - mmiotrace_pf_handler = NULL; - spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf); -#endif /* CONFIG_MMIOTRACE_HOOKS */ - -/* returns non-zero if do_page_fault() should return */ -static inline int call_mmiotrace(struct pt_regs *regs, - unsigned long error_code, - unsigned long address) +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { #ifdef CONFIG_MMIOTRACE_HOOKS - int ret = 0; - rcu_read_lock(); - if (mmiotrace_pf_handler) - ret = mmiotrace_pf_handler(regs, error_code, address); - rcu_read_unlock(); - return ret; -#else - return 0; + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; #endif + return 0; } static inline int notify_page_fault(struct pt_regs *regs) @@ -657,7 +612,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (notify_page_fault(regs)) return; - if (call_mmiotrace(regs, error_code, address)) + if (unlikely(kmmio_fault(regs, address))) return; /* diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index 7063281..96651bb 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -35,11 +35,4 @@ extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -typedef int (*pf_handler_func)(struct pt_regs *regs, - unsigned long error_code, - unsigned long address); - -extern int mmiotrace_register_pf(pf_handler_func new_pfh); -extern int mmiotrace_unregister_pf(pf_handler_func old_pfh); - #endif diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 6ec288f..d87a6cd 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -3,6 +3,44 @@ #include +#ifdef __KERNEL__ + +#include + +struct kmmio_probe; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; + unsigned long addr; /* start location of the probe point */ + unsigned long len; /* length of the probe region */ + kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ + kmmio_post_handler_t post_handler; /* Called after addr is executed */ +}; + +/* kmmio is active by some kmmio_probes? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +extern void reference_kmmio(void); +extern void unreference_kmmio(void); +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + +/* Called from page fault handler. */ +extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); + +#endif /* __KERNEL__ */ + + /* * If you change anything here, you must bump MMIO_VERSION. * This is the relay data format for user space. -- cgit v0.10.2 From d61fc44853f46fb002228b18aa5f30db21fcd4ac Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: mmiotrace, preview 2 Kconfig.debug, Makefile and testmmiotrace.c style fixes. Use a real mutex instead of a semaphore. Fix failure path in register probe func. kmmio: RCU read-locked over single stepping.
Generate mapping id's. Make mmio-mod.c built-in and rewrite its locking. Add debugfs file to enable/disable mmiotracing. kmmio: use irqsave spinlocks. Lots of cleanups in mmio-mod.c. Marker file moved from /proc into debugfs. Call mmiotrace entrypoints directly from ioremap.c. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9491c0a..aa0d646 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -170,22 +170,19 @@ config IOMMU_LEAK config MMIOTRACE_HOOKS bool - default n config MMIOTRACE - tristate "Memory mapped IO tracing" + bool "Memory mapped IO tracing" depends on DEBUG_KERNEL && RELAY && DEBUG_FS select MMIOTRACE_HOOKS - default n + default y help - This will build a kernel module called mmiotrace. - Making this a built-in is heavily discouraged. - - Mmiotrace traces Memory Mapped I/O access and is meant for debugging - and reverse engineering. The kernel module offers wrapped - versions of the ioremap family of functions. The driver to be traced - must be modified to call these wrappers. A user space program is - required to collect the MMIO data. + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. A user space program is + required to collect the MMIO data from debugfs files. + Tracing is disabled by default and can be enabled from a debugfs + file. See http://nouveau.freedesktop.org/wiki/MmioTrace If you are not helping to develop drivers, say N. @@ -193,7 +190,6 @@ config MMIOTRACE config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m - default n help This is a dumb module for testing mmiotrace. It is very dangerous as it will write garbage to IO memory starting at a given address.
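For reference, run-time control then amounts to writing a bool to the new debugfs file; a minimal user-space sketch, assuming debugfs is mounted at /debug as the comments in this series do (mmio-mod.c below creates the file as /debug/mmio-trace/enabled):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* write_enabled_file_bool() accepts '1'/'y'/'Y' to enable
	 * and '0'/'n'/'N' to disable tracing */
	int fd = open("/debug/mmio-trace/enabled", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}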
diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile index cf1e747..dbcd8d5 100644 --- a/arch/x86/kernel/mmiotrace/Makefile +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-objs := pf_in.o mmio-mod.o +mmiotrace-y := pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 539a9b1..efb4679 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -59,7 +60,7 @@ struct kmmio_context { static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args); -static DECLARE_MUTEX(kmmio_init_mutex); +static DEFINE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); /* These are protected by kmmio_lock */ @@ -90,7 +91,7 @@ static struct notifier_block nb_die = { */ void reference_kmmio(void) { - down(&kmmio_init_mutex); + mutex_lock(&kmmio_init_mutex); spin_lock_irq(&kmmio_lock); if (!kmmio_initialized) { int i; @@ -101,7 +102,7 @@ void reference_kmmio(void) } kmmio_initialized++; spin_unlock_irq(&kmmio_lock); - up(&kmmio_init_mutex); + mutex_unlock(&kmmio_init_mutex); } EXPORT_SYMBOL_GPL(reference_kmmio); @@ -115,7 +116,7 @@ void unreference_kmmio(void) { bool unreg = false; - down(&kmmio_init_mutex); + mutex_lock(&kmmio_init_mutex); spin_lock_irq(&kmmio_lock); if (kmmio_initialized == 1) { @@ -128,7 +129,7 @@ void unreference_kmmio(void) if (unreg) unregister_die_notifier(&nb_die); /* calls sync_rcu() */ - up(&kmmio_init_mutex); + mutex_unlock(&kmmio_init_mutex); } EXPORT_SYMBOL(unreference_kmmio); @@ -244,17 +245,13 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * Preemption is now disabled to prevent process switch during * single stepping. We can only handle one active kmmio trace * per cpu, so ensure that we finish it before something else - * gets to run. - * - * XXX what if an interrupt occurs between returning from - * do_page_fault() and entering the single-step exception handler? - * And that interrupt triggers a kmmio trap? - * XXX If we tracing an interrupt service routine or whatever, is - * this enough to keep it on the current cpu? + * gets to run. We also hold the RCU read lock over single + * stepping to avoid looking up the probe and kmmio_fault_page + * again. */ preempt_disable(); - rcu_read_lock(); + faultpage = get_kmmio_fault_page(addr); if (!faultpage) { /* @@ -287,14 +284,24 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ regs->flags |= TF_MASK; regs->flags &= ~IF_MASK; /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. 
+ */ + put_cpu_var(kmmio_ctx); - rcu_read_unlock(); return 1; no_kmmio_ctx: @@ -313,32 +320,15 @@ no_kmmio: static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { int ret = 0; - struct kmmio_probe *probe; - struct kmmio_fault_page *faultpage; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) goto out; - rcu_read_lock(); - - faultpage = get_kmmio_fault_page(ctx->addr); - probe = get_kmmio_probe(ctx->addr); - if (faultpage != ctx->fpage || probe != ctx->probe) { - /* - * The trace setup changed after kmmio_handler() and before - * running this respective post handler. User does not want - * the result anymore. - */ - ctx->probe = NULL; - ctx->fpage = NULL; - } - if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - if (ctx->fpage) - arm_kmmio_fault_page(ctx->fpage->page, NULL); + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~TF_MASK; regs->flags |= ctx->saved_flags; @@ -346,6 +336,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) /* These were acquired in kmmio_handler(). */ ctx->active--; BUG_ON(ctx->active); + rcu_read_unlock(); preempt_enable_no_resched(); /* @@ -355,8 +346,6 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) */ if (!(regs->flags & TF_MASK)) ret = 1; - - rcu_read_unlock(); out: put_cpu_var(kmmio_ctx); return ret; @@ -411,15 +400,16 @@ static void release_kmmio_fault_page(unsigned long page, int register_kmmio_probe(struct kmmio_probe *p) { + unsigned long flags; int ret = 0; unsigned long size = 0; - spin_lock_irq(&kmmio_lock); - kmmio_count++; + spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < p->len) { if (add_kmmio_fault_page(p->addr + size)) @@ -427,7 +417,7 @@ int register_kmmio_probe(struct kmmio_probe *p) size += PAGE_SIZE; } out: - spin_unlock_irq(&kmmio_lock); + spin_unlock_irqrestore(&kmmio_lock, flags); /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist @@ -478,7 +468,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) /* * Remove a kmmio probe. You have to synchronize_rcu() before you can be - * sure that the callbacks will not be called anymore. + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. * * Unregistering a kmmio fault page has three steps: * 1. release_kmmio_fault_page() @@ -490,18 +481,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) */ void unregister_kmmio_probe(struct kmmio_probe *p) { + unsigned long flags; unsigned long size = 0; struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; - spin_lock_irq(&kmmio_lock); + spin_lock_irqsave(&kmmio_lock, flags); while (size < p->len) { release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } list_del_rcu(&p->list); kmmio_count--; - spin_unlock_irq(&kmmio_lock); + spin_unlock_irqrestore(&kmmio_lock, flags); drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index e1a5085..7386440 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -19,6 +19,8 @@ * * Derived from the read-mod example from relay-examples by Tom Zanussi. 
*/ +#define DEBUG 1 + #include #include #include @@ -34,12 +36,12 @@ #include "pf_in.h" -/* This app's relay channel files will appear in /debug/mmio-trace */ -#define APP_DIR "mmio-trace" -/* the marker injection file in /proc */ -#define MARKER_FILE "mmio-marker" +#define NAME "mmiotrace: " -#define MODULE_NAME "mmiotrace" +/* This app's relay channel files will appear in /debug/mmio-trace */ +static const char APP_DIR[] = "mmio-trace"; +/* the marker injection file in /debug/APP_DIR */ +static const char MARKER_FILE[] = "mmio-marker"; struct trap_reason { unsigned long addr; @@ -48,6 +50,15 @@ struct trap_reason { int active_traces; }; +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + unsigned long phys; + unsigned long id; +}; + +static const size_t subbuf_size = 256*1024; + /* Accessed per-cpu. */ static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); @@ -55,33 +66,53 @@ static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); -static struct file_operations mmio_fops = { - .owner = THIS_MODULE, -}; +static struct dentry *dir; +static struct dentry *enabled_file; +static struct dentry *marker_file; -static const size_t subbuf_size = 256*1024; +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ static struct rchan *chan; -static struct dentry *dir; -static struct proc_dir_entry *proc_marker_file; + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. + * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that chan is valid. + * - pre/post callbacks assume the effect of is_enabled() being true. 
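+ * (For example, ioremap_trace_core() below takes trace_lock, re-checks + * is_enabled(), and only then writes to chan and adds to trace_list.)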
+ */ /* module parameters */ -static unsigned int n_subbufs = 32*4; -static unsigned long filter_offset; -static int nommiotrace; -static int ISA_trace; -static int trace_pc; +static unsigned int n_subbufs = 32*4; +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; +static int enable_now; module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); +module_param(enable_now, bool, 0); MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); +MODULE_PARM_DESC(enable_now, "Start mmiotracing immediately on module load."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} static void record_timestamp(struct mm_io_header *header) { @@ -93,15 +124,15 @@ static void record_timestamp(struct mm_io_header *header) } /* - * Write callback for the /proc entry: + * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log */ -static int write_marker(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t write_marker(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { char *event = NULL; struct mm_io_header *headp; - int len = (count > 65535) ? 65535 : count; + ssize_t len = (count > 65535) ? 65535 : count; event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); if (!event) @@ -117,7 +148,12 @@ static int write_marker(struct file *file, const char __user *buffer, return -EFAULT; } - relay_write(chan, event, sizeof(*headp) + len); + spin_lock_irq(&trace_lock); + if (is_enabled()) + relay_write(chan, event, sizeof(*headp) + len); + else + len = -EINVAL; + spin_unlock_irq(&trace_lock); kfree(event); return len; } @@ -128,19 +164,18 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - pr_err(MODULE_NAME ": Error in %s: no pte for page 0x%08lx\n", + pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", __func__, address); return; } if (level == PG_LEVEL_2M) { - pr_emerg(MODULE_NAME ": 4MB pages are not currently " - "supported: %lx\n", address); + pr_emerg(NAME "4MB pages are not currently supported: " + "0x%08lx\n", address); BUG(); } - pr_info(MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", - address, pte_val(*pte), - pte_val(*pte) & _PAGE_PRESENT); + pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), + pte_val(*pte) & _PAGE_PRESENT); } /* @@ -150,22 +185,18 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - pr_emerg(MODULE_NAME ": unexpected fault for address: %lx, " - "last fault for address: %lx\n", + pr_emerg(NAME "unexpected fault for address: 0x%08lx, " + "last fault for address: 0x%08lx\n", addr, my_reason->addr); print_pte(addr); + print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); #ifdef __i386__ - print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting EIP was at %s\n", - my_reason->ip); 
pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->si, regs->di, regs->bp, regs->sp); #else - print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting RIP was at %s\n", - my_reason->ip); pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", @@ -197,6 +228,10 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, my_trace->header.pid = 0; my_trace->header.data_len = sizeof(struct mm_io_rw); my_trace->rw.address = addr; + /* + * struct remap_trace *trace = p->user_data; + * phys = addr - trace->probe.addr + trace->phys; + */ /* * Only record the program counter when requested. @@ -246,15 +281,10 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct trap_reason *my_reason = &get_cpu_var(pf_reason); struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); - /* - * XXX: This might not get called, if the probe is removed while - * trace hit is on flight. - */ - /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - pr_emerg(MODULE_NAME ": unexpected post handler"); + pr_emerg(NAME "unexpected post handler"); BUG(); } @@ -284,20 +314,23 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, int count; if (relay_buf_full(buf)) { if (atomic_inc_return(drop) == 1) - pr_err(MODULE_NAME ": cpu %d buffer full!\n", cpu); + pr_err(NAME "cpu %d buffer full!\n", cpu); return 0; } count = atomic_read(drop); if (count) { - pr_err(MODULE_NAME ": cpu %d buffer no longer full, " - "missed %d events.\n", - cpu, count); + pr_err(NAME "cpu %d buffer no longer full, missed %d events.\n", + cpu, count); atomic_sub(count, drop); } return 1; } +static struct file_operations mmio_fops = { + .owner = THIS_MODULE, +}; + /* file_create() callback. Creates relay file in debugfs. 
*/ static struct dentry *create_buf_file_handler(const char *filename, struct dentry *parent, @@ -333,34 +366,10 @@ static struct rchan_callbacks relay_callbacks = { .remove_buf_file = remove_buf_file_handler, }; -/* - * create_channel - creates channel /debug/APP_DIR/cpuXXX - * Returns channel on success, NULL otherwise - */ -static struct rchan *create_channel(unsigned size, unsigned n) -{ - return relay_open("cpu", dir, size, n, &relay_callbacks, NULL); -} - -/* destroy_channel - destroys channel /debug/APP_DIR/cpuXXX */ -static void destroy_channel(void) -{ - if (chan) { - relay_close(chan); - chan = NULL; - } -} - -struct remap_trace { - struct list_head list; - struct kmmio_probe probe; -}; -static LIST_HEAD(trace_list); -static DEFINE_SPINLOCK(trace_list_lock); - -static void do_ioremap_trace_core(unsigned long offset, unsigned long size, +static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *addr) { + static atomic_t next_id; struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); struct mm_io_header_map event = { .header = { @@ -380,61 +389,49 @@ static void do_ioremap_trace_core(unsigned long offset, unsigned long size, }; record_timestamp(&event.header); + if (!trace) { + pr_err(NAME "kmalloc failed in ioremap\n"); + return; + } + *trace = (struct remap_trace) { .probe = { .addr = (unsigned long)addr, .len = size, .pre_handler = pre, .post_handler = post, - } + .user_data = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) }; + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + relay_write(chan, &event, sizeof(event)); - spin_lock(&trace_list_lock); list_add_tail(&trace->list, &trace_list); - spin_unlock(&trace_list_lock); if (!nommiotrace) register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); } -static void ioremap_trace_core(unsigned long offset, unsigned long size, - void __iomem *addr) +void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) { - if ((filter_offset) && (offset != filter_offset)) + if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - /* Don't trace the low PCI/ISA area, it's always mapped.. 
*/ - if (!ISA_trace && (offset < ISA_END_ADDRESS) && - (offset + size > ISA_START_ADDRESS)) { - pr_notice(MODULE_NAME ": Ignoring map of low PCI/ISA area " - "(0x%lx-0x%lx)\n", - offset, offset + size); + pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) return; - } - do_ioremap_trace_core(offset, size, addr); -} - -void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) -{ - void __iomem *p = ioremap_cache(offset, size); - pr_debug(MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", - offset, size, p); - ioremap_trace_core(offset, size, p); - return p; + ioremap_trace_core(offset, size, addr); } -EXPORT_SYMBOL(ioremap_cache_trace); -void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) -{ - void __iomem *p = ioremap_nocache(offset, size); - pr_debug(MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", - offset, size, p); - ioremap_trace_core(offset, size, p); - return p; -} -EXPORT_SYMBOL(ioremap_nocache_trace); - -void iounmap_trace(volatile void __iomem *addr) +static void iounmap_trace_core(volatile void __iomem *addr) { struct mm_io_header_map event = { .header = { @@ -454,84 +451,212 @@ void iounmap_trace(volatile void __iomem *addr) }; struct remap_trace *trace; struct remap_trace *tmp; - pr_debug(MODULE_NAME ": Unmapping %p.\n", addr); + struct remap_trace *found_trace = NULL; + + pr_debug(NAME "Unmapping %p.\n", addr); record_timestamp(&event.header); - spin_lock(&trace_list_lock); + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + list_for_each_entry_safe(trace, tmp, &trace_list, list) { if ((unsigned long)addr == trace->probe.addr) { if (!nommiotrace) unregister_kmmio_probe(&trace->probe); list_del(&trace->list); - kfree(trace); + found_trace = trace; break; } } - spin_unlock(&trace_list_lock); relay_write(chan, &event, sizeof(event)); - iounmap(addr); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); } -EXPORT_SYMBOL(iounmap_trace); static void clear_trace_list(void) { struct remap_trace *trace; struct remap_trace *tmp; - spin_lock(&trace_list_lock); - list_for_each_entry_safe(trace, tmp, &trace_list, list) { - pr_warning(MODULE_NAME ": purging non-iounmapped " + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. 
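+ * (Concretely: disable_mmiotrace() is the only caller; it holds + * mmiotrace_mutex and has already cleared mmiotrace_enabled.)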
+ */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice(NAME "purging non-iounmapped " "trace @0x%08lx, size 0x%lx.\n", trace->probe.addr, trace->probe.len); if (!nommiotrace) unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { list_del(&trace->list); kfree(trace); + } +} + +static ssize_t read_enabled_file_bool(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[3]; + + if (is_enabled()) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static void enable_mmiotrace(void); +static void disable_mmiotrace(void); + +static ssize_t write_enabled_file_bool(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + int buf_size = min(count, (sizeof(buf)-1)); + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + enable_mmiotrace(); + break; + case 'n': + case 'N': + case '0': + disable_mmiotrace(); break; } - spin_unlock(&trace_list_lock); + + return count; +} + +/* this ripped from kernel/kprobes.c */ +static struct file_operations fops_enabled = { + .owner = THIS_MODULE, + .read = read_enabled_file_bool, + .write = write_enabled_file_bool +}; + +static struct file_operations fops_marker = { + .owner = THIS_MODULE, + .write = write_marker +}; + +static void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + + chan = relay_open("cpu", dir, subbuf_size, n_subbufs, + &relay_callbacks, NULL); + if (!chan) { + pr_err(NAME "relay app channel creation failed.\n"); + goto out; + } + + reference_kmmio(); + + marker_file = debugfs_create_file("marker", 0660, dir, NULL, + &fops_marker); + if (!marker_file) + pr_err(NAME "marker file creation failed.\n"); + + if (nommiotrace) + pr_info(NAME "MMIO tracing disabled.\n"); + if (ISA_trace) + pr_warning(NAME "Warning! 
low ISA range will be traced.\n"); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info(NAME "enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +static void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + unreference_kmmio(); + if (marker_file) { + debugfs_remove(marker_file); + marker_file = NULL; + } + if (chan) { + relay_close(chan); + chan = NULL; + } + + pr_info(NAME "disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); } static int __init init(void) { + pr_debug(NAME "load...\n"); if (n_subbufs < 2) return -EINVAL; dir = debugfs_create_dir(APP_DIR, NULL); if (!dir) { - pr_err(MODULE_NAME ": Couldn't create relay app directory.\n"); + pr_err(NAME "Couldn't create relay app directory.\n"); return -ENOMEM; } - chan = create_channel(subbuf_size, n_subbufs); - if (!chan) { + enabled_file = debugfs_create_file("enabled", 0600, dir, NULL, + &fops_enabled); + if (!enabled_file) { + pr_err(NAME "Couldn't create enabled file.\n"); debugfs_remove(dir); - pr_err(MODULE_NAME ": relay app channel creation failed\n"); return -ENOMEM; } - reference_kmmio(); - - proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); - if (proc_marker_file) - proc_marker_file->write_proc = write_marker; + if (enable_now) + enable_mmiotrace(); - pr_debug(MODULE_NAME ": loaded.\n"); - if (nommiotrace) - pr_info(MODULE_NAME ": MMIO tracing disabled.\n"); - if (ISA_trace) - pr_warning(MODULE_NAME ": Warning! low ISA range will be " - "traced.\n"); return 0; } static void __exit cleanup(void) { - pr_debug(MODULE_NAME ": unload...\n"); - clear_trace_list(); - unreference_kmmio(); - remove_proc_entry(MARKER_FILE, NULL); - destroy_channel(); + pr_debug(NAME "unload...\n"); + if (enabled_file) + debugfs_remove(enabled_file); + disable_mmiotrace(); if (dir) debugfs_remove(dir); } diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c index 5ecff57..cfa60b2 100644 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -4,10 +4,6 @@ #include #include -extern void __iomem *ioremap_nocache_trace(unsigned long offset, - unsigned long size); -extern void iounmap_trace(volatile void __iomem *addr); - #define MODULE_NAME "testmmiotrace" static unsigned long mmio_address; @@ -28,25 +24,24 @@ static void do_write_test(void __iomem *p) static void do_read_test(void __iomem *p) { unsigned int i; - volatile unsigned int v; for (i = 0; i < 256; i++) - v = ioread8(p + i); + ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) - v = ioread16(p + i); + ioread16(p + i); for (i = (5 * 1024); i < (16 * 1024); i += 4) - v = ioread32(p + i); + ioread32(p + i); } static void do_test(void) { - void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); + void __iomem *p = ioremap_nocache(mmio_address, 0x4000); if (!p) { pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } do_write_test(p); do_read_test(p); - iounmap_trace(p); + iounmap(p); } static int __init init(void) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 71bb315..8927c87 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -126,6 +127,7 @@ static void __iomem 
*__ioremap_caller(resource_size_t phys_addr, unsigned long new_prot_val; pgprot_t prot; int retval; + void __iomem *ret_addr; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -233,7 +235,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - return (void __iomem *) (vaddr + offset); + ret_addr = (void __iomem *) (vaddr + offset); + mmiotrace_ioremap(phys_addr, size, ret_addr); + + return ret_addr; } /** @@ -325,6 +330,8 @@ void iounmap(volatile void __iomem *addr) addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); + mmiotrace_iounmap(addr); + /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index d87a6cd..cb5efd0 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -16,11 +16,12 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, unsigned long condition, struct pt_regs *); struct kmmio_probe { - struct list_head list; + struct list_head list; /* kmmio internal list */ unsigned long addr; /* start location of the probe point */ unsigned long len; /* length of the probe region */ kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ kmmio_post_handler_t post_handler; /* Called after addr is executed */ + void *user_data; }; /* kmmio is active by some kmmio_probes? */ @@ -38,6 +39,21 @@ extern void unregister_kmmio_probe(struct kmmio_probe *p); /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); +/* Called from ioremap.c */ +#ifdef CONFIG_MMIOTRACE +extern void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr); +extern void mmiotrace_iounmap(volatile void __iomem *addr); +#else +static inline void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +{ +} +static inline void mmiotrace_iounmap(volatile void __iomem *addr) +{ +} +#endif /* CONFIG_MMIOTRACE_HOOKS */ + #endif /* __KERNEL__ */ -- cgit v0.10.2 From f984b51e0779a6dd30feedc41404013ca54e5d05 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: add mmiotrace plugin On Sat, 22 Mar 2008 13:07:47 +0100 Ingo Molnar wrote: > > > i'd suggest the following: pull x86.git and sched-devel.git into a > > > single tree [the two will combine without rejects]. Then try to add a > > > kernel/tracing/trace_mmiotrace.c ftrace plugin. The trace_sysprof.c > > > plugin might be a good example. > > > > I did this and now I have mmiotrace enabled/disabled via the tracing > > framework (what do we call this, since ftrace is one of the tracers?). > > cool! could you send the patches for that? (even if they are not fully > functional yet) Patch attached in the end. Nice to see how much code disappeared. I tried to mark all the features I had to break with XXX-comments. 
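Condensed, the plugin side is just the usual struct tracer registration; a sketch of what the patch below adds (comments mine, full version in kernel/trace/trace_mmiotrace.c):

static struct tracer mmio_tracer __read_mostly =
{
	.name		= "mmiotrace",
	.init		= mmio_trace_init,	/* enable_mmiotrace() if tr->ctrl is set */
	.reset		= mmio_trace_reset,	/* disable_mmiotrace() if tr->ctrl is set */
	.ctrl_update	= mmio_trace_ctrl_update,
};

__init static int init_mmio_trace(void)
{
	int ret = init_mmiotrace();	/* creates the debugfs directory */
	if (ret)
		return ret;
	return register_tracer(&mmio_tracer);
}
device_initcall(init_mmio_trace);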
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index aa0d646..7e4b849 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -173,7 +173,8 @@ config MMIOTRACE_HOOKS config MMIOTRACE bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && RELAY && DEBUG_FS + depends on DEBUG_KERNEL && RELAY + select TRACING select MMIOTRACE_HOOKS default y help diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 7386440..c7a67d7 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -22,9 +22,8 @@ #define DEBUG 1 #include -#include #include -#include +#include #include #include #include @@ -63,18 +62,18 @@ static const size_t subbuf_size = 256*1024; static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); +#if 0 /* XXX: no way gather this info anymore */ /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); +#endif static struct dentry *dir; -static struct dentry *enabled_file; static struct dentry *marker_file; static DEFINE_MUTEX(mmiotrace_mutex); static DEFINE_SPINLOCK(trace_lock); static atomic_t mmiotrace_enabled; static LIST_HEAD(trace_list); /* struct remap_trace */ -static struct rchan *chan; /* * Locking in this file: @@ -93,36 +92,24 @@ static unsigned long filter_offset; static int nommiotrace; static int ISA_trace; static int trace_pc; -static int enable_now; module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); -module_param(enable_now, bool, 0); MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); -MODULE_PARM_DESC(enable_now, "Start mmiotracing immediately on module load."); static bool is_enabled(void) { return atomic_read(&mmiotrace_enabled); } -static void record_timestamp(struct mm_io_header *header) -{ - struct timespec now; - - getnstimeofday(&now); - header->sec = now.tv_sec; - header->nsec = now.tv_nsec; -} - /* * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log @@ -141,7 +128,6 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, headp = (struct mm_io_header *)event; headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); headp->data_len = len; - record_timestamp(headp); if (copy_from_user(event + sizeof(*headp), buffer, len)) { kfree(event); @@ -149,9 +135,11 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, } spin_lock_irq(&trace_lock); +#if 0 /* XXX: convert this to use tracing */ if (is_enabled()) relay_write(chan, event, sizeof(*headp) + len); else +#endif len = -EINVAL; spin_unlock_irq(&trace_lock); kfree(event); @@ -242,7 +230,11 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, else my_trace->rw.pc = 0; - record_timestamp(&my_trace->header); + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? 
+ */ switch (type) { case REG_READ: @@ -295,77 +287,19 @@ static void post(struct kmmio_probe *p, unsigned long condition, default: break; } - relay_write(chan, my_trace, sizeof(*my_trace)); + + /* + * XXX: Several required values are ignored: + * - mapping id + * - program counter + * Also the address should be physical, not virtual. + */ + mmio_trace_record(my_trace->header.type, my_trace->rw.address, + my_trace->rw.value); put_cpu_var(cpu_trace); put_cpu_var(pf_reason); } -/* - * subbuf_start() relay callback. - * - * Defined so that we know when events are dropped due to the buffer-full - * condition. - */ -static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - unsigned int cpu = buf->cpu; - atomic_t *drop = &per_cpu(dropped, cpu); - int count; - if (relay_buf_full(buf)) { - if (atomic_inc_return(drop) == 1) - pr_err(NAME "cpu %d buffer full!\n", cpu); - return 0; - } - count = atomic_read(drop); - if (count) { - pr_err(NAME "cpu %d buffer no longer full, missed %d events.\n", - cpu, count); - atomic_sub(count, drop); - } - - return 1; -} - -static struct file_operations mmio_fops = { - .owner = THIS_MODULE, -}; - -/* file_create() callback. Creates relay file in debugfs. */ -static struct dentry *create_buf_file_handler(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - struct dentry *buf_file; - - mmio_fops.read = relay_file_operations.read; - mmio_fops.open = relay_file_operations.open; - mmio_fops.poll = relay_file_operations.poll; - mmio_fops.mmap = relay_file_operations.mmap; - mmio_fops.release = relay_file_operations.release; - mmio_fops.splice_read = relay_file_operations.splice_read; - - buf_file = debugfs_create_file(filename, mode, parent, buf, - &mmio_fops); - - return buf_file; -} - -/* file_remove() default callback. Removes relay file in debugfs. */ -static int remove_buf_file_handler(struct dentry *dentry) -{ - debugfs_remove(dentry); - return 0; -} - -static struct rchan_callbacks relay_callbacks = { - .subbuf_start = subbuf_start_handler, - .create_buf_file = create_buf_file_handler, - .remove_buf_file = remove_buf_file_handler, -}; - static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *addr) { @@ -387,7 +321,6 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, .pc = 0 } }; - record_timestamp(&event.header); if (!trace) { pr_err(NAME "kmalloc failed in ioremap\n"); @@ -410,7 +343,10 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, if (!is_enabled()) goto not_enabled; - relay_write(chan, &event, sizeof(event)); + /* + * XXX: Insufficient data recorded! + */ + mmio_trace_record(event.header.type, event.map.addr, event.map.len); list_add_tail(&trace->list, &trace_list); if (!nommiotrace) register_kmmio_probe(&trace->probe); @@ -454,7 +390,6 @@ static void iounmap_trace_core(volatile void __iomem *addr) struct remap_trace *found_trace = NULL; pr_debug(NAME "Unmapping %p.\n", addr); - record_timestamp(&event.header); spin_lock_irq(&trace_lock); if (!is_enabled()) @@ -469,7 +404,8 @@ static void iounmap_trace_core(volatile void __iomem *addr) break; } } - relay_write(chan, &event, sizeof(event)); + mmio_trace_record(event.header.type, event.map.addr, + found_trace ? 
found_trace->id : -1); not_enabled: spin_unlock_irq(&trace_lock); @@ -512,77 +448,23 @@ static void clear_trace_list(void) } } -static ssize_t read_enabled_file_bool(struct file *file, - char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[3]; - - if (is_enabled()) - buf[0] = '1'; - else - buf[0] = '0'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static void enable_mmiotrace(void); -static void disable_mmiotrace(void); - -static ssize_t write_enabled_file_bool(struct file *file, - const char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[32]; - int buf_size = min(count, (sizeof(buf)-1)); - - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - enable_mmiotrace(); - break; - case 'n': - case 'N': - case '0': - disable_mmiotrace(); - break; - } - - return count; -} - -/* this ripped from kernel/kprobes.c */ -static struct file_operations fops_enabled = { - .owner = THIS_MODULE, - .read = read_enabled_file_bool, - .write = write_enabled_file_bool -}; - static struct file_operations fops_marker = { .owner = THIS_MODULE, .write = write_marker }; -static void enable_mmiotrace(void) +void enable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (is_enabled()) goto out; - chan = relay_open("cpu", dir, subbuf_size, n_subbufs, - &relay_callbacks, NULL); - if (!chan) { - pr_err(NAME "relay app channel creation failed.\n"); - goto out; - } - reference_kmmio(); +#if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); +#endif if (!marker_file) pr_err(NAME "marker file creation failed.\n"); @@ -598,7 +480,7 @@ out: mutex_unlock(&mmiotrace_mutex); } -static void disable_mmiotrace(void) +void disable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (!is_enabled()) @@ -615,17 +497,13 @@ static void disable_mmiotrace(void) debugfs_remove(marker_file); marker_file = NULL; } - if (chan) { - relay_close(chan); - chan = NULL; - } pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } -static int __init init(void) +int __init init_mmiotrace(void) { pr_debug(NAME "load...\n"); if (n_subbufs < 2) @@ -636,31 +514,5 @@ static int __init init(void) pr_err(NAME "Couldn't create relay app directory.\n"); return -ENOMEM; } - - enabled_file = debugfs_create_file("enabled", 0600, dir, NULL, - &fops_enabled); - if (!enabled_file) { - pr_err(NAME "Couldn't create enabled file.\n"); - debugfs_remove(dir); - return -ENOMEM; - } - - if (enable_now) - enable_mmiotrace(); - return 0; } - -static void __exit cleanup(void) -{ - pr_debug(NAME "unload...\n"); - if (enabled_file) - debugfs_remove(enabled_file); - disable_mmiotrace(); - if (dir) - debugfs_remove(dir); -} - -module_init(init); -module_exit(cleanup); -MODULE_LICENSE("GPL"); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index cb5efd0..579b3b06 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -54,6 +54,12 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr) } #endif /* CONFIG_MMIOTRACE_HOOKS */ +/* in kernel/trace/trace_mmiotrace.c */ +extern int __init init_mmiotrace(void); +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg); + #endif /* __KERNEL__ */ diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d9efbbf..c44a7dc 100644 --- a/kernel/trace/Makefile 
+++ b/kernel/trace/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c new file mode 100644 index 0000000..e4dd03c --- /dev/null +++ b/kernel/trace/trace_mmiotrace.c @@ -0,0 +1,84 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen + */ + +#define DEBUG 1 + +#include +#include + +#include "trace.h" + +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); + +static struct trace_array *mmio_trace_array; + + +static void mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + if (tr->ctrl) + enable_mmiotrace(); +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + disable_mmiotrace(); +} + +static void mmio_trace_ctrl_update(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + enable_mmiotrace(); + else + disable_mmiotrace(); +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .ctrl_update = mmio_trace_ctrl_update, +}; + +__init static int init_mmio_trace(void) +{ + int ret = init_mmiotrace(); + if (ret) + return ret; + return register_tracer(&mmio_tracer); } +device_initcall(init_mmio_trace); + +void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data = tr->data[smp_processor_id()]; + + if (!current || current->pid == 0) { + /* + * XXX: This is a problem. We need to be able to record, no + * matter what. tracing_generic_entry_update() would crash. + */ + static unsigned limit; + if (limit++ < 12) + pr_err("Error in %s: no current.\n", __func__); + return; + } + if (!tr || !data) { + static unsigned limit; + if (limit++ < 12) + pr_err("%s: no tr or data\n", __func__); + return; + } + __trace_special(tr, data, type, addr, arg); } -- cgit v0.10.2 From bd8ac686c73c7e925fcfe0b02dc4e7b947127864 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: mmiotrace, updates here is a patch that makes mmiotrace work almost well within the tracing framework. The patch applies on top of my previous patch. I have my own output formatting in place now. Summary of changes: - fix the NULL dereference that was due to not calling tracing_reset() - add print_line() callback into struct tracer - implement print_line() for mmiotrace, producing up-to-spec text - add my output header, but that is not really called in the right place - rewrote the main structs in mmiotrace - added two new trace entry types: TRACE_MMIO_RW and TRACE_MMIO_MAP - made some functions in trace.c non-static - check current==NULL in tracing_generic_entry_update() - fix(?) comparison in trace_seq_printf() Things seem to work fine except a few issues. Markers (text lines injected into mmiotrace log) are missing, I did not feel like hacking them in before we have variable length entries. My output header is printed only for 'trace' file, but not 'trace_pipe'.
For some reason, despite my quick fix, iter->trace is NULL in print_trace_line() when called from 'trace_pipe' file, which means I don't get proper output formatting. I only tried it by loading nouveau.ko, which just detects the card, and that is traced fine. I didn't try further. Map, two reads and unmap. Works perfectly. I am missing the information about overflows; I'd prefer to have a counter for lost events. I didn't try, but I guess currently there is no way of knowing when it overflows? So, not too far from being fully operational, it seems :-) And looking at the diffstat, there are also some 700-900 lines of user space code that just became obsolete. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7e4b849..1d6de0d 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -173,7 +173,7 @@ config MMIOTRACE_HOOKS config MMIOTRACE bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && RELAY + depends on DEBUG_KERNEL select TRACING select MMIOTRACE_HOOKS default y diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index c7a67d7..62abc28 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -37,11 +37,6 @@ #define NAME "mmiotrace: " -/* This app's relay channel files will appear in /debug/mmio-trace */ -static const char APP_DIR[] = "mmio-trace"; -/* the marker injection file in /debug/APP_DIR */ -static const char MARKER_FILE[] = "mmio-marker"; - struct trap_reason { unsigned long addr; unsigned long ip; @@ -56,18 +51,15 @@ struct remap_trace { unsigned long id; }; -static const size_t subbuf_size = 256*1024; - /* Accessed per-cpu. */ static DEFINE_PER_CPU(struct trap_reason, pf_reason); -static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); +static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); #if 0 /* XXX: no way gather this info anymore */ /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); #endif -static struct dentry *dir; static struct dentry *marker_file; static DEFINE_MUTEX(mmiotrace_mutex); @@ -82,24 +74,21 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ * and trace_lock. * - Routines depending on is_enabled() must take trace_lock. * - trace_list users must hold trace_lock. - * - is_enabled() guarantees that chan is valid. + * - is_enabled() guarantees that mmio_trace_record is allowed. * - pre/post callbacks assume the effect of is_enabled() being true.
*/ /* module parameters */ -static unsigned int n_subbufs = 32*4; static unsigned long filter_offset; static int nommiotrace; static int ISA_trace; static int trace_pc; -module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); -MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); @@ -110,6 +99,7 @@ static bool is_enabled(void) return atomic_read(&mmiotrace_enabled); } +#if 0 /* XXX: needs rewrite */ /* * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log @@ -145,6 +135,7 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, kfree(event); return len; } +#endif static void print_pte(unsigned long address) { @@ -198,9 +189,10 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, unsigned long addr) { struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->user_data; /* it doesn't make sense to have more than one active trace per cpu */ if (my_reason->active_traces) @@ -212,23 +204,17 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, my_reason->addr = addr; my_reason->ip = instptr; - my_trace->header.type = MMIO_MAGIC; - my_trace->header.pid = 0; - my_trace->header.data_len = sizeof(struct mm_io_rw); - my_trace->rw.address = addr; - /* - * struct remap_trace *trace = p->user_data; - * phys = addr - trace->probe.addr + trace->phys; - */ + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; /* * Only record the program counter when requested. * It may taint clean-room reverse engineering. 
	 */
	if (trace_pc)
-		my_trace->rw.pc = instptr;
+		my_trace->pc = instptr;
	else
-		my_trace->rw.pc = 0;
+		my_trace->pc = 0;

	/*
	 * XXX: the timestamp recorded will be *after* the tracing has been
@@ -238,28 +224,25 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs,
	switch (type) {
	case REG_READ:
-		my_trace->header.type |=
-			(MMIO_READ << MMIO_OPCODE_SHIFT) |
-			(get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT);
+		my_trace->opcode = MMIO_READ;
+		my_trace->width = get_ins_mem_width(instptr);
		break;
	case REG_WRITE:
-		my_trace->header.type |=
-			(MMIO_WRITE << MMIO_OPCODE_SHIFT) |
-			(get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT);
-		my_trace->rw.value = get_ins_reg_val(instptr, regs);
+		my_trace->opcode = MMIO_WRITE;
+		my_trace->width = get_ins_mem_width(instptr);
+		my_trace->value = get_ins_reg_val(instptr, regs);
		break;
	case IMM_WRITE:
-		my_trace->header.type |=
-			(MMIO_WRITE << MMIO_OPCODE_SHIFT) |
-			(get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT);
-		my_trace->rw.value = get_ins_imm_val(instptr);
+		my_trace->opcode = MMIO_WRITE;
+		my_trace->width = get_ins_mem_width(instptr);
+		my_trace->value = get_ins_imm_val(instptr);
		break;
	default:
		{
			unsigned char *ip = (unsigned char *)instptr;
-			my_trace->header.type |=
-				(MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT);
-			my_trace->rw.value = (*ip) << 16 | *(ip + 1) << 8 |
+			my_trace->opcode = MMIO_UNKNOWN_OP;
+			my_trace->width = 0;
+			my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
							*(ip + 2);
		}
	}
@@ -271,7 +254,7 @@ static void post(struct kmmio_probe *p, unsigned long condition,
					struct pt_regs *regs)
 {
	struct trap_reason *my_reason = &get_cpu_var(pf_reason);
-	struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace);
+	struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);

	/* this should always return the active_trace count to 0 */
	my_reason->active_traces--;
@@ -282,20 +265,13 @@ static void post(struct kmmio_probe *p, unsigned long condition,
	switch (my_reason->type) {
	case REG_READ:
-		my_trace->rw.value = get_ins_reg_val(my_reason->ip, regs);
+		my_trace->value = get_ins_reg_val(my_reason->ip, regs);
		break;
	default:
		break;
	}
-
-	/*
-	 * XXX: Several required values are ignored:
-	 * - mapping id
-	 * - program counter
-	 * Also the address should be physical, not virtual.
-	 */
-	mmio_trace_record(my_trace->header.type, my_trace->rw.address,
-						my_trace->rw.value);
+	mmio_trace_rw(my_trace);
	put_cpu_var(cpu_trace);
	put_cpu_var(pf_reason);
 }
@@ -305,21 +281,11 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size,
 {
	static atomic_t next_id;
	struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
-	struct mm_io_header_map event = {
-		.header = {
-			.type = MMIO_MAGIC |
-					(MMIO_PROBE << MMIO_OPCODE_SHIFT),
-			.sec = 0,
-			.nsec = 0,
-			.pid = 0,
-			.data_len = sizeof(struct mm_io_map)
-		},
-		.map = {
-			.phys = offset,
-			.addr = (unsigned long)addr,
-			.len = size,
-			.pc = 0
-		}
+	struct mmiotrace_map map = {
+		.phys = offset,
+		.virt = (unsigned long)addr,
+		.len = size,
+		.opcode = MMIO_PROBE
	};

	if (!trace) {
@@ -338,15 +304,13 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size,
		.phys = offset,
		.id = atomic_inc_return(&next_id)
	};
+	map.map_id = trace->id;

	spin_lock_irq(&trace_lock);
	if (!is_enabled())
		goto not_enabled;

-	/*
-	 * XXX: Insufficient data recorded!
-	 */
-	mmio_trace_record(event.header.type, event.map.addr, event.map.len);
+	mmio_trace_mapping(&map);
	list_add_tail(&trace->list, &trace_list);
	if (!nommiotrace)
		register_kmmio_probe(&trace->probe);
@@ -369,21 +333,11 @@ mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr)

 static void iounmap_trace_core(volatile void __iomem *addr)
 {
-	struct mm_io_header_map event = {
-		.header = {
-			.type = MMIO_MAGIC |
-				(MMIO_UNPROBE << MMIO_OPCODE_SHIFT),
-			.sec = 0,
-			.nsec = 0,
-			.pid = 0,
-			.data_len = sizeof(struct mm_io_map)
-		},
-		.map = {
-			.phys = 0,
-			.addr = (unsigned long)addr,
-			.len = 0,
-			.pc = 0
-		}
+	struct mmiotrace_map map = {
+		.phys = 0,
+		.virt = (unsigned long)addr,
+		.len = 0,
+		.opcode = MMIO_UNPROBE
	};
	struct remap_trace *trace;
	struct remap_trace *tmp;
@@ -404,8 +358,8 @@ static void iounmap_trace_core(volatile void __iomem *addr)
			break;
		}
	}
-	mmio_trace_record(event.header.type, event.map.addr,
-					found_trace ? found_trace->id : -1);
+	map.map_id = (found_trace) ? found_trace->id : -1;
+	mmio_trace_mapping(&map);

 not_enabled:
	spin_unlock_irq(&trace_lock);
@@ -448,10 +402,12 @@ static void clear_trace_list(void)
	}
 }

+#if 0 /* XXX: out of order */
 static struct file_operations fops_marker = {
	.owner = THIS_MODULE,
	.write = write_marker
 };
+#endif

 void enable_mmiotrace(void)
 {
@@ -464,9 +420,9 @@ void enable_mmiotrace(void)
 #if 0 /* XXX: tracing does not support text entries */
	marker_file = debugfs_create_file("marker", 0660, dir, NULL,
						&fops_marker);
-#endif
	if (!marker_file)
		pr_err(NAME "marker file creation failed.\n");
+#endif

	if (nommiotrace)
		pr_info(NAME "MMIO tracing disabled.\n");
@@ -502,17 +458,3 @@ void disable_mmiotrace(void)
 out:
	mutex_unlock(&mmiotrace_mutex);
 }
-
-int __init init_mmiotrace(void)
-{
-	pr_debug(NAME "load...\n");
-	if (n_subbufs < 2)
-		return -EINVAL;
-
-	dir = debugfs_create_dir(APP_DIR, NULL);
-	if (!dir) {
-		pr_err(NAME "Couldn't create relay app directory.\n");
-		return -ENOMEM;
-	}
-	return 0;
-}
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 579b3b06..c88a9c1 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -54,73 +54,38 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr)
 }
 #endif /* CONFIG_MMIOTRACE_HOOKS */

-/* in kernel/trace/trace_mmiotrace.c */
-extern int __init init_mmiotrace(void);
-extern void enable_mmiotrace(void);
-extern void disable_mmiotrace(void);
-extern void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg);
-
-#endif /* __KERNEL__ */
-
-
-/*
- * If you change anything here, you must bump MMIO_VERSION.
- * This is the relay data format for user space.
- */
-#define MMIO_VERSION 0x04
-
-/* mm_io_header.type */
-#define MMIO_OPCODE_MASK 0xff
-#define MMIO_OPCODE_SHIFT 0
-#define MMIO_WIDTH_MASK 0xff00
-#define MMIO_WIDTH_SHIFT 8
-#define MMIO_MAGIC (0x6f000000 | (MMIO_VERSION<<16))
-#define MMIO_MAGIC_MASK 0xffff0000
-
-enum mm_io_opcode {	/* payload type: */
-	MMIO_READ = 0x1,	/* struct mm_io_rw */
-	MMIO_WRITE = 0x2,	/* struct mm_io_rw */
-	MMIO_PROBE = 0x3,	/* struct mm_io_map */
-	MMIO_UNPROBE = 0x4,	/* struct mm_io_map */
+enum mm_io_opcode {
+	MMIO_READ = 0x1,	/* struct mmiotrace_rw */
+	MMIO_WRITE = 0x2,	/* struct mmiotrace_rw */
+	MMIO_PROBE = 0x3,	/* struct mmiotrace_map */
+	MMIO_UNPROBE = 0x4,	/* struct mmiotrace_map */
	MMIO_MARKER = 0x5,	/* raw char data */
-	MMIO_UNKNOWN_OP = 0x6,	/* struct mm_io_rw */
+	MMIO_UNKNOWN_OP = 0x6,	/* struct mmiotrace_rw */
 };

-struct mm_io_header {
-	__u32 type;	/* see MMIO_* macros above */
-	__u32 sec;	/* timestamp */
-	__u32 nsec;
-	__u32 pid;	/* PID of the process, or 0 for kernel core */
-	__u16 data_len;	/* length of the following payload */
+struct mmiotrace_rw {
+	unsigned long phys;	/* PCI address of register */
+	unsigned long value;
+	unsigned long pc;	/* optional program counter */
+	int map_id;
+	unsigned char opcode;	/* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
+	unsigned char width;	/* size of register access in bytes */
 };

-struct mm_io_rw {
-	__u64 address;	/* virtual address of register */
-	__u64 value;
-	__u64 pc;	/* optional program counter */
+struct mmiotrace_map {
+	unsigned long phys;	/* base address in PCI space */
+	unsigned long virt;	/* base virtual address */
+	unsigned long len;	/* mapping size */
+	int map_id;
+	unsigned char opcode;	/* MMIO_PROBE or MMIO_UNPROBE */
 };

-struct mm_io_map {
-	__u64 phys;	/* base address in PCI space */
-	__u64 addr;	/* base virtual address */
-	__u64 len;	/* mapping size */
-	__u64 pc;	/* optional program counter */
-};
-
-
-/*
- * These structures are used to allow a single relay_write()
- * call to write a full packet.
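To make the new in-kernel record layout concrete, here is an illustrative sketch (not part of the patch) of how a traced 32-bit write would be encoded; the physical address, value and mapping id are invented for the example:

	/* Hypothetical values, for illustration only. */
	struct mmiotrace_rw sample = {
		.phys   = 0xfd000040UL,	/* made-up PCI/BAR address */
		.value  = 0xdeadbeefUL,	/* the data that was written */
		.pc     = 0,		/* stays 0 unless trace_pc is set */
		.map_id = 1,		/* id handed out at ioremap time */
		.opcode = MMIO_WRITE,
		.width  = 4,		/* e.g. an iowrite32() */
	};
	mmio_trace_rw(&sample);		/* hand the record to the ftrace backend */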
- */ - -struct mm_io_header_rw { - struct mm_io_header header; - struct mm_io_rw rw; -} __attribute__((packed)); +/* in kernel/trace/trace_mmiotrace.c */ +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void mmio_trace_rw(struct mmiotrace_rw *rw); +extern void mmio_trace_mapping(struct mmiotrace_map *map); -struct mm_io_header_map { - struct mm_io_header header; - struct mm_io_map map; -} __attribute__((packed)); +#endif /* __KERNEL__ */ #endif /* MMIOTRACE_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3271916..d14fe49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -831,6 +831,40 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } +#ifdef CONFIG_MMIOTRACE +void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_RW; + entry->mmiorw = *rw; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} + +void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_MAP; + entry->mmiomap = *map; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} +#endif + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c460e85..0ef9ef7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -5,6 +5,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -14,6 +15,8 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_SPECIAL, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, __TRACE_LAST_TYPE }; @@ -75,6 +78,8 @@ struct trace_entry { struct ctx_switch_entry ctx; struct special_entry special; struct stack_entry stack; + struct mmiotrace_rw mmiorw; + struct mmiotrace_map mmiomap; }; }; @@ -255,6 +260,15 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +#ifdef CONFIG_MMIOTRACE +extern void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw); +extern void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map); +#endif + #ifdef CONFIG_FTRACE_STARTUP_TEST #ifdef CONFIG_FTRACE extern int trace_selftest_startup_function(struct tracer *trace, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index e4dd03c..3a12b1a 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -11,19 +11,26 @@ #include "trace.h" -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); - static struct trace_array *mmio_trace_array; +static void mmio_reset_data(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} static void mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; - if (tr->ctrl) + if (tr->ctrl) { + 
mmio_reset_data(tr); enable_mmiotrace(); + } } static void mmio_trace_reset(struct trace_array *tr) @@ -31,15 +38,110 @@ static void mmio_trace_reset(struct trace_array *tr) pr_debug("in %s\n", __func__); if (tr->ctrl) disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; } static void mmio_trace_ctrl_update(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) + if (tr->ctrl) { + mmio_reset_data(tr); enable_mmiotrace(); - else + } else { disable_mmiotrace(); + } +} + +/* XXX: This is not called for trace_pipe file! */ +void mmio_print_header(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + trace_seq_printf(s, "VERSION broken 20070824\n"); + /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ +} + +static int mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, + "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, + "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, + "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, entry->pid); + break; + default: + ret = trace_seq_printf(s, "rw what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +static int mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_map *m = &entry->mmiomap; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, + "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + 0UL, entry->pid); + break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, + "UNMAP %lu.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, entry->pid); + break; + default: + ret = trace_seq_printf(s, "map what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +/* return 0 to abort printing without consuming current entry in pipe mode */ +static int mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + default: + return 1; /* ignore unknown entries */ + } } static struct tracer mmio_tracer __read_mostly = @@ -47,38 +149,31 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, + .open = mmio_print_header, .ctrl_update = mmio_trace_ctrl_update, + .print_line = mmio_print_line, }; __init static int init_mmio_trace(void) { - int ret = init_mmiotrace(); - if (ret) - return ret; return register_tracer(&mmio_tracer); } device_initcall(init_mmio_trace); -void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg) +void 
mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; struct trace_array_cpu *data = tr->data[smp_processor_id()]; + __trace_mmiotrace_rw(tr, data, rw); +} - if (!current || current->pid == 0) { - /* - * XXX: This is a problem. We need to able to record, no - * matter what. tracing_generic_entry_update() would crash. - */ - static unsigned limit; - if (limit++ < 12) - pr_err("Error in %s: no current.\n", __func__); - return; - } - if (!tr || !data) { - static unsigned limit; - if (limit++ < 12) - pr_err("%s: no tr or data\n", __func__); - return; - } - __trace_special(tr, data, type, addr, arg); +void mmio_trace_mapping(struct mmiotrace_map *map) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = tr->data[smp_processor_id()]; + __trace_mmiotrace_map(tr, data, map); + preempt_enable(); } -- cgit v0.10.2 From 138295373ccf7625fcb0218dfea114837983bc39 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: mmiotrace update, #2 another weekend, another patch. This should apply on top of my previous patch from March 23rd. Summary of changes: - Print PCI device list in output header - work around recursive probe hits on SMP - refactor dis/arm_kmmio_fault_page() and add check for page levels - remove un/reference_kmmio(), the die notifier hook is registered permanently into the list - explicitly check for single stepping in die notifier callback I have tested this version on my UP Athlon64 desktop with Nouveau, and SMP Core 2 Duo laptop with the proprietary nvidia driver. Both systems are 64-bit. One previously unknown bug crept into daylight: the ftrace framework's output routines print the first entry last after buffer has wrapped around. The most important regressions compared to non-ftrace mmiotrace at this time are: - failure of trace_pipe file - illegal lines in output file - unaware of losing data due to buffer full Personally I'd like to see these three solved before submitting to mainline. Other issues may come up once we know when we lose events. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index efb4679..cd0d95f 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -5,15 +5,12 @@ * 2008 Pekka Paalanen */ -#include #include #include #include #include #include -#include #include -#include #include #include #include @@ -22,10 +19,9 @@ #include #include #include -#include #include -#include - +#include +#include #include #define KMMIO_PAGE_HASH_BITS 4 @@ -57,14 +53,9 @@ struct kmmio_context { int active; }; -static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, - void *args); - -static DEFINE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); -/* These are protected by kmmio_lock */ -static int kmmio_initialized; +/* Protected by kmmio_lock */ unsigned int kmmio_count; /* Read-protected by RCU, write-protected by kmmio_lock. */ @@ -79,60 +70,6 @@ static struct list_head *kmmio_page_list(unsigned long page) /* Accessed per-cpu */ static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); -/* protected by kmmio_init_mutex */ -static struct notifier_block nb_die = { - .notifier_call = kmmio_die_notifier -}; - -/** - * Makes sure kmmio is initialized and usable. - * This must be called before any other kmmio function defined here. - * May sleep. 
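For orientation, each record is later rendered as a single text line by the mmio_print_rw()/mmio_print_map() callbacks above. With hypothetical timestamps, addresses and pids, the output would look roughly like this (an R/W line is: opcode, width, seconds.microseconds, map id, physical address, value, pc, pid):

	MAP 12.000123 1 0xfd000000 0xffffc20000080000 0x4000 0x0 4207
	W 4 12.000456 1 0xfd000040 0xdeadbeef 0x0 4207
	R 4 12.000789 1 0xfd000040 0xdeadbeef 0x0 4207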
- */ -void reference_kmmio(void) -{ - mutex_lock(&kmmio_init_mutex); - spin_lock_irq(&kmmio_lock); - if (!kmmio_initialized) { - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - if (register_die_notifier(&nb_die)) - BUG(); - } - kmmio_initialized++; - spin_unlock_irq(&kmmio_lock); - mutex_unlock(&kmmio_init_mutex); -} -EXPORT_SYMBOL_GPL(reference_kmmio); - -/** - * Clean up kmmio after use. This must be called for every call to - * reference_kmmio(). All probes registered after the corresponding - * reference_kmmio() must have been unregistered when calling this. - * May sleep. - */ -void unreference_kmmio(void) -{ - bool unreg = false; - - mutex_lock(&kmmio_init_mutex); - spin_lock_irq(&kmmio_lock); - - if (kmmio_initialized == 1) { - BUG_ON(is_kmmio_active()); - unreg = true; - } - kmmio_initialized--; - BUG_ON(kmmio_initialized < 0); - spin_unlock_irq(&kmmio_lock); - - if (unreg) - unregister_die_notifier(&nb_die); /* calls sync_rcu() */ - mutex_unlock(&kmmio_init_mutex); -} -EXPORT_SYMBOL(unreference_kmmio); - /* * this is basically a dynamic stabbing problem: * Could use the existing prio tree code or @@ -167,58 +104,56 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -/** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, int *page_level) +static void set_page_present(unsigned long addr, bool present, int *pglevel) { - unsigned long address = page & PAGE_MASK; + pteval_t pteval; + pmdval_t pmdval; int level; - pte_t *pte = lookup_address(address, &level); + pmd_t *pmd; + pte_t *pte = lookup_address(addr, &level); if (!pte) { - pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", - __func__, page); + pr_err("kmmio: no pte for page 0x%08lx\n", addr); return; } - if (level == PG_LEVEL_2M) { - pmd_t *pmd = (pmd_t *)pte; - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT)); - } else { - /* PG_LEVEL_4K */ - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + if (pglevel) + *pglevel = level; + + switch (level) { + case PG_LEVEL_2M: + pmd = (pmd_t *)pte; + pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + if (present) + pmdval |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(pmdval)); + break; + + case PG_LEVEL_4K: + pteval = pte_val(*pte) & ~_PAGE_PRESENT; + if (present) + pteval |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(pteval)); + break; + + default: + pr_err("kmmio: unexpected page level 0x%x.\n", level); + return; } - if (page_level) - *page_level = level; + __flush_tlb_one(addr); +} - __flush_tlb_one(page); +/** Mark the given page as not present. Access to it will trigger a fault. */ +static void arm_kmmio_fault_page(unsigned long page, int *page_level) +{ + set_page_present(page & PAGE_MASK, false, page_level); } /** Mark the given page as present. 
*/ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { - unsigned long address = page & PAGE_MASK; - int level; - pte_t *pte = lookup_address(address, &level); - - if (!pte) { - pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", - __func__, page); - return; - } - - if (level == PG_LEVEL_2M) { - pmd_t *pmd = (pmd_t *)pte; - set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT)); - } else { - /* PG_LEVEL_4K */ - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - } - - if (page_level) - *page_level = level; - - __flush_tlb_one(page); + set_page_present(page & PAGE_MASK, true, page_level); } /* @@ -240,6 +175,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) { struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ /* * Preemption is now disabled to prevent process switch during @@ -257,21 +193,35 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) /* * Either this page fault is not caused by kmmio, or * another CPU just pulled the kmmio probe from under - * our feet. In the latter case all hell breaks loose. + * our feet. The latter case should not be possible. */ goto no_kmmio; } ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { + disarm_kmmio_fault_page(faultpage->page, NULL); + if (addr == ctx->addr) { + /* + * On SMP we sometimes get recursive probe hits on the + * same address. Context is already saved, fall out. + */ + pr_debug("kmmio: duplicate probe hit on CPU %d, for " + "address 0x%08lx.\n", + smp_processor_id(), addr); + ret = 1; + goto no_kmmio_ctx; + } /* * Prevent overwriting already in-flight context. - * If this page fault really was due to kmmio trap, - * all hell breaks loose. + * This should not happen, let's hope disarming at least + * prevents a panic. */ pr_emerg("kmmio: recursive probe hit on CPU %d, " "for address 0x%08lx. 
Ignoring.\n", smp_processor_id(), addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); goto no_kmmio_ctx; } ctx->active++; @@ -302,14 +252,14 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) */ put_cpu_var(kmmio_ctx); - return 1; + return 1; /* fault handled */ no_kmmio_ctx: put_cpu_var(kmmio_ctx); no_kmmio: rcu_read_unlock(); preempt_enable_no_resched(); - return 0; /* page fault not handled by kmmio */ + return ret; } /* @@ -322,8 +272,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) int ret = 0; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); - if (!ctx->active) + if (!ctx->active) { + pr_debug("kmmio: spurious debug trap on CPU %d.\n", + smp_processor_id()); goto out; + } if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); @@ -525,9 +478,22 @@ static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, { struct die_args *arg = args; - if (val == DIE_DEBUG) + if (val == DIE_DEBUG && (arg->err & DR_STEP)) if (post_kmmio_handler(arg->err, arg->regs) == 1) return NOTIFY_STOP; return NOTIFY_DONE; } + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +static int __init init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); +} +fs_initcall(init_kmmio); /* should be before device_initcall() */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 62abc28..8256546 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -415,8 +415,6 @@ void enable_mmiotrace(void) if (is_enabled()) goto out; - reference_kmmio(); - #if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); @@ -448,7 +446,6 @@ void disable_mmiotrace(void) spin_unlock_irq(&trace_lock); clear_trace_list(); /* guarantees: no more kmmio callbacks */ - unreference_kmmio(); if (marker_file) { debugfs_remove(marker_file); marker_file = NULL; diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index c88a9c1..dd6b64b 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -31,8 +31,6 @@ static inline int is_kmmio_active(void) return kmmio_count; } -extern void reference_kmmio(void); -extern void unreference_kmmio(void); extern int register_kmmio_probe(struct kmmio_probe *p); extern void unregister_kmmio_probe(struct kmmio_probe *p); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3a12b1a..361472b 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -8,6 +8,7 @@ #include #include +#include #include "trace.h" @@ -53,12 +54,52 @@ static void mmio_trace_ctrl_update(struct trace_array *tr) } } +static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int ret = 0; + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + /* XXX: incomplete checks for trace_seq_printf() return value */ + ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? 
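Once the resource loops below have filled in the seven base addresses and seven sizes, a complete PCIDEV header line would look roughly as follows; the device (an imagined card on bus 01 bound to the proprietary nvidia driver) and every number are made up for illustration:

	PCIDEV 0100 10de0191 10 fd000000 d0000008 0 fc000001 0 cc01 0 1000000 10000000 0 2000000 0 80 0 nvidia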
+ */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? + (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + ret += trace_seq_printf(s, " %s\n", drv->name); + else + ret += trace_seq_printf(s, " \n"); + return ret; +} + /* XXX: This is not called for trace_pipe file! */ -void mmio_print_header(struct trace_iterator *iter) +static void mmio_print_header(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - trace_seq_printf(s, "VERSION broken 20070824\n"); - /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ + struct pci_dev *dev = NULL; + + trace_seq_printf(s, "VERSION 20070824\n"); + + for_each_pci_dev(dev) + mmio_print_pcidev(s, dev); + /* XXX: return value? What if header is very long? */ } static int mmio_print_rw(struct trace_iterator *iter) -- cgit v0.10.2 From 801a175bf601f9a9d5e86e92dee9adeeb6625da8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:58 +0200 Subject: mmiotrace: ftrace fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d14fe49..4dcc4e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -838,12 +838,16 @@ void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); entry->type = TRACE_MMIO_RW; entry->mmiorw = *rw; - spin_unlock_irqrestore(&data->lock, irq_flags); + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); trace_wake_up(); } @@ -854,12 +858,16 @@ void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct trace_entry *entry; unsigned long irq_flags; - spin_lock_irqsave(&data->lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&data->lock); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, 0); entry->type = TRACE_MMIO_MAP; entry->mmiomap = *map; - spin_unlock_irqrestore(&data->lock, irq_flags); + + __raw_spin_unlock(&data->lock); + raw_local_irq_restore(irq_flags); trace_wake_up(); } -- cgit v0.10.2 From 49023168261a7f9a2fd4a1ca1adbfea922556015 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:58 +0200 Subject: mmiotrace: cleanup Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index cd0d95f..3ad27b8 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -228,7 +228,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx->fpage = faultpage; ctx->probe = get_kmmio_probe(addr); - ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK)); + ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); ctx->addr = addr; if (ctx->probe && ctx->probe->pre_handler) @@ -238,8 +238,8 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * Enable single-stepping and disable interrupts for the faulting * context. 
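A note on the locking change in the "mmiotrace: ftrace fix" above: presumably the plain spin_lock_irqsave() variants are avoided because they are themselves instrumented (lockdep, preempt accounting and, with some tracers, tracing hooks), so taking them on the trace-recording path could recurse back into the tracer while data->lock is held. The raw primitives bypass that instrumentation; the pattern in isolation, as a restatement rather than additional patch content:

	raw_local_irq_save(irq_flags);	/* no tracing/lockdep hooks fire here */
	__raw_spin_lock(&data->lock);
	/* ... reserve and fill one trace entry ... */
	__raw_spin_unlock(&data->lock);
	raw_local_irq_restore(irq_flags);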
Local interrupts must not get enabled during stepping. */ - regs->flags |= TF_MASK; - regs->flags &= ~IF_MASK; + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); @@ -283,7 +283,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) arm_kmmio_fault_page(ctx->fpage->page, NULL); - regs->flags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; /* These were acquired in kmmio_handler(). */ @@ -297,7 +297,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * will have TF set, in which case, continue the remaining processing * of do_debug, as if this is not a probe hit. */ - if (!(regs->flags & TF_MASK)) + if (!(regs->flags & X86_EFLAGS_TF)) ret = 1; out: put_cpu_var(kmmio_ctx); -- cgit v0.10.2 From ff3a3e9ba5e4273a8bc10570adab4a390fb90757 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:59 +0200 Subject: x86 mmiotrace: move files into arch/x86/mm/. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a51ac15..739d49a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -79,8 +79,6 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_MMIOTRACE) += mmiotrace/ - obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile deleted file mode 100644 index dbcd8d5..0000000 --- a/arch/x86/kernel/mmiotrace/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o -obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-y := pf_in.o mmio-mod.o -obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c deleted file mode 100644 index 3ad27b8..0000000 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ /dev/null @@ -1,499 +0,0 @@ -/* Support for MMIO probes. - * Benfit many code from kprobes - * (C) 2002 Louis Zhuang . - * 2007 Alexander Eichner - * 2008 Pekka Paalanen - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define KMMIO_PAGE_HASH_BITS 4 -#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) - -struct kmmio_fault_page { - struct list_head list; - struct kmmio_fault_page *release_next; - unsigned long page; /* location of the fault page */ - - /* - * Number of times this page has been registered as a part - * of a probe. If zero, page is disarmed and this may be freed. - * Used only by writers (RCU). - */ - int count; -}; - -struct kmmio_delayed_release { - struct rcu_head rcu; - struct kmmio_fault_page *release_list; -}; - -struct kmmio_context { - struct kmmio_fault_page *fpage; - struct kmmio_probe *probe; - unsigned long saved_flags; - unsigned long addr; - int active; -}; - -static DEFINE_SPINLOCK(kmmio_lock); - -/* Protected by kmmio_lock */ -unsigned int kmmio_count; - -/* Read-protected by RCU, write-protected by kmmio_lock. 
*/ -static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; -static LIST_HEAD(kmmio_probes); - -static struct list_head *kmmio_page_list(unsigned long page) -{ - return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; -} - -/* Accessed per-cpu */ -static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); - -/* - * this is basically a dynamic stabbing problem: - * Could use the existing prio tree code or - * Possible better implementations: - * The Interval Skip List: A Data Structure for Finding All Intervals That - * Overlap a Point (might be simple) - * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup - */ -/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ -static struct kmmio_probe *get_kmmio_probe(unsigned long addr) -{ - struct kmmio_probe *p; - list_for_each_entry_rcu(p, &kmmio_probes, list) { - if (addr >= p->addr && addr <= (p->addr + p->len)) - return p; - } - return NULL; -} - -/* You must be holding RCU read lock. */ -static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) -{ - struct list_head *head; - struct kmmio_fault_page *p; - - page &= PAGE_MASK; - head = kmmio_page_list(page); - list_for_each_entry_rcu(p, head, list) { - if (p->page == page) - return p; - } - return NULL; -} - -static void set_page_present(unsigned long addr, bool present, int *pglevel) -{ - pteval_t pteval; - pmdval_t pmdval; - int level; - pmd_t *pmd; - pte_t *pte = lookup_address(addr, &level); - - if (!pte) { - pr_err("kmmio: no pte for page 0x%08lx\n", addr); - return; - } - - if (pglevel) - *pglevel = level; - - switch (level) { - case PG_LEVEL_2M: - pmd = (pmd_t *)pte; - pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; - if (present) - pmdval |= _PAGE_PRESENT; - set_pmd(pmd, __pmd(pmdval)); - break; - - case PG_LEVEL_4K: - pteval = pte_val(*pte) & ~_PAGE_PRESENT; - if (present) - pteval |= _PAGE_PRESENT; - set_pte_atomic(pte, __pte(pteval)); - break; - - default: - pr_err("kmmio: unexpected page level 0x%x.\n", level); - return; - } - - __flush_tlb_one(addr); -} - -/** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, int *page_level) -{ - set_page_present(page & PAGE_MASK, false, page_level); -} - -/** Mark the given page as present. */ -static void disarm_kmmio_fault_page(unsigned long page, int *page_level) -{ - set_page_present(page & PAGE_MASK, true, page_level); -} - -/* - * This is being called from do_page_fault(). - * - * We may be in an interrupt or a critical section. Also prefecthing may - * trigger a page fault. We may be in the middle of process switch. - * We cannot take any locks, because we could be executing especially - * within a kmmio critical section. - * - * Local interrupts are disabled, so preemption cannot happen. - * Do not enable interrupts, do not sleep, and watch out for other CPUs. - */ -/* - * Interrupts are disabled on entry as trap3 is an interrupt gate - * and they remain disabled thorough out this function. - */ -int kmmio_handler(struct pt_regs *regs, unsigned long addr) -{ - struct kmmio_context *ctx; - struct kmmio_fault_page *faultpage; - int ret = 0; /* default to fault not handled */ - - /* - * Preemption is now disabled to prevent process switch during - * single stepping. We can only handle one active kmmio trace - * per cpu, so ensure that we finish it before something else - * gets to run. We also hold the RCU read lock over single - * stepping to avoid looking up the probe and kmmio_fault_page - * again. 
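The comment above describes a deliberately unbalanced locking shape that is worth restating as a whole: both halves of a probe hit form one critical section spanning two exceptions (sketch, assuming the page fault and the debug trap occur back to back on the same CPU):

	/*
	 * #PF on an armed page:  kmmio_handler()
	 *     preempt_disable(); rcu_read_lock();
	 *     run pre_handler(), disarm the page (set _PAGE_PRESENT),
	 *     set EFLAGS.TF, clear EFLAGS.IF
	 *     ... the faulting instruction re-executes, one single step ...
	 * #DB after the step:    post_kmmio_handler()
	 *     run post_handler(), re-arm the page (clear _PAGE_PRESENT),
	 *     restore the saved TF/IF flags,
	 *     rcu_read_unlock(); preempt_enable_no_resched();
	 */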
- */ - preempt_disable(); - rcu_read_lock(); - - faultpage = get_kmmio_fault_page(addr); - if (!faultpage) { - /* - * Either this page fault is not caused by kmmio, or - * another CPU just pulled the kmmio probe from under - * our feet. The latter case should not be possible. - */ - goto no_kmmio; - } - - ctx = &get_cpu_var(kmmio_ctx); - if (ctx->active) { - disarm_kmmio_fault_page(faultpage->page, NULL); - if (addr == ctx->addr) { - /* - * On SMP we sometimes get recursive probe hits on the - * same address. Context is already saved, fall out. - */ - pr_debug("kmmio: duplicate probe hit on CPU %d, for " - "address 0x%08lx.\n", - smp_processor_id(), addr); - ret = 1; - goto no_kmmio_ctx; - } - /* - * Prevent overwriting already in-flight context. - * This should not happen, let's hope disarming at least - * prevents a panic. - */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " - "for address 0x%08lx. Ignoring.\n", - smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); - goto no_kmmio_ctx; - } - ctx->active++; - - ctx->fpage = faultpage; - ctx->probe = get_kmmio_probe(addr); - ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - ctx->addr = addr; - - if (ctx->probe && ctx->probe->pre_handler) - ctx->probe->pre_handler(ctx->probe, regs, addr); - - /* - * Enable single-stepping and disable interrupts for the faulting - * context. Local interrupts must not get enabled during stepping. - */ - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - - /* Now we set present bit in PTE and single step. */ - disarm_kmmio_fault_page(ctx->fpage->page, NULL); - - /* - * If another cpu accesses the same page while we are stepping, - * the access will not be caught. It will simply succeed and the - * only downside is we lose the event. If this becomes a problem, - * the user should drop to single cpu before tracing. - */ - - put_cpu_var(kmmio_ctx); - return 1; /* fault handled */ - -no_kmmio_ctx: - put_cpu_var(kmmio_ctx); -no_kmmio: - rcu_read_unlock(); - preempt_enable_no_resched(); - return ret; -} - -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate - * and they remain disabled thorough out this function. - * This must always get called as the pair to kmmio_handler(). - */ -static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) -{ - int ret = 0; - struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); - - if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", - smp_processor_id()); - goto out; - } - - if (ctx->probe && ctx->probe->post_handler) - ctx->probe->post_handler(ctx->probe, condition, regs); - - arm_kmmio_fault_page(ctx->fpage->page, NULL); - - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= ctx->saved_flags; - - /* These were acquired in kmmio_handler(). */ - ctx->active--; - BUG_ON(ctx->active); - rcu_read_unlock(); - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, flags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (!(regs->flags & X86_EFLAGS_TF)) - ret = 1; -out: - put_cpu_var(kmmio_ctx); - return ret; -} - -/* You must be holding kmmio_lock. 
*/ -static int add_kmmio_fault_page(unsigned long page) -{ - struct kmmio_fault_page *f; - - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); - if (f) { - if (!f->count) - arm_kmmio_fault_page(f->page, NULL); - f->count++; - return 0; - } - - f = kmalloc(sizeof(*f), GFP_ATOMIC); - if (!f) - return -1; - - f->count = 1; - f->page = page; - list_add_rcu(&f->list, kmmio_page_list(f->page)); - - arm_kmmio_fault_page(f->page, NULL); - - return 0; -} - -/* You must be holding kmmio_lock. */ -static void release_kmmio_fault_page(unsigned long page, - struct kmmio_fault_page **release_list) -{ - struct kmmio_fault_page *f; - - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); - if (!f) - return; - - f->count--; - BUG_ON(f->count < 0); - if (!f->count) { - disarm_kmmio_fault_page(f->page, NULL); - f->release_next = *release_list; - *release_list = f; - } -} - -int register_kmmio_probe(struct kmmio_probe *p) -{ - unsigned long flags; - int ret = 0; - unsigned long size = 0; - - spin_lock_irqsave(&kmmio_lock, flags); - if (get_kmmio_probe(p->addr)) { - ret = -EEXIST; - goto out; - } - kmmio_count++; - list_add_rcu(&p->list, &kmmio_probes); - while (size < p->len) { - if (add_kmmio_fault_page(p->addr + size)) - pr_err("kmmio: Unable to set page fault.\n"); - size += PAGE_SIZE; - } -out: - spin_unlock_irqrestore(&kmmio_lock, flags); - /* - * XXX: What should I do here? - * Here was a call to global_flush_tlb(), but it does not exist - * anymore. It seems it's not needed after all. - */ - return ret; -} -EXPORT_SYMBOL(register_kmmio_probe); - -static void rcu_free_kmmio_fault_pages(struct rcu_head *head) -{ - struct kmmio_delayed_release *dr = container_of( - head, - struct kmmio_delayed_release, - rcu); - struct kmmio_fault_page *p = dr->release_list; - while (p) { - struct kmmio_fault_page *next = p->release_next; - BUG_ON(p->count); - kfree(p); - p = next; - } - kfree(dr); -} - -static void remove_kmmio_fault_pages(struct rcu_head *head) -{ - struct kmmio_delayed_release *dr = container_of( - head, - struct kmmio_delayed_release, - rcu); - struct kmmio_fault_page *p = dr->release_list; - struct kmmio_fault_page **prevp = &dr->release_list; - unsigned long flags; - spin_lock_irqsave(&kmmio_lock, flags); - while (p) { - if (!p->count) - list_del_rcu(&p->list); - else - *prevp = p->release_next; - prevp = &p->release_next; - p = p->release_next; - } - spin_unlock_irqrestore(&kmmio_lock, flags); - /* This is the real RCU destroy call. */ - call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); -} - -/* - * Remove a kmmio probe. You have to synchronize_rcu() before you can be - * sure that the callbacks will not be called anymore. Only after that - * you may actually release your struct kmmio_probe. - * - * Unregistering a kmmio fault page has three steps: - * 1. release_kmmio_fault_page() - * Disarm the page, wait a grace period to let all faults finish. - * 2. remove_kmmio_fault_pages() - * Remove the pages from kmmio_page_table. - * 3. rcu_free_kmmio_fault_pages() - * Actally free the kmmio_fault_page structs as with RCU. 
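The three-step teardown described above chains two RCU grace periods; schematically (comments only, no new code):

	/*
	 * unregister_kmmio_probe()
	 *   release_kmmio_fault_page()          disarm pages, collect on a list
	 *   call_rcu(remove_kmmio_fault_pages)
	 *     -> grace period #1: in-flight faults have finished, so the pages
	 *        can be list_del_rcu()'d from kmmio_page_table
	 *   call_rcu(rcu_free_kmmio_fault_pages)
	 *     -> grace period #2: no reader can still see the list entries, so
	 *        the kmmio_fault_page structs can finally be kfree()'d
	 */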
- */ -void unregister_kmmio_probe(struct kmmio_probe *p) -{ - unsigned long flags; - unsigned long size = 0; - struct kmmio_fault_page *release_list = NULL; - struct kmmio_delayed_release *drelease; - - spin_lock_irqsave(&kmmio_lock, flags); - while (size < p->len) { - release_kmmio_fault_page(p->addr + size, &release_list); - size += PAGE_SIZE; - } - list_del_rcu(&p->list); - kmmio_count--; - spin_unlock_irqrestore(&kmmio_lock, flags); - - drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); - if (!drelease) { - pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); - return; - } - drelease->release_list = release_list; - - /* - * This is not really RCU here. We have just disarmed a set of - * pages so that they cannot trigger page faults anymore. However, - * we cannot remove the pages from kmmio_page_table, - * because a probe hit might be in flight on another CPU. The - * pages are collected into a list, and they will be removed from - * kmmio_page_table when it is certain that no probe hit related to - * these pages can be in flight. RCU grace period sounds like a - * good choice. - * - * If we removed the pages too early, kmmio page fault handler might - * not find the respective kmmio_fault_page and determine it's not - * a kmmio fault, when it actually is. This would lead to madness. - */ - call_rcu(&drelease->rcu, remove_kmmio_fault_pages); -} -EXPORT_SYMBOL(unregister_kmmio_probe); - -static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, - void *args) -{ - struct die_args *arg = args; - - if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) - return NOTIFY_STOP; - - return NOTIFY_DONE; -} - -static struct notifier_block nb_die = { - .notifier_call = kmmio_die_notifier -}; - -static int __init init_kmmio(void) -{ - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - return register_die_notifier(&nb_die); -} -fs_initcall(init_kmmio); /* should be before device_initcall() */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c deleted file mode 100644 index 8256546..0000000 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ /dev/null @@ -1,457 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005 - * Jeff Muizelaar, 2006, 2007 - * Pekka Paalanen, 2008 - * - * Derived from the read-mod example from relay-examples by Tom Zanussi. 
- */ -#define DEBUG 1 - -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for ISA_START_ADDRESS */ -#include -#include - -#include "pf_in.h" - -#define NAME "mmiotrace: " - -struct trap_reason { - unsigned long addr; - unsigned long ip; - enum reason_type type; - int active_traces; -}; - -struct remap_trace { - struct list_head list; - struct kmmio_probe probe; - unsigned long phys; - unsigned long id; -}; - -/* Accessed per-cpu. */ -static DEFINE_PER_CPU(struct trap_reason, pf_reason); -static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); - -#if 0 /* XXX: no way gather this info anymore */ -/* Access to this is not per-cpu. */ -static DEFINE_PER_CPU(atomic_t, dropped); -#endif - -static struct dentry *marker_file; - -static DEFINE_MUTEX(mmiotrace_mutex); -static DEFINE_SPINLOCK(trace_lock); -static atomic_t mmiotrace_enabled; -static LIST_HEAD(trace_list); /* struct remap_trace */ - -/* - * Locking in this file: - * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. - * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex - * and trace_lock. - * - Routines depending on is_enabled() must take trace_lock. - * - trace_list users must hold trace_lock. - * - is_enabled() guarantees that mmio_trace_record is allowed. - * - pre/post callbacks assume the effect of is_enabled() being true. - */ - -/* module parameters */ -static unsigned long filter_offset; -static int nommiotrace; -static int ISA_trace; -static int trace_pc; - -module_param(filter_offset, ulong, 0); -module_param(nommiotrace, bool, 0); -module_param(ISA_trace, bool, 0); -module_param(trace_pc, bool, 0); - -MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); -MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); -MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); -MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); - -static bool is_enabled(void) -{ - return atomic_read(&mmiotrace_enabled); -} - -#if 0 /* XXX: needs rewrite */ -/* - * Write callback for the debugfs entry: - * Read a marker and write it to the mmio trace log - */ -static ssize_t write_marker(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *event = NULL; - struct mm_io_header *headp; - ssize_t len = (count > 65535) ? 65535 : count; - - event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); - if (!event) - return -ENOMEM; - - headp = (struct mm_io_header *)event; - headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); - headp->data_len = len; - - if (copy_from_user(event + sizeof(*headp), buffer, len)) { - kfree(event); - return -EFAULT; - } - - spin_lock_irq(&trace_lock); -#if 0 /* XXX: convert this to use tracing */ - if (is_enabled()) - relay_write(chan, event, sizeof(*headp) + len); - else -#endif - len = -EINVAL; - spin_unlock_irq(&trace_lock); - kfree(event); - return len; -} -#endif - -static void print_pte(unsigned long address) -{ - int level; - pte_t *pte = lookup_address(address, &level); - - if (!pte) { - pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", - __func__, address); - return; - } - - if (level == PG_LEVEL_2M) { - pr_emerg(NAME "4MB pages are not currently supported: " - "0x%08lx\n", address); - BUG(); - } - pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), - pte_val(*pte) & _PAGE_PRESENT); -} - -/* - * For some reason the pre/post pairs have been called in an - * unmatched order. Report and die. 
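Since mmio-mod.c is built into the kernel here rather than loaded as a module, the module_param() knobs above would be set on the kernel command line. Assuming the built-in object keeps the mmiotrace name from the Makefile (an assumption, not something the patch states), and with a hypothetical address, that would look like:

	mmiotrace.trace_pc=1 mmiotrace.filter_offset=0xfd000000

(filter_offset restricts tracing to a single mapping; trace_pc additionally records the faulting instruction pointer.)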
- */ -static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) -{ - const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - pr_emerg(NAME "unexpected fault for address: 0x%08lx, " - "last fault for address: 0x%08lx\n", - addr, my_reason->addr); - print_pte(addr); - print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); -#ifdef __i386__ - pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); - pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); -#else - pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", - regs->ax, regs->cx, regs->dx); - pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", - regs->si, regs->di, regs->bp, regs->sp); -#endif - put_cpu_var(pf_reason); - BUG(); -} - -static void pre(struct kmmio_probe *p, struct pt_regs *regs, - unsigned long addr) -{ - struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); - const unsigned long instptr = instruction_pointer(regs); - const enum reason_type type = get_ins_type(instptr); - struct remap_trace *trace = p->user_data; - - /* it doesn't make sense to have more than one active trace per cpu */ - if (my_reason->active_traces) - die_kmmio_nesting_error(regs, addr); - else - my_reason->active_traces++; - - my_reason->type = type; - my_reason->addr = addr; - my_reason->ip = instptr; - - my_trace->phys = addr - trace->probe.addr + trace->phys; - my_trace->map_id = trace->id; - - /* - * Only record the program counter when requested. - * It may taint clean-room reverse engineering. - */ - if (trace_pc) - my_trace->pc = instptr; - else - my_trace->pc = 0; - - /* - * XXX: the timestamp recorded will be *after* the tracing has been - * done, not at the time we hit the instruction. SMP implications - * on event ordering? 
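Before the switch below runs, get_ins_type() has already classified the faulting instruction; summarizing how pre() fills the record for each class (a restatement of the code, not new behavior):

	/*
	 * REG_READ:   opcode = MMIO_READ, width from the instruction; the
	 *             value is only known in post(), after the read happened
	 * REG_WRITE:  opcode = MMIO_WRITE, value pulled from the source reg
	 * IMM_WRITE:  opcode = MMIO_WRITE, value from the immediate operand
	 * other:      opcode = MMIO_UNKNOWN_OP, value = first 3 opcode bytes
	 */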
- */ - - switch (type) { - case REG_READ: - my_trace->opcode = MMIO_READ; - my_trace->width = get_ins_mem_width(instptr); - break; - case REG_WRITE: - my_trace->opcode = MMIO_WRITE; - my_trace->width = get_ins_mem_width(instptr); - my_trace->value = get_ins_reg_val(instptr, regs); - break; - case IMM_WRITE: - my_trace->opcode = MMIO_WRITE; - my_trace->width = get_ins_mem_width(instptr); - my_trace->value = get_ins_imm_val(instptr); - break; - default: - { - unsigned char *ip = (unsigned char *)instptr; - my_trace->opcode = MMIO_UNKNOWN_OP; - my_trace->width = 0; - my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | - *(ip + 2); - } - } - put_cpu_var(cpu_trace); - put_cpu_var(pf_reason); -} - -static void post(struct kmmio_probe *p, unsigned long condition, - struct pt_regs *regs) -{ - struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); - - /* this should always return the active_trace count to 0 */ - my_reason->active_traces--; - if (my_reason->active_traces) { - pr_emerg(NAME "unexpected post handler"); - BUG(); - } - - switch (my_reason->type) { - case REG_READ: - my_trace->value = get_ins_reg_val(my_reason->ip, regs); - break; - default: - break; - } - - mmio_trace_rw(my_trace); - put_cpu_var(cpu_trace); - put_cpu_var(pf_reason); -} - -static void ioremap_trace_core(unsigned long offset, unsigned long size, - void __iomem *addr) -{ - static atomic_t next_id; - struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); - struct mmiotrace_map map = { - .phys = offset, - .virt = (unsigned long)addr, - .len = size, - .opcode = MMIO_PROBE - }; - - if (!trace) { - pr_err(NAME "kmalloc failed in ioremap\n"); - return; - } - - *trace = (struct remap_trace) { - .probe = { - .addr = (unsigned long)addr, - .len = size, - .pre_handler = pre, - .post_handler = post, - .user_data = trace - }, - .phys = offset, - .id = atomic_inc_return(&next_id) - }; - map.map_id = trace->id; - - spin_lock_irq(&trace_lock); - if (!is_enabled()) - goto not_enabled; - - mmio_trace_mapping(&map); - list_add_tail(&trace->list, &trace_list); - if (!nommiotrace) - register_kmmio_probe(&trace->probe); - -not_enabled: - spin_unlock_irq(&trace_lock); -} - -void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) -{ - if (!is_enabled()) /* recheck and proper locking in *_core() */ - return; - - pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); - if ((filter_offset) && (offset != filter_offset)) - return; - ioremap_trace_core(offset, size, addr); -} - -static void iounmap_trace_core(volatile void __iomem *addr) -{ - struct mmiotrace_map map = { - .phys = 0, - .virt = (unsigned long)addr, - .len = 0, - .opcode = MMIO_UNPROBE - }; - struct remap_trace *trace; - struct remap_trace *tmp; - struct remap_trace *found_trace = NULL; - - pr_debug(NAME "Unmapping %p.\n", addr); - - spin_lock_irq(&trace_lock); - if (!is_enabled()) - goto not_enabled; - - list_for_each_entry_safe(trace, tmp, &trace_list, list) { - if ((unsigned long)addr == trace->probe.addr) { - if (!nommiotrace) - unregister_kmmio_probe(&trace->probe); - list_del(&trace->list); - found_trace = trace; - break; - } - } - map.map_id = (found_trace) ? 
found_trace->id : -1; - mmio_trace_mapping(&map); - -not_enabled: - spin_unlock_irq(&trace_lock); - if (found_trace) { - synchronize_rcu(); /* unregister_kmmio_probe() requirement */ - kfree(found_trace); - } -} - -void mmiotrace_iounmap(volatile void __iomem *addr) -{ - might_sleep(); - if (is_enabled()) /* recheck and proper locking in *_core() */ - iounmap_trace_core(addr); -} - -static void clear_trace_list(void) -{ - struct remap_trace *trace; - struct remap_trace *tmp; - - /* - * No locking required, because the caller ensures we are in a - * critical section via mutex, and is_enabled() is false, - * i.e. nothing can traverse or modify this list. - * Caller also ensures is_enabled() cannot change. - */ - list_for_each_entry(trace, &trace_list, list) { - pr_notice(NAME "purging non-iounmapped " - "trace @0x%08lx, size 0x%lx.\n", - trace->probe.addr, trace->probe.len); - if (!nommiotrace) - unregister_kmmio_probe(&trace->probe); - } - synchronize_rcu(); /* unregister_kmmio_probe() requirement */ - - list_for_each_entry_safe(trace, tmp, &trace_list, list) { - list_del(&trace->list); - kfree(trace); - } -} - -#if 0 /* XXX: out of order */ -static struct file_operations fops_marker = { - .owner = THIS_MODULE, - .write = write_marker -}; -#endif - -void enable_mmiotrace(void) -{ - mutex_lock(&mmiotrace_mutex); - if (is_enabled()) - goto out; - -#if 0 /* XXX: tracing does not support text entries */ - marker_file = debugfs_create_file("marker", 0660, dir, NULL, - &fops_marker); - if (!marker_file) - pr_err(NAME "marker file creation failed.\n"); -#endif - - if (nommiotrace) - pr_info(NAME "MMIO tracing disabled.\n"); - if (ISA_trace) - pr_warning(NAME "Warning! low ISA range will be traced.\n"); - spin_lock_irq(&trace_lock); - atomic_inc(&mmiotrace_enabled); - spin_unlock_irq(&trace_lock); - pr_info(NAME "enabled.\n"); -out: - mutex_unlock(&mmiotrace_mutex); -} - -void disable_mmiotrace(void) -{ - mutex_lock(&mmiotrace_mutex); - if (!is_enabled()) - goto out; - - spin_lock_irq(&trace_lock); - atomic_dec(&mmiotrace_enabled); - BUG_ON(is_enabled()); - spin_unlock_irq(&trace_lock); - - clear_trace_list(); /* guarantees: no more kmmio callbacks */ - if (marker_file) { - debugfs_remove(marker_file); - marker_file = NULL; - } - - pr_info(NAME "disabled.\n"); -out: - mutex_unlock(&mmiotrace_mutex); -} diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c deleted file mode 100644 index efa1911..0000000 --- a/arch/x86/kernel/mmiotrace/pf_in.c +++ /dev/null @@ -1,489 +0,0 @@ -/* - * Fault Injection Test harness (FI) - * Copyright (C) Intel Crop. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, - * USA. 
- * - */ - -/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp - * Copyright by Intel Crop., 2002 - * Louis Zhuang (louis.zhuang@intel.com) - * - * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 - */ - -#include -#include /* struct pt_regs */ -#include "pf_in.h" - -#ifdef __i386__ -/* IA32 Manual 3, 2-1 */ -static unsigned char prefix_codes[] = { - 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, - 0x65, 0x2E, 0x3E, 0x66, 0x67 -}; -/* IA32 Manual 3, 3-432*/ -static unsigned int reg_rop[] = { - 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F -}; -static unsigned int reg_wop[] = { 0x88, 0x89 }; -static unsigned int imm_wop[] = { 0xC6, 0xC7 }; -/* IA32 Manual 3, 3-432*/ -static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; -static unsigned int rw32[] = { - 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F -}; -static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; -static unsigned int mw16[] = { 0xB70F, 0xBF0F }; -static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; -static unsigned int mw64[] = {}; -#else /* not __i386__ */ -static unsigned char prefix_codes[] = { - 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, - 0xF0, 0xF3, 0xF2, - /* REX Prefixes */ - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, - 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f -}; -/* AMD64 Manual 3, Appendix A*/ -static unsigned int reg_rop[] = { - 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F -}; -static unsigned int reg_wop[] = { 0x88, 0x89 }; -static unsigned int imm_wop[] = { 0xC6, 0xC7 }; -static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; -static unsigned int rw32[] = { - 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F -}; -/* 8 bit only */ -static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; -/* 16 bit only */ -static unsigned int mw16[] = { 0xB70F, 0xBF0F }; -/* 16 or 32 bit */ -static unsigned int mw32[] = { 0xC7 }; -/* 16, 32 or 64 bit */ -static unsigned int mw64[] = { 0x89, 0x8B }; -#endif /* not __i386__ */ - -static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, - int *rexr) -{ - int i; - unsigned char *p = addr; - *shorted = 0; - *enlarged = 0; - *rexr = 0; - -restart: - for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { - if (*p == prefix_codes[i]) { - if (*p == 0x66) - *shorted = 1; -#ifdef __amd64__ - if ((*p & 0xf8) == 0x48) - *enlarged = 1; - if ((*p & 0xf4) == 0x44) - *rexr = 1; -#endif - p++; - goto restart; - } - } - - return (p - addr); -} - -static int get_opcode(unsigned char *addr, unsigned int *opcode) -{ - int len; - - if (*addr == 0x0F) { - /* 0x0F is extension instruction */ - *opcode = *(unsigned short *)addr; - len = 2; - } else { - *opcode = *addr; - len = 1; - } - - return len; -} - -#define CHECK_OP_TYPE(opcode, array, type) \ - for (i = 0; i < ARRAY_SIZE(array); i++) { \ - if (array[i] == opcode) { \ - rv = type; \ - goto exit; \ - } \ - } - -enum reason_type get_ins_type(unsigned long ins_addr) -{ - unsigned int opcode; - unsigned char *p; - int shorted, enlarged, rexr; - int i; - enum reason_type rv = OTHERS; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += get_opcode(p, &opcode); - - CHECK_OP_TYPE(opcode, reg_rop, REG_READ); - CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); - CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); - -exit: - return rv; -} -#undef CHECK_OP_TYPE - -static unsigned int get_ins_reg_width(unsigned long ins_addr) -{ - unsigned int opcode; - unsigned char *p; - int i, shorted, enlarged, rexr; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += 
get_opcode(p, &opcode); - - for (i = 0; i < ARRAY_SIZE(rw8); i++) - if (rw8[i] == opcode) - return 1; - - for (i = 0; i < ARRAY_SIZE(rw32); i++) - if (rw32[i] == opcode) - return (shorted ? 2 : (enlarged ? 8 : 4)); - - printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); - return 0; -} - -unsigned int get_ins_mem_width(unsigned long ins_addr) -{ - unsigned int opcode; - unsigned char *p; - int i, shorted, enlarged, rexr; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += get_opcode(p, &opcode); - - for (i = 0; i < ARRAY_SIZE(mw8); i++) - if (mw8[i] == opcode) - return 1; - - for (i = 0; i < ARRAY_SIZE(mw16); i++) - if (mw16[i] == opcode) - return 2; - - for (i = 0; i < ARRAY_SIZE(mw32); i++) - if (mw32[i] == opcode) - return shorted ? 2 : 4; - - for (i = 0; i < ARRAY_SIZE(mw64); i++) - if (mw64[i] == opcode) - return shorted ? 2 : (enlarged ? 8 : 4); - - printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); - return 0; -} - -/* - * Define register ident in mod/rm byte. - * Note: these are NOT the same as in ptrace-abi.h. - */ -enum { - arg_AL = 0, - arg_CL = 1, - arg_DL = 2, - arg_BL = 3, - arg_AH = 4, - arg_CH = 5, - arg_DH = 6, - arg_BH = 7, - - arg_AX = 0, - arg_CX = 1, - arg_DX = 2, - arg_BX = 3, - arg_SP = 4, - arg_BP = 5, - arg_SI = 6, - arg_DI = 7, -#ifdef __amd64__ - arg_R8 = 8, - arg_R9 = 9, - arg_R10 = 10, - arg_R11 = 11, - arg_R12 = 12, - arg_R13 = 13, - arg_R14 = 14, - arg_R15 = 15 -#endif -}; - -static unsigned char *get_reg_w8(int no, struct pt_regs *regs) -{ - unsigned char *rv = NULL; - - switch (no) { - case arg_AL: - rv = (unsigned char *)®s->ax; - break; - case arg_BL: - rv = (unsigned char *)®s->bx; - break; - case arg_CL: - rv = (unsigned char *)®s->cx; - break; - case arg_DL: - rv = (unsigned char *)®s->dx; - break; - case arg_AH: - rv = 1 + (unsigned char *)®s->ax; - break; - case arg_BH: - rv = 1 + (unsigned char *)®s->bx; - break; - case arg_CH: - rv = 1 + (unsigned char *)®s->cx; - break; - case arg_DH: - rv = 1 + (unsigned char *)®s->dx; - break; -#ifdef __amd64__ - case arg_R8: - rv = (unsigned char *)®s->r8; - break; - case arg_R9: - rv = (unsigned char *)®s->r9; - break; - case arg_R10: - rv = (unsigned char *)®s->r10; - break; - case arg_R11: - rv = (unsigned char *)®s->r11; - break; - case arg_R12: - rv = (unsigned char *)®s->r12; - break; - case arg_R13: - rv = (unsigned char *)®s->r13; - break; - case arg_R14: - rv = (unsigned char *)®s->r14; - break; - case arg_R15: - rv = (unsigned char *)®s->r15; - break; -#endif - default: - printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); - break; - } - return rv; -} - -static unsigned long *get_reg_w32(int no, struct pt_regs *regs) -{ - unsigned long *rv = NULL; - - switch (no) { - case arg_AX: - rv = ®s->ax; - break; - case arg_BX: - rv = ®s->bx; - break; - case arg_CX: - rv = ®s->cx; - break; - case arg_DX: - rv = ®s->dx; - break; - case arg_SP: - rv = ®s->sp; - break; - case arg_BP: - rv = ®s->bp; - break; - case arg_SI: - rv = ®s->si; - break; - case arg_DI: - rv = ®s->di; - break; -#ifdef __amd64__ - case arg_R8: - rv = ®s->r8; - break; - case arg_R9: - rv = ®s->r9; - break; - case arg_R10: - rv = ®s->r10; - break; - case arg_R11: - rv = ®s->r11; - break; - case arg_R12: - rv = ®s->r12; - break; - case arg_R13: - rv = ®s->r13; - break; - case arg_R14: - rv = ®s->r14; - break; - case arg_R15: - rv = ®s->r15; - break; -#endif - default: - printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); - } - - return rv; -} - -unsigned long 
get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) -{ - unsigned int opcode; - unsigned char mod_rm; - int reg; - unsigned char *p; - int i, shorted, enlarged, rexr; - unsigned long rv; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += get_opcode(p, &opcode); - for (i = 0; i < ARRAY_SIZE(reg_rop); i++) - if (reg_rop[i] == opcode) { - rv = REG_READ; - goto do_work; - } - - for (i = 0; i < ARRAY_SIZE(reg_wop); i++) - if (reg_wop[i] == opcode) { - rv = REG_WRITE; - goto do_work; - } - - printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " - "0x%02x\n", opcode); - goto err; - -do_work: - mod_rm = *p; - reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); - switch (get_ins_reg_width(ins_addr)) { - case 1: - return *get_reg_w8(reg, regs); - - case 2: - return *(unsigned short *)get_reg_w32(reg, regs); - - case 4: - return *(unsigned int *)get_reg_w32(reg, regs); - -#ifdef __amd64__ - case 8: - return *(unsigned long *)get_reg_w32(reg, regs); -#endif - - default: - printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); - } - -err: - return 0; -} - -unsigned long get_ins_imm_val(unsigned long ins_addr) -{ - unsigned int opcode; - unsigned char mod_rm; - unsigned char mod; - unsigned char *p; - int i, shorted, enlarged, rexr; - unsigned long rv; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += get_opcode(p, &opcode); - for (i = 0; i < ARRAY_SIZE(imm_wop); i++) - if (imm_wop[i] == opcode) { - rv = IMM_WRITE; - goto do_work; - } - - printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " - "0x%02x\n", opcode); - goto err; - -do_work: - mod_rm = *p; - mod = mod_rm >> 6; - p++; - switch (mod) { - case 0: - /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ - /* AMD64: XXX Check for address size prefix? */ - if ((mod_rm & 0x7) == 0x5) - p += 4; - break; - - case 1: - p += 1; - break; - - case 2: - p += 4; - break; - - case 3: - default: - printk(KERN_ERR "mmiotrace: not a memory access instruction " - "at 0x%lx, rm_mod=0x%02x\n", - ins_addr, mod_rm); - } - - switch (get_ins_reg_width(ins_addr)) { - case 1: - return *(unsigned char *)p; - - case 2: - return *(unsigned short *)p; - - case 4: - return *(unsigned int *)p; - -#ifdef __amd64__ - case 8: - return *(unsigned long *)p; -#endif - - default: - printk(KERN_ERR "mmiotrace: Error: width.\n"); - } - -err: - return 0; -} diff --git a/arch/x86/kernel/mmiotrace/pf_in.h b/arch/x86/kernel/mmiotrace/pf_in.h deleted file mode 100644 index e05341a..0000000 --- a/arch/x86/kernel/mmiotrace/pf_in.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Fault Injection Test harness (FI) - * Copyright (C) Intel Crop. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, - * USA. 
- * - */ - -#ifndef __PF_H_ -#define __PF_H_ - -enum reason_type { - NOT_ME, /* page fault is not in regions */ - NOTHING, /* access others point in regions */ - REG_READ, /* read from addr to reg */ - REG_WRITE, /* write from reg to addr */ - IMM_WRITE, /* write from imm to addr */ - OTHERS /* Other instructions can not intercept */ -}; - -enum reason_type get_ins_type(unsigned long ins_addr); -unsigned int get_ins_mem_width(unsigned long ins_addr); -unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); -unsigned long get_ins_imm_val(unsigned long ins_addr); - -#endif /* __PF_H_ */ diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c deleted file mode 100644 index cfa60b2..0000000 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Written by Pekka Paalanen, 2008 - */ -#include -#include - -#define MODULE_NAME "testmmiotrace" - -static unsigned long mmio_address; -module_param(mmio_address, ulong, 0); -MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); - -static void do_write_test(void __iomem *p) -{ - unsigned int i; - for (i = 0; i < 256; i++) - iowrite8(i, p + i); - for (i = 1024; i < (5 * 1024); i += 2) - iowrite16(i * 12 + 7, p + i); - for (i = (5 * 1024); i < (16 * 1024); i += 4) - iowrite32(i * 212371 + 13, p + i); -} - -static void do_read_test(void __iomem *p) -{ - unsigned int i; - for (i = 0; i < 256; i++) - ioread8(p + i); - for (i = 1024; i < (5 * 1024); i += 2) - ioread16(p + i); - for (i = (5 * 1024); i < (16 * 1024); i += 4) - ioread32(p + i); -} - -static void do_test(void) -{ - void __iomem *p = ioremap_nocache(mmio_address, 0x4000); - if (!p) { - pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); - return; - } - do_write_test(p); - do_read_test(p); - iounmap(p); -} - -static int __init init(void) -{ - if (mmio_address == 0) { - pr_err(MODULE_NAME ": you have to use the module argument " - "mmio_address.\n"); - pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" - " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); - return -ENXIO; - } - - pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " - "in PCI address space, and writing " - "rubbish in there.\n", mmio_address); - do_test(); - return 0; -} - -static void __exit cleanup(void) -{ - pr_debug(MODULE_NAME ": unloaded.\n"); -} - -module_init(init); -module_exit(cleanup); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index b7b3e4c..07dab50 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -8,6 +8,11 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o +obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-y := pf_in.o mmio-mod.o +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o + ifeq ($(CONFIG_X86_32),y) obj-$(CONFIG_NUMA) += discontig_32.o else diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c new file mode 100644 index 0000000..3ad27b8 --- /dev/null +++ b/arch/x86/mm/kmmio.c @@ -0,0 +1,499 @@ +/* Support for MMIO probes. + Borrows much code from kprobes + (C) 2002 Louis Zhuang .
+ * 2007 Alexander Eichner + * 2008 Pekka Paalanen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KMMIO_PAGE_HASH_BITS 4 +#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) + +struct kmmio_fault_page { + struct list_head list; + struct kmmio_fault_page *release_next; + unsigned long page; /* location of the fault page */ + + /* + * Number of times this page has been registered as a part + * of a probe. If zero, page is disarmed and this may be freed. + * Used only by writers (RCU). + */ + int count; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + +struct kmmio_context { + struct kmmio_fault_page *fpage; + struct kmmio_probe *probe; + unsigned long saved_flags; + unsigned long addr; + int active; +}; + +static DEFINE_SPINLOCK(kmmio_lock); + +/* Protected by kmmio_lock */ +unsigned int kmmio_count; + +/* Read-protected by RCU, write-protected by kmmio_lock. */ +static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; +static LIST_HEAD(kmmio_probes); + +static struct list_head *kmmio_page_list(unsigned long page) +{ + return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; +} + +/* Accessed per-cpu */ +static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); + +/* + * this is basically a dynamic stabbing problem: + * Could use the existing prio tree code or + * Possible better implementations: + * The Interval Skip List: A Data Structure for Finding All Intervals That + * Overlap a Point (might be simple) + * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup + */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ +static struct kmmio_probe *get_kmmio_probe(unsigned long addr) +{ + struct kmmio_probe *p; + list_for_each_entry_rcu(p, &kmmio_probes, list) { + if (addr >= p->addr && addr <= (p->addr + p->len)) + return p; + } + return NULL; +} + +/* You must be holding RCU read lock. */ +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +{ + struct list_head *head; + struct kmmio_fault_page *p; + + page &= PAGE_MASK; + head = kmmio_page_list(page); + list_for_each_entry_rcu(p, head, list) { + if (p->page == page) + return p; + } + return NULL; +} + +static void set_page_present(unsigned long addr, bool present, int *pglevel) +{ + pteval_t pteval; + pmdval_t pmdval; + int level; + pmd_t *pmd; + pte_t *pte = lookup_address(addr, &level); + + if (!pte) { + pr_err("kmmio: no pte for page 0x%08lx\n", addr); + return; + } + + if (pglevel) + *pglevel = level; + + switch (level) { + case PG_LEVEL_2M: + pmd = (pmd_t *)pte; + pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + if (present) + pmdval |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(pmdval)); + break; + + case PG_LEVEL_4K: + pteval = pte_val(*pte) & ~_PAGE_PRESENT; + if (present) + pteval |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(pteval)); + break; + + default: + pr_err("kmmio: unexpected page level 0x%x.\n", level); + return; + } + + __flush_tlb_one(addr); +} + +/** Mark the given page as not present. Access to it will trigger a fault. */ +static void arm_kmmio_fault_page(unsigned long page, int *page_level) +{ + set_page_present(page & PAGE_MASK, false, page_level); +} + +/** Mark the given page as present. 
*/ +static void disarm_kmmio_fault_page(unsigned long page, int *page_level) +{ + set_page_present(page & PAGE_MASK, true, page_level); +} + +/* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefetching may + * trigger a page fault. We may be in the middle of a process switch. + * We cannot take any locks, not least because we could be executing + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate + * and they remain disabled throughout this function. + */ +int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ + + /* + * Preemption is now disabled to prevent a process switch during + * single stepping. We can only handle one active kmmio trace + * per cpu, so ensure that we finish it before something else + * gets to run. We also hold the RCU read lock over single + * stepping to avoid looking up the probe and kmmio_fault_page + * again. + */ + preempt_disable(); + rcu_read_lock(); + + faultpage = get_kmmio_fault_page(addr); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. The latter case should not be possible. + */ + goto no_kmmio; + } + + ctx = &get_cpu_var(kmmio_ctx); + if (ctx->active) { + disarm_kmmio_fault_page(faultpage->page, NULL); + if (addr == ctx->addr) { + /* + * On SMP we sometimes get recursive probe hits on the + * same address. Context is already saved, fall out. + */ + pr_debug("kmmio: duplicate probe hit on CPU %d, for " + "address 0x%08lx.\n", + smp_processor_id(), addr); + ret = 1; + goto no_kmmio_ctx; + } + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at least + * prevents a panic. + */ + pr_emerg("kmmio: recursive probe hit on CPU %d, " + "for address 0x%08lx. Ignoring.\n", + smp_processor_id(), addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); + goto no_kmmio_ctx; + } + ctx->active++; + + ctx->fpage = faultpage; + ctx->probe = get_kmmio_probe(addr); + ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + ctx->addr = addr; + + if (ctx->probe && ctx->probe->pre_handler) + ctx->probe->pre_handler(ctx->probe, regs, addr); + + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + + /* Now we set the present bit in the PTE and single step. */ + disarm_kmmio_fault_page(ctx->fpage->page, NULL); + + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. + */ + + put_cpu_var(kmmio_ctx); + return 1; /* fault handled */ + +no_kmmio_ctx: + put_cpu_var(kmmio_ctx); +no_kmmio: + rcu_read_unlock(); + preempt_enable_no_resched(); + return ret; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate + * and they remain disabled throughout this function. + * This must always get called as the pair to kmmio_handler().
+ */ +static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) +{ + int ret = 0; + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + + if (!ctx->active) { + pr_debug("kmmio: spurious debug trap on CPU %d.\n", + smp_processor_id()); + goto out; + } + + if (ctx->probe && ctx->probe->post_handler) + ctx->probe->post_handler(ctx->probe, condition, regs); + + arm_kmmio_fault_page(ctx->fpage->page, NULL); + + regs->flags &= ~X86_EFLAGS_TF; + regs->flags |= ctx->saved_flags; + + /* These were acquired in kmmio_handler(). */ + ctx->active--; + BUG_ON(ctx->active); + rcu_read_unlock(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (!(regs->flags & X86_EFLAGS_TF)) + ret = 1; +out: + put_cpu_var(kmmio_ctx); + return ret; +} + +/* You must be holding kmmio_lock. */ +static int add_kmmio_fault_page(unsigned long page) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (f) { + if (!f->count) + arm_kmmio_fault_page(f->page, NULL); + f->count++; + return 0; + } + + f = kmalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -1; + + f->count = 1; + f->page = page; + list_add_rcu(&f->list, kmmio_page_list(f->page)); + + arm_kmmio_fault_page(f->page, NULL); + + return 0; +} + +/* You must be holding kmmio_lock. */ +static void release_kmmio_fault_page(unsigned long page, + struct kmmio_fault_page **release_list) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (!f) + return; + + f->count--; + BUG_ON(f->count < 0); + if (!f->count) { + disarm_kmmio_fault_page(f->page, NULL); + f->release_next = *release_list; + *release_list = f; + } +} + +int register_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + int ret = 0; + unsigned long size = 0; + + spin_lock_irqsave(&kmmio_lock, flags); + if (get_kmmio_probe(p->addr)) { + ret = -EEXIST; + goto out; + } + kmmio_count++; + list_add_rcu(&p->list, &kmmio_probes); + while (size < p->len) { + if (add_kmmio_fault_page(p->addr + size)) + pr_err("kmmio: Unable to set page fault.\n"); + size += PAGE_SIZE; + } +out: + spin_unlock_irqrestore(&kmmio_lock, flags); + /* + * XXX: What should I do here? + * Here was a call to global_flush_tlb(), but it does not exist + * anymore. It seems it's not needed after all. + */ + return ret; +} +EXPORT_SYMBOL(register_kmmio_probe); + +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + while (p) { + struct kmmio_fault_page *next = p->release_next; + BUG_ON(p->count); + kfree(p); + p = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + spin_lock_irqsave(&kmmio_lock, flags); + while (p) { + if (!p->count) + list_del_rcu(&p->list); + else + *prevp = p->release_next; + prevp = &p->release_next; + p = p->release_next; + } + spin_unlock_irqrestore(&kmmio_lock, flags); + /* This is the real RCU destroy call. */ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. 
You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actually free the kmmio_fault_page structs with RCU. + */ +void unregister_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + unsigned long size = 0; + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; + + spin_lock_irqsave(&kmmio_lock, flags); + while (size < p->len) { + release_kmmio_fault_page(p->addr + size, &release_list); + size += PAGE_SIZE; + } + list_del_rcu(&p->list); + kmmio_count--; + spin_unlock_irqrestore(&kmmio_lock, flags); + + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. An RCU grace period sounds like a + * good choice. + * + * If we removed the pages too early, the kmmio page fault handler + * might not find the respective kmmio_fault_page and determine it's + * not a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); +} +EXPORT_SYMBOL(unregister_kmmio_probe); + +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args) +{ + struct die_args *arg = args; + + if (val == DIE_DEBUG && (arg->err & DR_STEP)) + if (post_kmmio_handler(arg->err, arg->regs) == 1) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +static int __init init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); +} +fs_initcall(init_kmmio); /* should be before device_initcall() */ diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c new file mode 100644 index 0000000..8256546 --- /dev/null +++ b/arch/x86/mm/mmio-mod.c @@ -0,0 +1,457 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. + */ +#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ISA_START_ADDRESS */ +#include +#include + +#include "pf_in.h" + +#define NAME "mmiotrace: " + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + unsigned long phys; + unsigned long id; +}; + +/* Accessed per-cpu. */ +static DEFINE_PER_CPU(struct trap_reason, pf_reason); +static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); + +#if 0 /* XXX: no way gather this info anymore */ +/* Access to this is not per-cpu. */ +static DEFINE_PER_CPU(atomic_t, dropped); +#endif + +static struct dentry *marker_file; + +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. + * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that mmio_trace_record is allowed. + * - pre/post callbacks assume the effect of is_enabled() being true. + */ + +/* module parameters */ +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; + +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(ISA_trace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} + +#if 0 /* XXX: needs rewrite */ +/* + * Write callback for the debugfs entry: + * Read a marker and write it to the mmio trace log + */ +static ssize_t write_marker(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *event = NULL; + struct mm_io_header *headp; + ssize_t len = (count > 65535) ? 
65535 : count; + + event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); + if (!event) + return -ENOMEM; + + headp = (struct mm_io_header *)event; + headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); + headp->data_len = len; + + if (copy_from_user(event + sizeof(*headp), buffer, len)) { + kfree(event); + return -EFAULT; + } + + spin_lock_irq(&trace_lock); +#if 0 /* XXX: convert this to use tracing */ + if (is_enabled()) + relay_write(chan, event, sizeof(*headp) + len); + else +#endif + len = -EINVAL; + spin_unlock_irq(&trace_lock); + kfree(event); + return len; +} +#endif + +static void print_pte(unsigned long address) +{ + int level; + pte_t *pte = lookup_address(address, &level); + + if (!pte) { + pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", + __func__, address); + return; + } + + if (level == PG_LEVEL_2M) { + pr_emerg(NAME "4MB pages are not currently supported: " + "0x%08lx\n", address); + BUG(); + } + pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), + pte_val(*pte) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. + */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const struct trap_reason *my_reason = &get_cpu_var(pf_reason); + pr_emerg(NAME "unexpected fault for address: 0x%08lx, " + "last fault for address: 0x%08lx\n", + addr, my_reason->addr); + print_pte(addr); + print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); +#ifdef __i386__ + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + put_cpu_var(pf_reason); + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->user_data; + + /* it doesn't make sense to have more than one active trace per cpu */ + if (my_reason->active_traces) + die_kmmio_nesting_error(regs, addr); + else + my_reason->active_traces++; + + my_reason->type = type; + my_reason->addr = addr; + my_reason->ip = instptr; + + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. + */ + if (trace_pc) + my_trace->pc = instptr; + else + my_trace->pc = 0; + + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? 
+ */ + + switch (type) { + case REG_READ: + my_trace->opcode = MMIO_READ; + my_trace->width = get_ins_mem_width(instptr); + break; + case REG_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + my_trace->opcode = MMIO_UNKNOWN_OP; + my_trace->width = 0; + my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | + *(ip + 2); + } + } + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + + /* this should always return the active_trace count to 0 */ + my_reason->active_traces--; + if (my_reason->active_traces) { + pr_emerg(NAME "unexpected post handler"); + BUG(); + } + + switch (my_reason->type) { + case REG_READ: + my_trace->value = get_ins_reg_val(my_reason->ip, regs); + break; + default: + break; + } + + mmio_trace_rw(my_trace); + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void ioremap_trace_core(unsigned long offset, unsigned long size, + void __iomem *addr) +{ + static atomic_t next_id; + struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + struct mmiotrace_map map = { + .phys = offset, + .virt = (unsigned long)addr, + .len = size, + .opcode = MMIO_PROBE + }; + + if (!trace) { + pr_err(NAME "kmalloc failed in ioremap\n"); + return; + } + + *trace = (struct remap_trace) { + .probe = { + .addr = (unsigned long)addr, + .len = size, + .pre_handler = pre, + .post_handler = post, + .user_data = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) + }; + map.map_id = trace->id; + + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + + mmio_trace_mapping(&map); + list_add_tail(&trace->list, &trace_list); + if (!nommiotrace) + register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); +} + +void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +{ + if (!is_enabled()) /* recheck and proper locking in *_core() */ + return; + + pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) + return; + ioremap_trace_core(offset, size, addr); +} + +static void iounmap_trace_core(volatile void __iomem *addr) +{ + struct mmiotrace_map map = { + .phys = 0, + .virt = (unsigned long)addr, + .len = 0, + .opcode = MMIO_UNPROBE + }; + struct remap_trace *trace; + struct remap_trace *tmp; + struct remap_trace *found_trace = NULL; + + pr_debug(NAME "Unmapping %p.\n", addr); + + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + if ((unsigned long)addr == trace->probe.addr) { + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + found_trace = trace; + break; + } + } + map.map_id = (found_trace) ? 
found_trace->id : -1; + mmio_trace_mapping(&map); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); +} + +static void clear_trace_list(void) +{ + struct remap_trace *trace; + struct remap_trace *tmp; + + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. + */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice(NAME "purging non-iounmapped " + "trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + list_del(&trace->list); + kfree(trace); + } +} + +#if 0 /* XXX: out of order */ +static struct file_operations fops_marker = { + .owner = THIS_MODULE, + .write = write_marker +}; +#endif + +void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + +#if 0 /* XXX: tracing does not support text entries */ + marker_file = debugfs_create_file("marker", 0660, dir, NULL, + &fops_marker); + if (!marker_file) + pr_err(NAME "marker file creation failed.\n"); +#endif + + if (nommiotrace) + pr_info(NAME "MMIO tracing disabled.\n"); + if (ISA_trace) + pr_warning(NAME "Warning! low ISA range will be traced.\n"); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info(NAME "enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + if (marker_file) { + debugfs_remove(marker_file); + marker_file = NULL; + } + + pr_info(NAME "disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c new file mode 100644 index 0000000..efa1911 --- /dev/null +++ b/arch/x86/mm/pf_in.c @@ -0,0 +1,489 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. 
+ * + */ + +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include +#include /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x2E, 0x3E, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B }; +#endif /* not __i386__ */ + +static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, + int *rexr) +{ + int i; + unsigned char *p = addr; + *shorted = 0; + *enlarged = 0; + *rexr = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == prefix_codes[i]) { + if (*p == 0x66) + *shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + *enlarged = 1; + if ((*p & 0xf4) == 0x44) + *rexr = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int shorted, enlarged, rexr; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += 
get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return (shorted ? 2 : (enlarged ? 8 : 4)); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return shorted ? 2 : (enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. + */ +enum { + arg_AL = 0, + arg_CL = 1, + arg_DL = 2, + arg_BL = 3, + arg_AH = 4, + arg_CH = 5, + arg_DH = 6, + arg_BH = 7, + + arg_AX = 0, + arg_CX = 1, + arg_DX = 2, + arg_BX = 3, + arg_SP = 4, + arg_BP = 5, + arg_SI = 6, + arg_DI = 7, +#ifdef __amd64__ + arg_R8 = 8, + arg_R9 = 9, + arg_R10 = 10, + arg_R11 = 11, + arg_R12 = 12, + arg_R13 = 13, + arg_R14 = 14, + arg_R15 = 15 +#endif +}; + +static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +{ + unsigned char *rv = NULL; + + switch (no) { + case arg_AL: + rv = (unsigned char *)®s->ax; + break; + case arg_BL: + rv = (unsigned char *)®s->bx; + break; + case arg_CL: + rv = (unsigned char *)®s->cx; + break; + case arg_DL: + rv = (unsigned char *)®s->dx; + break; + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; +#ifdef __amd64__ + case arg_R8: + rv = (unsigned char *)®s->r8; + break; + case arg_R9: + rv = (unsigned char *)®s->r9; + break; + case arg_R10: + rv = (unsigned char *)®s->r10; + break; + case arg_R11: + rv = (unsigned char *)®s->r11; + break; + case arg_R12: + rv = (unsigned char *)®s->r12; + break; + case arg_R13: + rv = (unsigned char *)®s->r13; + break; + case arg_R14: + rv = (unsigned char *)®s->r14; + break; + case arg_R15: + rv = (unsigned char *)®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + break; + } + return rv; +} + +static unsigned long *get_reg_w32(int no, struct pt_regs *regs) +{ + unsigned long *rv = NULL; + + switch (no) { + case arg_AX: + rv = ®s->ax; + break; + case arg_BX: + rv = ®s->bx; + break; + case arg_CX: + rv = ®s->cx; + break; + case arg_DX: + rv = ®s->dx; + break; + case arg_SP: + rv = ®s->sp; + break; + case arg_BP: + rv = ®s->bp; + break; + case arg_SI: + rv = ®s->si; + break; + case arg_DI: + rv = ®s->di; + break; +#ifdef __amd64__ + case arg_R8: + rv = ®s->r8; + break; + case arg_R9: + rv = ®s->r9; + break; + case arg_R10: + rv = ®s->r10; + break; + case arg_R11: + rv = ®s->r11; + break; + case arg_R12: + rv = ®s->r12; + break; + case arg_R13: + rv = ®s->r13; + break; + case arg_R14: + rv = ®s->r14; + break; + case arg_R15: + rv = ®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + } + + return rv; +} + +unsigned long 
get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) +{ + unsigned int opcode; + unsigned char mod_rm; + int reg; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(reg_rop); i++) + if (reg_rop[i] == opcode) { + rv = REG_READ; + goto do_work; + } + + for (i = 0; i < ARRAY_SIZE(reg_wop); i++) + if (reg_wop[i] == opcode) { + rv = REG_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *get_reg_w8(reg, regs); + + case 2: + return *(unsigned short *)get_reg_w32(reg, regs); + + case 4: + return *(unsigned int *)get_reg_w32(reg, regs); + +#ifdef __amd64__ + case 8: + return *(unsigned long *)get_reg_w32(reg, regs); +#endif + + default: + printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); + } + +err: + return 0; +} + +unsigned long get_ins_imm_val(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char mod_rm; + unsigned char mod; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(imm_wop); i++) + if (imm_wop[i] == opcode) { + rv = IMM_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + mod = mod_rm >> 6; + p++; + switch (mod) { + case 0: + /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ + /* AMD64: XXX Check for address size prefix? */ + if ((mod_rm & 0x7) == 0x5) + p += 4; + break; + + case 1: + p += 1; + break; + + case 2: + p += 4; + break; + + case 3: + default: + printk(KERN_ERR "mmiotrace: not a memory access instruction " + "at 0x%lx, rm_mod=0x%02x\n", + ins_addr, mod_rm); + } + + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *(unsigned char *)p; + + case 2: + return *(unsigned short *)p; + + case 4: + return *(unsigned int *)p; + +#ifdef __amd64__ + case 8: + return *(unsigned long *)p; +#endif + + default: + printk(KERN_ERR "mmiotrace: Error: width.\n"); + } + +err: + return 0; +} diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h new file mode 100644 index 0000000..e05341a --- /dev/null +++ b/arch/x86/mm/pf_in.h @@ -0,0 +1,39 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. 
+ * + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c new file mode 100644 index 0000000..cfa60b2 --- /dev/null +++ b/arch/x86/mm/testmmiotrace.c @@ -0,0 +1,71 @@ +/* + * Written by Pekka Paalanen, 2008 + */ +#include +#include + +#define MODULE_NAME "testmmiotrace" + +static unsigned long mmio_address; +module_param(mmio_address, ulong, 0); +MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); + +static void do_write_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(i * 12 + 7, p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(i * 212371 + 13, p + i); +} + +static void do_read_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + ioread8(p + i); + for (i = 1024; i < (5 * 1024); i += 2) + ioread16(p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + ioread32(p + i); +} + +static void do_test(void) +{ + void __iomem *p = ioremap_nocache(mmio_address, 0x4000); + if (!p) { + pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); + return; + } + do_write_test(p); + do_read_test(p); + iounmap(p); +} + +static int __init init(void) +{ + if (mmio_address == 0) { + pr_err(MODULE_NAME ": you have to use the module argument " + "mmio_address.\n"); + pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + "in PCI address space, and writing " + "rubbish in there.\n", mmio_address); + do_test(); + return 0; +} + +static void __exit cleanup(void) +{ + pr_debug(MODULE_NAME ": unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); -- cgit v0.10.2 From e4b37ee68609037ffcaa2fcfae47cd31a605bb9e Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:59 +0200 Subject: x86 mmiotrace: remove ISA_trace parameter. This had become a no-op. 
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 8256546..ab2bb77 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -81,17 +81,14 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ /* module parameters */ static unsigned long filter_offset; static int nommiotrace; -static int ISA_trace; static int trace_pc; module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); -module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); -MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); static bool is_enabled(void) @@ -424,8 +421,6 @@ void enable_mmiotrace(void) if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); - if (ISA_trace) - pr_warning(NAME "Warning! low ISA range will be traced.\n"); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); spin_unlock_irq(&trace_lock); -- cgit v0.10.2 From 736ca61fa81874b3fee205a593251b1869d0bcf1 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:59 +0200 Subject: x86 mmiotrace: Do not print bogus pid A non-zero pid indicates the MMIO access originated in user space. We do not catch such accesses yet, so always print zero for now. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 361472b..79be4a1 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -117,20 +117,20 @@ static int mmio_print_rw(struct trace_iterator *iter) ret = trace_seq_printf(s, "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, rw->phys, - rw->value, rw->pc, entry->pid); + rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, rw->phys, - rw->value, rw->pc, entry->pid); + rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", secs, usec_rem, rw->map_id, rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, - (rw->value >> 0) & 0xff, rw->pc, entry->pid); + (rw->value >> 0) & 0xff, rw->pc, 0); break; default: ret = trace_seq_printf(s, "rw what?\n"); -- cgit v0.10.2 From c6c67c1afcce71335b18ed8769b1165c468bfb03 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:59 +0200 Subject: mmiotrace: add user documentation Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt new file mode 100644 index 0000000..84246f7 --- /dev/null +++ b/Documentation/tracers/mmiotrace.txt @@ -0,0 +1,153 @@ + In-kernel memory-mapped I/O tracing + + +Home page and links to optional user space tools: + + http://nouveau.freedesktop.org/wiki/MmioTrace + +MMIO tracing was originally developed by Intel around 2003 for their Fault +Injection Test Harness. In Dec 2006 - Jan 2007, using the code from Intel, +Jeff Muizelaar created a tool for tracing MMIO accesses with the Nouveau +project in mind. Since then many people have contributed.
+ +Mmiotrace was built for reverse engineering any memory-mapped IO device with +the Nouveau project as the first real user. Only the x86 and x86_64 +architectures are supported. + +The out-of-tree mmiotrace was originally modified for mainline inclusion and +the ftrace framework by Pekka Paalanen . + + +Preparation +----------- + +The mmiotrace feature is compiled in via the CONFIG_MMIOTRACE option. Tracing +is disabled by default, so it is safe to have this set to yes. SMP systems are +supported, but tracing is unreliable and may miss events if more than one CPU +is on-line, therefore mmiotrace takes all but one CPU off-line during run-time +activation [not implemented]. + + +Usage Quick Reference +--------------------- + +$ mount -t debugfs debugfs /debug +$ echo mmiotrace > /debug/tracing/current_tracer +$ cat /debug/tracing/trace_pipe > mydump.txt & +Start X or whatever. +$ echo "X is up" > /debug/tracing/marker +$ echo none > /debug/tracing/current_tracer +Check kernel log. + + +Usage +----- + +Make sure debugfs is mounted to /debug. If not, mount it (requires root privileges): +$ mount -t debugfs debugfs /debug + +Check that the driver you are about to trace is not loaded. + +Activate mmiotrace (requires root privileges): +$ echo mmiotrace > /debug/tracing/current_tracer + +Start storing the trace: +$ cat /debug/tracing/trace_pipe > mydump.txt & +The 'cat' process should stay running (sleeping) in the background. + +Load the driver you want to trace and use it. Mmiotrace will only catch MMIO +accesses to areas that are ioremapped while mmiotrace is active. + +[Unimplemented feature:] +During tracing you can place comments (markers) into the trace by +$ echo "X is up" > /debug/tracing/marker +This makes it easier to see which part of the (huge) trace corresponds to +which action. It is recommended to place descriptive markers about what you +do. + +Shut down mmiotrace (requires root privileges): +$ echo none > /debug/tracing/current_tracer +The 'cat' process exits. If it does not, kill it by bringing it to the +foreground with 'fg' and pressing ctrl+c. + +[This feature is not implemented yet!] +Check your kernel log. If there are messages about mmiotrace losing events, +this is due to buffer overrun, and the trace is incomplete. You should enlarge +the buffers and try again. [How?] + +If you are doing a trace for a driver project, e.g. Nouveau, you should also +do the following before sending your results: +$ lspci -vvv > lspci.txt +$ dmesg > dmesg.txt +$ tar zcf pciid-nick-mmiotrace.tar.gz mydump.txt lspci.txt dmesg.txt +and then send the .tar.gz file. The trace compresses considerably. Replace +"pciid" and "nick" with the PCI ID or model name of your piece of hardware +under investigation and your nickname. + + +How Mmiotrace Works +------------------- + +Access to hardware IO-memory is gained by mapping addresses from the PCI bus +by calling one of the ioremap_*() functions. Mmiotrace is hooked into the +__ioremap() function and gets called whenever a mapping is created. The +mapping is an event that is recorded into the trace log. Note that ISA range +mappings are not caught, since the mapping always exists and is returned +directly. + +MMIO accesses are recorded via page faults. Just before __ioremap() returns, +the mapped pages are marked as not present. Any access to the pages causes a +fault. The page fault handler calls mmiotrace to handle the fault. Mmiotrace +marks the page present, sets the TF flag to achieve single stepping and exits +the fault handler. The instruction that faulted is executed and the debug +trap is entered.
Here mmiotrace again marks the page as not present. The instruction +is decoded to get the type of operation (read/write), data width and the value +read or written. These are stored to the trace log. + +Setting the page present in the page fault handler has a race condition on SMP +machines. During single stepping, other CPUs may run freely on that page +and events can be missed without notice. Re-enabling other CPUs during +tracing is discouraged. + + +Trace Log Format +---------------- + +The raw log is text and easily filtered with e.g. grep and awk. One record is +one line in the log. A record starts with a keyword, followed by keyword +dependent arguments. Arguments are separated by a space, or continue until the +end of the line. The format for version 20070824 is as follows: + +Explanation Keyword Space separated arguments +--------------------------------------------------------------------------- + +read event R width, timestamp, map id, physical, value, PC, PID +write event W width, timestamp, map id, physical, value, PC, PID +ioremap event MAP timestamp, map id, physical, virtual, length, PC, PID +iounmap event UNMAP timestamp, map id, PC, PID +marker MARK timestamp, text +version VERSION the string "20070824" +info for reader LSPCI one line from lspci -v +PCI address map PCIDEV space separated /proc/bus/pci/devices data +unk. opcode UNKNOWN timestamp, map id, physical, data, PC, PID + +Timestamp is in seconds with decimals. Physical is a PCI bus address, virtual +is a kernel virtual address. Width is the data width in bytes and value is the +data value. Map id is an arbitrary id number identifying the mapping that was +used in an operation. PC is the program counter and PID is the process id. PC +is zero if it is not recorded. PID is always zero as tracing MMIO accesses +originating in user space memory is not yet supported. + +For instance, the following awk filter will pass all 32-bit writes that target +physical addresses in the range [0xfb73ce40, 0xfb800000) + +$ awk '/W 4 / { adr=strtonum($5); if (adr >= 0xfb73ce40 && +adr < 0xfb800000) print; }' + + +Tools for Developers +-------------------- + +The user space tools include utilities for: +- replacing numeric addresses and values with hardware register names +- replaying MMIO logs, i.e., re-executing the recorded writes + + diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 1d6de0d..b28ace2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -180,12 +180,10 @@ config MMIOTRACE help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap - implementation and works via page faults. A user space program is - required to collect the MMIO data from debugfs files. - Tracing is disabled by default and can be enabled from a debugfs - file. + implementation and works via page faults. Tracing is disabled by + default and can be enabled run-time. - See http://nouveau.freedesktop.org/wiki/MmioTrace + See Documentation/tracers/mmiotrace.txt. If you are not helping to develop drivers, say N.
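As a concrete illustration of the options above, a development kernel built for tracing might carry a .config fragment along these lines (a sketch: MMIOTRACE selects MMIOTRACE_HOOKS automatically, and the test driver is assumed here to be buildable as a module):

CONFIG_MMIOTRACE_HOOKS=y
CONFIG_MMIOTRACE=y
CONFIG_MMIOTRACE_TEST=m
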
config MMIOTRACE_TEST -- cgit v0.10.2 From 0663bb6cd9a457fbd8ca95c627bb762d07321a39 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 12 May 2008 21:20:59 +0200 Subject: mmiotrace: fix printk format Fix gcc printk format warnings: next-20080415/arch/x86/mm/mmio-mod.c: In function 'print_pte': next-20080415/arch/x86/mm/mmio-mod.c:154: warning: format '%lx' expects type 'long unsigned int', but argument 3 has type 'pteval_t' next-20080415/arch/x86/mm/mmio-mod.c:154: warning: format '%lx' expects type 'long unsigned int', but argument 4 has type 'pteval_t' next-20080415/arch/x86/mm/mmio-mod.c: At top level: next-20080415/arch/x86/mm/mmio-mod.c:403: warning: 'downed_cpus' defined but not used Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index ab2bb77..6d6cac8 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -150,8 +150,9 @@ static void print_pte(unsigned long address) "0x%08lx\n", address); BUG(); } - pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), - pte_val(*pte) & _PAGE_PRESENT); + pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, + (unsigned long long)pte_val(*pte), + (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); } /* -- cgit v0.10.2 From 37b3619257d3190f47f233d7ed626d4b9916462c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 12 May 2008 21:20:59 +0200 Subject: x86/mmiotrace: uses/depends on PCI Don't try to build mmiotrace when CONFIG_PCI=n. next-20080416/kernel/trace/trace_mmiotrace.c: In function 'mmio_print_pcidev': next-20080416/kernel/trace/trace_mmiotrace.c:62: error: implicit declaration of function 'pci_dev_driver' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index b28ace2..1e53df0 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -173,7 +173,7 @@ config MMIOTRACE_HOOKS config MMIOTRACE bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && PCI select TRACING select MMIOTRACE_HOOKS default y @@ -181,7 +181,7 @@ config MMIOTRACE Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap implementation and works via page faults. Tracing is disabled by - default and can be enabled run-time. + default and can be enabled at run-time. See Documentation/tracers/mmiotrace.txt. If you are not helping to develop drivers, say N. -- cgit v0.10.2 From 7423d1115f18627666d475fccc7c62394406ff8c Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: x86 mmiotrace: dynamically disable non-boot CPUs From 8979ee55cb6a429c4edd72ebec2244b849f6a79a Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 12 Apr 2008 00:18:57 +0300 Mmiotrace is not reliable with multiple CPUs and may miss events. Drop to single CPU when mmiotrace is activated. 
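The automated behaviour is roughly what an administrator could already do by hand with CPU hotplug before enabling the tracer; a sketch, using the debugfs mount point assumed elsewhere in this series and with cpu1 standing in for every non-boot CPU:

echo 0 > /sys/devices/system/cpu/cpu1/online
echo mmiotrace > /debug/tracing/current_tracer

The patch performs the equivalent via cpu_down() and cpu_up() when CONFIG_HOTPLUG_CPU is available; without CPU hotplug it can only warn and suggest booting with the maxcpus=1 kernel argument.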
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 6d6cac8..1f77d85 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -32,6 +32,7 @@ #include /* for ISA_START_ADDRESS */ #include #include +#include #include "pf_in.h" @@ -400,6 +401,64 @@ static void clear_trace_list(void) } } +#ifdef CONFIG_HOTPLUG_CPU +static cpumask_t downed_cpus; + +static void enter_uniprocessor(void) +{ + int cpu; + int err; + + get_online_cpus(); + downed_cpus = cpu_online_map; + cpu_clear(first_cpu(cpu_online_map), downed_cpus); + if (num_online_cpus() > 1) + pr_notice(NAME "Disabling non-boot CPUs...\n"); + put_online_cpus(); + + for_each_cpu_mask(cpu, downed_cpus) { + err = cpu_down(cpu); + if (!err) { + pr_info(NAME "CPU%d is down.\n", cpu); + } else { + pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); + } + } + if (num_online_cpus() > 1) + pr_warning(NAME "multiple CPUs still online, " + "may miss events.\n"); +} + +static void leave_uniprocessor(void) +{ + int cpu; + int err; + + if (cpus_weight(downed_cpus) == 0) + return; + pr_notice(NAME "Re-enabling CPUs...\n"); + for_each_cpu_mask(cpu, downed_cpus) { + err = cpu_up(cpu); + if (!err) + pr_info(NAME "enabled CPU%d.\n", cpu); + else + pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); + } +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static void enter_uniprocessor(void) +{ + if (num_online_cpus() > 1) + pr_warning(NAME "multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); +} + +static void leave_uniprocessor(void) +{ +} +#endif + #if 0 /* XXX: out of order */ static struct file_operations fops_marker = { .owner = THIS_MODULE, @@ -422,6 +481,7 @@ void enable_mmiotrace(void) if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); + enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); spin_unlock_irq(&trace_lock); @@ -442,6 +502,7 @@ void disable_mmiotrace(void) spin_unlock_irq(&trace_lock); clear_trace_list(); /* guarantees: no more kmmio callbacks */ + leave_uniprocessor(); if (marker_file) { debugfs_remove(marker_file); marker_file = NULL; -- cgit v0.10.2 From d0a7e8ca5b996d36219e6fc002907291c8ee677b Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: mmiotrace: print header using the read hook. Now the header is printed only for `trace_pipe' file. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 79be4a1..6d2edbd 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -12,6 +12,10 @@ #include "trace.h" +struct header_iter { + struct pci_dev *dev; +}; + static struct trace_array *mmio_trace_array; static void mmio_reset_data(struct trace_array *tr) @@ -89,17 +93,57 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) return ret; } -/* XXX: This is not called for trace_pipe file! */ -static void mmio_print_header(struct trace_iterator *iter) +static void destroy_header_iter(struct header_iter *hiter) +{ + if (!hiter) + return; + pci_dev_put(hiter->dev); + kfree(hiter); +} + +static void mmio_pipe_open(struct trace_iterator *iter) { + struct header_iter *hiter; struct trace_seq *s = &iter->seq; - struct pci_dev *dev = NULL; trace_seq_printf(s, "VERSION 20070824\n"); - for_each_pci_dev(dev) - mmio_print_pcidev(s, dev); - /* XXX: return value? What if header is very long? 
*/ + hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); + if (!hiter) + return; + + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); + iter->private = hiter; +} + +/* XXX: This is not called when the pipe is closed! */ +static void mmio_close(struct trace_iterator *iter) +{ + struct header_iter *hiter = iter->private; + destroy_header_iter(hiter); + iter->private = NULL; +} + +static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, + char __user *ubuf, size_t cnt, loff_t *ppos) +{ + ssize_t ret; + struct header_iter *hiter = iter->private; + struct trace_seq *s = &iter->seq; + + if (!hiter) + return 0; + + mmio_print_pcidev(s, hiter->dev); + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); + + if (!hiter->dev) { + destroy_header_iter(hiter); + iter->private = NULL; + } + + ret = trace_seq_to_user(s, ubuf, cnt); + return (ret == -EBUSY) ? 0 : ret; } static int mmio_print_rw(struct trace_iterator *iter) @@ -190,7 +234,9 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, - .open = mmio_print_header, + .pipe_open = mmio_pipe_open, + .close = mmio_close, + .read = mmio_read, .ctrl_update = mmio_trace_ctrl_update, .print_line = mmio_print_line, }; -- cgit v0.10.2 From 2039238b79b51a50f8477f53f33750e1c3fc146a Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: mmiotrace: print overrun counts Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 6d2edbd..d0f649a 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -17,11 +17,13 @@ struct header_iter { }; static struct trace_array *mmio_trace_array; +static bool overrun_detected; static void mmio_reset_data(struct trace_array *tr) { int cpu; + overrun_detected = false; tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) @@ -124,12 +126,34 @@ static void mmio_close(struct trace_iterator *iter) iter->private = NULL; } +static unsigned long count_overruns(struct trace_iterator *iter) +{ + int cpu; + unsigned long cnt = 0; + for_each_online_cpu(cpu) { + cnt += iter->overrun[cpu]; + iter->overrun[cpu] = 0; + } + return cnt; +} + static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { ssize_t ret; struct header_iter *hiter = iter->private; struct trace_seq *s = &iter->seq; + unsigned long n; + + n = count_overruns(iter); + if (n) { + /* XXX: This is later than where events were lost. */ + trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); + if (!overrun_detected) + pr_warning("mmiotrace has lost events.\n"); + overrun_detected = true; + goto print_out; + } if (!hiter) return 0; @@ -142,6 +166,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, iter->private = NULL; } +print_out: ret = trace_seq_to_user(s, ubuf, cnt); return (ret == -EBUSY) ? 
0 : ret; } -- cgit v0.10.2 From e0fd5c2fa188311667267c02a702ae699a9fc2bd Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: mmiotrace: do not print bogus pid for maps either Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index d0f649a..3c1dacd 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -225,12 +225,12 @@ static int mmio_print_map(struct trace_iterator *iter) ret = trace_seq_printf(s, "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", secs, usec_rem, m->map_id, m->phys, m->virt, m->len, - 0UL, entry->pid); + 0UL, 0); break; case MMIO_UNPROBE: ret = trace_seq_printf(s, "UNMAP %lu.%06lu %d 0x%lx %d\n", - secs, usec_rem, m->map_id, 0UL, entry->pid); + secs, usec_rem, m->map_id, 0UL, 0); break; default: ret = trace_seq_printf(s, "map what?\n"); -- cgit v0.10.2 From 6f6f394d9ca61fcc73ecfd2f2bf58e92dc1ab269 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: doc: update mmiotrace doc to current status Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt index 84246f7..a4afb56 100644 --- a/Documentation/tracers/mmiotrace.txt +++ b/Documentation/tracers/mmiotrace.txt @@ -25,7 +25,8 @@ Mmiotrace feature is compiled in by the CONFIG_MMIOTRACE option. Tracing is disabled by default, so it is safe to have this set to yes. SMP systems are supported, but tracing is unreliable and may miss events if more than one CPU is on-line; therefore mmiotrace takes all but one CPU off-line during run-time -activation [not implemented]. +activation. You can re-enable CPUs by hand, but be warned: there +is no way to automatically detect if you are losing events due to CPUs racing. Usage Quick Reference @@ -37,7 +38,7 @@ $ cat /debug/tracing/trace_pipe > mydump.txt & Start X or whatever. $ echo "X is up" > /debug/tracing/marker $ echo none > /debug/tracing/current_tracer -Check kernel log. +Check for lost events. Usage @@ -67,12 +68,22 @@ do. Shut down mmiotrace (requires root privileges): $ echo none > /debug/tracing/current_tracer -The 'cat' process exits. If it does not, kill it by 'fg' and pressing ctrl+c. - -[This feature is not implemented yet!] -Check your kernel log. If there are messages about mmiotrace losing events, -this is due to buffer overrun, and the trace is incomplete. You should enlarge -the buffers and try again. [How?] +The 'cat' process exits. If it does not, kill it by issuing the 'fg' command and +pressing ctrl+c. + +Check that mmiotrace did not lose events due to a buffer filling up. Either +$ grep -i lost mydump.txt +which tells you exactly how many events were lost, or use +$ dmesg +to view your kernel log and look for the "mmiotrace has lost events" warning. If +events were lost, the trace is incomplete. You should enlarge the buffers and +try again. Buffers are enlarged by first seeing how large the current buffers +are: +$ cat /debug/tracing/trace_entries +gives you a number. Approximately double this number and write it back, for +instance: +$ echo 128000 > /debug/tracing/trace_entries +Then start again from the top. If you are doing a trace for a driver project, e.g.
Nouveau, you should also do the following before sending your results: -- cgit v0.10.2 From 970e6fa03885f32cc43e42cb08c73a5f54cd8bd9 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: mmiotrace: code style cleanups From c2da03771e29159627c5c7b9509ec70bce9f91ee Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 28 Apr 2008 21:25:22 +0300 Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 3ad27b8..6a92d91 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -17,10 +17,10 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 1f77d85..a8d2a00 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -418,11 +418,10 @@ static void enter_uniprocessor(void) for_each_cpu_mask(cpu, downed_cpus) { err = cpu_down(cpu); - if (!err) { + if (!err) pr_info(NAME "CPU%d is down.\n", cpu); - } else { + else pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); - } } if (num_online_cpus() > 1) pr_warning(NAME "multiple CPUs still online, " diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index cfa60b2..d877c5b 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -2,7 +2,7 @@ * Written by Pekka Paalanen, 2008 */ #include -#include +#include #define MODULE_NAME "testmmiotrace" diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index dd6b64b..de8e912 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -1,9 +1,7 @@ #ifndef MMIOTRACE_H #define MMIOTRACE_H -#include - -#ifdef __KERNEL__ +#include #include @@ -84,6 +82,4 @@ extern void disable_mmiotrace(void); extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); -#endif /* __KERNEL__ */ - #endif /* MMIOTRACE_H */ -- cgit v0.10.2 From 87e547fe41a8b57d6d80afc67a0031fbe477eb0d Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: x86 mmiotrace: fix page-unaligned ioremaps mmiotrace_ioremap() expects to receive the original unaligned map phys address and size. Also fix {un,}register_kmmio_probe() to deal properly with unaligned size. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8927c87..a7c80a6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -123,6 +123,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, { unsigned long pfn, offset, vaddr; resource_size_t last_addr; + const resource_size_t unaligned_phys_addr = phys_addr; + const unsigned long unaligned_size = size; struct vm_struct *area; unsigned long new_prot_val; pgprot_t prot; @@ -236,7 +238,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, } ret_addr = (void __iomem *) (vaddr + offset); - mmiotrace_ioremap(phys_addr, size, ret_addr); + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); return ret_addr; } diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 6a92d91..93b1797 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -351,11 +351,19 @@ static void release_kmmio_fault_page(unsigned long page, } } +/* + * With page-unaligned ioremaps, one or two armed pages may contain + * addresses from outside the intended mapping. 
Events for these addresses + * are currently silently dropped. The events may result only from programming + * mistakes by accessing addresses before the beginning or past the end of a + * mapping. + */ int register_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; int ret = 0; unsigned long size = 0; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { @@ -364,7 +372,7 @@ int register_kmmio_probe(struct kmmio_probe *p) } kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); - while (size < p->len) { + while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) pr_err("kmmio: Unable to set page fault.\n"); size += PAGE_SIZE; @@ -436,11 +444,12 @@ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; unsigned long size = 0; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; spin_lock_irqsave(&kmmio_lock, flags); - while (size < p->len) { + while (size < size_lim) { release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index a8d2a00..278998c 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -280,6 +280,7 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, { static atomic_t next_id; struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + /* These are page-unaligned. */ struct mmiotrace_map map = { .phys = offset, .virt = (unsigned long)addr, -- cgit v0.10.2 From dee310d0adf41019aca476052ac3085ff286d9be Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: x86 mmiotrace: use resource_size_t for phys addresses Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 278998c..3b04a01 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -48,7 +48,7 @@ struct trap_reason { struct remap_trace { struct list_head list; struct kmmio_probe probe; - unsigned long phys; + resource_size_t phys; unsigned long id; }; @@ -275,7 +275,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, put_cpu_var(pf_reason); } -static void ioremap_trace_core(unsigned long offset, unsigned long size, +static void ioremap_trace_core(resource_size_t offset, unsigned long size, void __iomem *addr) { static atomic_t next_id; @@ -319,13 +319,14 @@ not_enabled: spin_unlock_irq(&trace_lock); } -void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr) { if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); if ((filter_offset) && (offset != filter_offset)) return; ioremap_trace_core(offset, size, addr); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index de8e912..5cbbc37 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -2,7 +2,6 @@ #define MMIOTRACE_H #include - #include struct kmmio_probe; @@ -37,14 +36,15 @@ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); /* Called from ioremap.c */ #ifdef CONFIG_MMIOTRACE -extern void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void 
__iomem *addr); +extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr); extern void mmiotrace_iounmap(volatile void __iomem *addr); #else -static inline void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +static inline void mmiotrace_ioremap(resource_size_t offset, + unsigned long size, void __iomem *addr) { } + static inline void mmiotrace_iounmap(volatile void __iomem *addr) { } @@ -60,7 +60,7 @@ enum mm_io_opcode { }; struct mmiotrace_rw { - unsigned long phys; /* PCI address of register */ + resource_size_t phys; /* PCI address of register */ unsigned long value; unsigned long pc; /* optional program counter */ int map_id; @@ -69,7 +69,7 @@ struct mmiotrace_rw { }; struct mmiotrace_map { - unsigned long phys; /* base address in PCI space */ + resource_size_t phys; /* base address in PCI space */ unsigned long virt; /* base virtual address */ unsigned long len; /* mapping size */ int map_id; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3c1dacd..b13dc19 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -184,20 +184,23 @@ static int mmio_print_rw(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_READ: ret = trace_seq_printf(s, - "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, - "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, - "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", - secs, usec_rem, rw->map_id, rw->phys, + "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, (rw->value >> 0) & 0xff, rw->pc, 0); break; @@ -223,8 +226,9 @@ static int mmio_print_map(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, - "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", - secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: -- cgit v0.10.2 From a50445d76c22a34ae149704ea5adaef171c8acb7 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: mmiotrace: rename kmmio_probe::user_data to :private. 
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3b04a01..ed0e0e9 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -191,7 +191,7 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); - struct remap_trace *trace = p->user_data; + struct remap_trace *trace = p->private; /* it doesn't make sense to have more than one active trace per cpu */ if (my_reason->active_traces) @@ -299,7 +299,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, .len = size, .pre_handler = pre, .post_handler = post, - .user_data = trace + .private = trace }, .phys = offset, .id = atomic_inc_return(&next_id) diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 5cbbc37..61d19e1 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -18,7 +18,7 @@ struct kmmio_probe { unsigned long len; /* length of the probe region */ kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ kmmio_post_handler_t post_handler; /* Called after addr is executed */ - void *user_data; + void *private; }; /* kmmio is active by some kmmio_probes? */ -- cgit v0.10.2 From 790e2a290b499b0400254e6870ec27969065d122 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:14 +0200 Subject: x86 mmiotrace: page level is unsigned Fixes some sparse warnings. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 93b1797..b65871e 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -104,11 +104,12 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_page_present(unsigned long addr, bool present, int *pglevel) +static void set_page_present(unsigned long addr, bool present, + unsigned int *pglevel) { pteval_t pteval; pmdval_t pmdval; - int level; + unsigned int level; pmd_t *pmd; pte_t *pte = lookup_address(addr, &level); @@ -145,15 +146,15 @@ static void set_page_present(unsigned long addr, bool present, int *pglevel) } /** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, int *page_level) +static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - set_page_present(page & PAGE_MASK, false, page_level); + set_page_present(page & PAGE_MASK, false, pglevel); } /** Mark the given page as present. 
*/ -static void disarm_kmmio_fault_page(unsigned long page, int *page_level) +static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - set_page_present(page & PAGE_MASK, true, page_level); + set_page_present(page & PAGE_MASK, true, pglevel); } /* diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index ed0e0e9..e7397e1 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -137,7 +137,7 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, static void print_pte(unsigned long address) { - int level; + unsigned int level; pte_t *pte = lookup_address(address, &level); if (!pte) { -- cgit v0.10.2 From 4d2df795f0c3eb91f97a666f47716121a2f166ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 15:00:46 +0200 Subject: sysprof: make it depend on X86 Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e101c9a..263e9e6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,7 +77,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" - depends on DEBUG_KERNEL + depends on X86 select TRACING help This tracer provides the trace needed by the 'Sysprof' userspace -- cgit v0.10.2 From 668a6c3654560aef8741642478973e205a4f02bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 19 May 2008 13:35:24 +0200 Subject: - fix mmiotrace + rcu merge interaction Signed-off-by: Thomas Gleixner diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index b65871e..93d8203 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include -- cgit v0.10.2 From c6531cce6e6e4b99bcda46b6268d6f2d9e30aea4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:14 +0200 Subject: sched: do not trace sched_clock The tracer uses sched_clock, so do not trace it. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner diff --git a/kernel/sched.c b/kernel/sched.c index e2e985e..6590a82 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -884,12 +884,12 @@ static unsigned long long __cpu_clock(int cpu) * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): */ -unsigned long long cpu_clock(int cpu) +unsigned long long notrace cpu_clock(int cpu) { unsigned long long prev_cpu_time, time, delta_time; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prev_cpu_time = per_cpu(prev_cpu_time, cpu); time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); delta_time = time-prev_cpu_time; @@ -898,7 +898,7 @@ unsigned long long cpu_clock(int cpu) time = __sync_cpu_clock(time, cpu); per_cpu(prev_cpu_time, cpu) = time; } - local_irq_restore(flags); + raw_local_irq_restore(flags); return time; } -- cgit v0.10.2 From 19384c0314342222b18d4c7f09cdce1ca74dfd2a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 00:22:16 -0400 Subject: ftrace: limit use of check pages The check_pages function is called often enough that it can cause problems with trace outputs or even bring the system to a halt. This patch limits check_pages to the places that are most likely to have problems. The check is made at the flip between the global array and the max save array, as well as when the size of the buffers changes, and in the self tests. This patch also removes the BUG_ON from check_pages and replaces it with a WARN_ON and disabling of the tracer.
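(The list invariant that check_pages verifies can be stated as a tiny stand-alone helper; the name below is hypothetical, for illustration only:

static inline bool list_node_consistent(const struct list_head *n)
{
	/* both neighbours of a healthy node point back at it */
	return n->next->prev == n && n->prev->next == n;
}

The patch applies this check to the buffer list head and to every trace page, and on failure warns once and disables tracing instead of taking the machine down with BUG_ON.)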
Signed-off-by: Steven Rostedt Cc: pq@iki.fi Cc: proski@gnu.org Cc: sandmann@redhat.com Cc: a.p.zijlstra@chello.nl Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3271916..0567f51 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -249,24 +249,32 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } +#define CHECK_COND(cond) \ + if (unlikely(cond)) { \ + tracing_disabled = 1; \ + WARN_ON(1); \ + return -1; \ + } + /** * check_pages - integrity check of trace buffers * * As a safety measure we check to make sure the data pages have not - * been corrupted. TODO: configure to disable this because it adds - * a bit of overhead. + * been corrupted. */ -void check_pages(struct trace_array_cpu *data) +int check_pages(struct trace_array_cpu *data) { struct page *page, *tmp; - BUG_ON(data->trace_pages.next->prev != &data->trace_pages); - BUG_ON(data->trace_pages.prev->next != &data->trace_pages); + CHECK_COND(data->trace_pages.next->prev != &data->trace_pages); + CHECK_COND(data->trace_pages.prev->next != &data->trace_pages); list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - BUG_ON(page->lru.next->prev != &page->lru); - BUG_ON(page->lru.prev->next != &page->lru); + CHECK_COND(page->lru.next->prev != &page->lru); + CHECK_COND(page->lru.prev->next != &page->lru); } + + return 0; } /** @@ -280,7 +288,6 @@ void *head_page(struct trace_array_cpu *data) { struct page *page; - check_pages(data); if (list_empty(&data->trace_pages)) return NULL; @@ -2566,7 +2573,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int ret; + int i, ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2635,8 +2642,15 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, trace_free_page(); } + /* check integrity */ + for_each_tracing_cpu(i) + check_pages(global_trace.data[i]); + filp->f_pos += cnt; + /* If check pages failed, return ENOMEM */ + if (tracing_disabled) + cnt = -ENOMEM; out: max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3877dd9..18c5423 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -28,6 +28,7 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) page = list_entry(data->trace_pages.next, struct page, lru); entries = page_address(page); + check_pages(data); if (head_page(data) != entries) goto failed; -- cgit v0.10.2 From 4902f8849da6d2805bd291551a6dfd48f1b4f604 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 00:22:18 -0400 Subject: ftrace: move ftrace_special to trace.c Move ftrace_special out of sched_switch to trace.c.
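(For context, ftrace_special() records three arbitrary unsigned long values into the trace buffer, which makes it handy for ad-hoc instrumentation. A hypothetical call site, purely for illustration:

	/* tag a point of interest with the source line and a value */
	ftrace_special(__LINE__, jiffies, 0);

The values appear in the trace output alongside the regular events; moving the function into trace.c lets it record into the global buffer regardless of which tracer is current.)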
Signed-off-by: Steven Rostedt Cc: pq@iki.fi Cc: proski@gnu.org Cc: sandmann@redhat.com Cc: a.p.zijlstra@chello.nl Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0567f51..583fe24 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -941,6 +941,30 @@ tracing_sched_wakeup_trace(struct trace_array *tr, trace_wake_up(); } +void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __trace_special(tr, data, arg1, arg2, arg3); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + #ifdef CONFIG_FTRACE static void function_trace_call(unsigned long ip, unsigned long parent_ip) @@ -2941,8 +2965,6 @@ __init static int tracer_alloc_buffers(void) int ret = -ENOMEM; int i; - global_trace.ctrl = tracer_enabled; - /* TODO: make the number of buffers hot pluggable with CPUS */ tracing_nr_buffers = num_possible_cpus(); tracing_buffer_mask = cpu_possible_map; @@ -3012,6 +3034,7 @@ __init static int tracer_alloc_buffers(void) current_trace = &no_tracer; /* All seems OK, enable tracing */ + global_trace.ctrl = tracer_enabled; tracing_disabled = 0; return 0; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index d25ffa5..798ec0d 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -125,30 +125,6 @@ wake_up_callback(void *probe_data, void *call_data, wakeup_func(probe_data, __rq, task, curr); } -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = ctx_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - - if (!tracer_enabled) - return; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - __trace_special(tr, data, arg1, arg2, arg3); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - static void sched_switch_reset(struct trace_array *tr) { int cpu; -- cgit v0.10.2 From 7e18d8e701b6798a5df11e0a16881a60ab1018b6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 00:22:19 -0400 Subject: ftrace: add function tracing to wake up tracing This patch adds function tracing to the functions that are called on the CPU of the task being traced. 
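(The subtle part of the handler added below is how it re-enables preemption without recursing into the scheduler; the guard pattern in isolation, distilled from the diff that follows:

	int resched = need_resched();	/* was a reschedule already pending? */
	preempt_disable_notrace();
	/* ... record the event ... */
	if (resched)
		preempt_enable_no_resched_notrace();	/* skip the resched check */
	else
		preempt_enable_notrace();

If NEED_RESCHED was already set on entry, a normal preempt_enable() could call back into the scheduler from inside the tracer, so the no_resched variant is used in that case.)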
Signed-off-by: Steven Rostedt Cc: pq@iki.fi Cc: proski@gnu.org Cc: sandmann@redhat.com Cc: a.p.zijlstra@chello.nl Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5d2fb48..bf7e91c 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -30,6 +30,69 @@ static DEFINE_SPINLOCK(wakeup_lock); static void __wakeup_reset(struct trace_array *tr); +#ifdef CONFIG_FTRACE +/* + * Like irqsoff, the wakeup tracer uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int resched; + int cpu; + + if (likely(!wakeup_task)) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (unlikely(disabled != 1)) + goto out; + + spin_lock_irqsave(&wakeup_lock, flags); + + if (unlikely(!wakeup_task)) + goto unlock; + + /* + * The task can't disappear because it needs to + * wake up first, and we have the wakeup_lock. + */ + if (task_cpu(wakeup_task) != cpu) + goto unlock; + + trace_function(tr, data, ip, parent_ip, flags); + + unlock: + spin_unlock_irqrestore(&wakeup_lock, flags); + + out: + atomic_dec(&data->disabled); + + /* + * To prevent recursion from the scheduler, if the + * resched flag was set before we entered, then + * don't reschedule. + */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = wakeup_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + /* * Should this new latency be reported/recorded? */ @@ -73,7 +136,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (next != wakeup_task) return; - /* The task we are waitng for is waking up */ + /* The task we are waiting for is waking up */ data = tr->data[wakeup_cpu]; /* disable local data, not wakeup_cpu data */ @@ -290,6 +353,7 @@ static void start_wakeup_tracer(struct trace_array *tr) smp_wmb(); tracer_enabled = 1; + register_ftrace_function(&trace_ops); return; fail_deprobe_wake_new: @@ -305,6 +369,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); marker_probe_unregister("kernel_sched_schedule", sched_switch_callback, &wakeup_trace); -- cgit v0.10.2 From da89a7a2536c46e76a1a4351a70a8b8417e5fed1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 00:22:20 -0400 Subject: ftrace: remove printks from irqsoff trace Printing out new max latencies was fine for the old RT tracer. But for mainline it is a bit messy. We also need to test if the run queue is locked before we can do the print. This means that we may not be printing out latencies if the run queue is locked on another CPU. This produces inconsistencies in the output. This patch simply removes the print altogether.
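(The maximum latency itself is still recorded; with the printk gone it is read from the tracing debugfs directory instead, using the same mount point this document uses elsewhere and assuming the tracing_max_latency file of this era's tracer; the value is reported in microseconds:

$ cat /debug/tracing/tracing_max_latency

Writing 0 to the same file resets the maximum so a new measurement can start.)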
Signed-off-by: Steven Rostedt Cc: pq@iki.fi Cc: proski@gnu.org Cc: sandmann@redhat.com Cc: a.p.zijlstra@chello.nl Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 761f3ec..421d6fe 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -165,22 +165,6 @@ check_critical_timing(struct trace_array *tr, update_max_tr_single(tr, current, cpu); - if (!runqueue_is_locked()) { - if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical" - " section violates %lu us threshold.\n", - current->comm, current->pid, - raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh)); - } else { - printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us" - " maximum-latency critical section.\n", - current->comm, current->pid, - raw_smp_processor_id(), - latency); - } - } - max_sequence++; out_unlock: -- cgit v0.10.2 From 41c52c0db9607e59f90da7da5309489fa06e887f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 11:46:33 -0400 Subject: ftrace: set_ftrace_notrace feature While debugging latencies in the RT kernel, I found that it would be nice to be able to filter functions away from the trace, rather than only filter on functions. I added a new interface to the debugfs tracing directory called set_ftrace_notrace. When dynamic ftrace is enabled, this lets you specify functions that will not be recorded in the trace. It is similar to adding 'notrace' to those functions, but without recompiling the kernel. Here's how set_ftrace_filter and set_ftrace_notrace interact. Remember, if set_ftrace_filter is set, it removes all functions from the trace except for those listed in set_ftrace_filter. set_ftrace_notrace will prevent those functions from being traced. If you were to set the same function in both set_ftrace_filter and set_ftrace_notrace, you would end up with an empty trace. The set of functions to trace is: if set_ftrace_filter is empty, all functions not in set_ftrace_notrace; otherwise, the functions in set_ftrace_filter that are not in set_ftrace_notrace. Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 922e23d..ffbbd54 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -48,6 +48,7 @@ enum { FTRACE_FL_FAILED = (1 << 1), FTRACE_FL_FILTER = (1 << 2), FTRACE_FL_ENABLED = (1 << 3), + FTRACE_FL_NOTRACE = (1 << 4), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89bd9a6..2552454 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -170,7 +170,7 @@ static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); -static DEFINE_MUTEX(ftrace_filter_lock); +static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; @@ -337,13 +337,12 @@ static void __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char *old, unsigned char *new, int enable) { - unsigned long ip; + unsigned long ip, fl; int failed; ip = rec->ip; if (ftrace_filtered && enable) { - unsigned long fl; /* * If filtering is on: * @@ -356,13 +355,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec, * If this record is not set to be filtered * and it is not enabled do nothing. * + * If this record is set not to trace then + * do nothing. + * + * If this record is not set to be filtered and * it is enabled, disable it.
*/ fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == 0)) + (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) return; /* @@ -380,9 +382,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } else { - if (enable) + if (enable) { + /* + * If this record is set not to trace and is + * not enabled, do nothing. + */ + fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); + if (fl == FTRACE_FL_NOTRACE) + return; + new = ftrace_call_replace(ip, FTRACE_ADDR); - else + } else old = ftrace_call_replace(ip, FTRACE_ADDR); if (enable) { @@ -721,6 +731,7 @@ static int __init ftrace_dyn_table_alloc(void) enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_CONT = (1 << 1), + FTRACE_ITER_NOTRACE = (1 << 2), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -754,7 +765,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) rec = &iter->pg->records[iter->idx++]; if ((rec->flags & FTRACE_FL_FAILED) || ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER))) { + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && + !(rec->flags & FTRACE_FL_NOTRACE))) { rec = NULL; goto retry; } @@ -847,22 +860,24 @@ int ftrace_avail_release(struct inode *inode, struct file *file) return 0; } -static void ftrace_filter_reset(void) +static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; struct dyn_ftrace *rec; + unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i; /* keep kstop machine from running */ preempt_disable(); - ftrace_filtered = 0; + if (enable) + ftrace_filtered = 0; pg = ftrace_pages_start; while (pg) { for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; if (rec->flags & FTRACE_FL_FAILED) continue; - rec->flags &= ~FTRACE_FL_FILTER; + rec->flags &= ~type; } pg = pg->next; } @@ -870,7 +885,7 @@ static void ftrace_filter_reset(void) } static int -ftrace_filter_open(struct inode *inode, struct file *file) +ftrace_regex_open(struct inode *inode, struct file *file, int enable) { struct ftrace_iterator *iter; int ret = 0; @@ -882,15 +897,16 @@ ftrace_filter_open(struct inode *inode, struct file *file) if (!iter) return -ENOMEM; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && !(file->f_flags & O_APPEND)) - ftrace_filter_reset(); + ftrace_filter_reset(enable); if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; iter->pos = -1; - iter->flags = FTRACE_ITER_FILTER; + iter->flags = enable ? 
FTRACE_ITER_FILTER : + FTRACE_ITER_NOTRACE; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -900,13 +916,25 @@ ftrace_filter_open(struct inode *inode, struct file *file) kfree(iter); } else file->private_data = iter; - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return ret; } +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 1); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 0); +} + static ssize_t -ftrace_filter_read(struct file *file, char __user *ubuf, +ftrace_regex_read(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) { if (file->f_mode & FMODE_READ) @@ -916,7 +944,7 @@ ftrace_filter_read(struct file *file, char __user *ubuf, } static loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int origin) +ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -936,13 +964,14 @@ enum { }; static void -ftrace_match(unsigned char *buff, int len) +ftrace_match(unsigned char *buff, int len, int enable) { char str[KSYM_SYMBOL_LEN]; char *search = NULL; struct ftrace_page *pg; struct dyn_ftrace *rec; int type = MATCH_FULL; + unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i, match = 0, search_len = 0; for (i = 0; i < len; i++) { @@ -966,7 +995,8 @@ ftrace_match(unsigned char *buff, int len) /* keep kstop machine from running */ preempt_disable(); - ftrace_filtered = 1; + if (enable) + ftrace_filtered = 1; pg = ftrace_pages_start; while (pg) { for (i = 0; i < pg->index; i++) { @@ -997,7 +1027,7 @@ ftrace_match(unsigned char *buff, int len) break; } if (matched) - rec->flags |= FTRACE_FL_FILTER; + rec->flags |= flag; } pg = pg->next; } @@ -1005,8 +1035,8 @@ ftrace_match(unsigned char *buff, int len) } static ssize_t -ftrace_filter_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; char ch; @@ -1016,7 +1046,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, if (!cnt || cnt < 0) return 0; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; @@ -1045,7 +1075,6 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, cnt--; } - if (isspace(ch)) { file->f_pos += read; ret = read; @@ -1072,7 +1101,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, if (isspace(ch)) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx); + ftrace_match(iter->buffer, iter->buffer_idx, enable); iter->buffer_idx = 0; } else iter->flags |= FTRACE_ITER_CONT; @@ -1082,11 +1111,39 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, ret = read; out: - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return ret; } +static ssize_t +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +static ssize_t +ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static void +ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +{ + if (unlikely(ftrace_disabled)) + return; + + 
mutex_lock(&ftrace_regex_lock); + if (reset) + ftrace_filter_reset(enable); + if (buf) + ftrace_match(buf, len, enable); + mutex_unlock(&ftrace_regex_lock); +} + /** * ftrace_set_filter - set a function to filter on in ftrace * @buf - the string that holds the function filter text. @@ -1098,24 +1155,31 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, */ void ftrace_set_filter(unsigned char *buf, int len, int reset) { - if (unlikely(ftrace_disabled)) - return; + ftrace_set_regex(buf, len, reset, 1); +} - mutex_lock(&ftrace_filter_lock); - if (reset) - ftrace_filter_reset(); - if (buf) - ftrace_match(buf, len); - mutex_unlock(&ftrace_filter_lock); +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 0); } static int -ftrace_filter_release(struct inode *inode, struct file *file) +ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { iter = m->private; @@ -1126,7 +1190,7 @@ ftrace_filter_release(struct inode *inode, struct file *file) if (iter->buffer_idx) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx); + ftrace_match(iter->buffer, iter->buffer_idx, enable); } mutex_lock(&ftrace_sysctl_lock); @@ -1137,10 +1201,22 @@ ftrace_filter_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_sysctl_lock); kfree(iter); - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return 0; } +static int +ftrace_filter_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 1); +} + +static int +ftrace_notrace_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 0); +} + static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1150,12 +1226,20 @@ static struct file_operations ftrace_avail_fops = { static struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, - .read = ftrace_filter_read, + .read = ftrace_regex_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, }; +static struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = ftrace_regex_read, + .write = ftrace_notrace_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_notrace_release, +}; + /** * ftrace_force_update - force an update to all recording ftrace functions * @@ -1239,6 +1323,12 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_filter' entry\n"); + + entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + NULL, &ftrace_notrace_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_notrace' entry\n"); return 0; } -- cgit v0.10.2 From 41bc8144d02028133bcd1d545023c6f49e8b2411 Mon Sep 17 00:00:00 2001 From: Steven Rostedt 
Date: Thu, 22 May 2008 11:49:22 -0400 Subject: ftrace: fix up cmdline recording The new work with converting the trace hooks over to markers broke the command line recording of ftrace. This patch fixes it again. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 583fe24..0feae23 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -652,9 +652,6 @@ static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; static DEFINE_SPINLOCK(trace_cmdline_lock); -/* trace in all context switches */ -atomic_t trace_record_cmdline_enabled __read_mostly; - /* temporary disable recording */ atomic_t trace_record_cmdline_disabled __read_mostly; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c460e85..6b8bd88 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -218,6 +218,8 @@ void trace_function(struct trace_array *tr, void tracing_start_function_trace(void); void tracing_stop_function_trace(void); +void tracing_start_cmdline_record(void); +void tracing_stop_cmdline_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); @@ -226,8 +228,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_max_latency; extern unsigned long tracing_thresh; -extern atomic_t trace_record_cmdline_enabled; - void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0a08465..7ee7dcd 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -29,14 +29,14 @@ static void function_reset(struct trace_array *tr) static void start_function_trace(struct trace_array *tr) { function_reset(tr); - atomic_inc(&trace_record_cmdline_enabled); + tracing_start_cmdline_record(); tracing_start_function_trace(); } static void stop_function_trace(struct trace_array *tr) { tracing_stop_function_trace(); - atomic_dec(&trace_record_cmdline_enabled); + tracing_stop_cmdline_record(); } static void function_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 798ec0d..c16935d 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,6 +29,9 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev, long disabled; int cpu; + tracing_record_cmdline(prev); + tracing_record_cmdline(next); + if (!tracer_enabled) return; @@ -63,8 +66,6 @@ sched_switch_callback(void *probe_data, void *call_data, prev = va_arg(*args, typeof(prev)); next = va_arg(*args, typeof(next)); - tracing_record_cmdline(prev); - /* * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. 
@@ -213,18 +214,26 @@ void tracing_stop_sched_switch(void) tracing_sched_unregister(); } +void tracing_start_cmdline_record(void) +{ + tracing_start_sched_switch(); +} + +void tracing_stop_cmdline_record(void) +{ + tracing_stop_sched_switch(); +} + static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); - atomic_inc(&trace_record_cmdline_enabled); tracer_enabled = 1; - tracing_start_sched_switch(); + tracing_start_cmdline_record(); } static void stop_sched_trace(struct trace_array *tr) { - tracing_stop_sched_switch(); - atomic_dec(&trace_record_cmdline_enabled); + tracing_stop_cmdline_record(); tracer_enabled = 0; } -- cgit v0.10.2 From ccbfac2923c9febaeaf07a50054027a92b502718 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 14:31:07 -0400 Subject: ftrace: powerpc clean ups This patch cleans up the ftrace code in PowerPC based on the comments from Michael Ellerman. Signed-off-by: Steven Rostedt Cc: Michael Ellerman Cc: proski@gnu.org Cc: a.p.zijlstra@chello.nl Cc: Pekka Paalanen Cc: Steven Rostedt Cc: linuxppc-dev@ozlabs.org Cc: Soeren Sandmann Pedersen Cc: paulus@samba.org Signed-off-by: Thomas Gleixner diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0e62218..3b1dd29 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1129,18 +1129,11 @@ _GLOBAL(_mcount) stw r5, 8(r1) LOAD_REG_ADDR(r5, ftrace_trace_function) -#if 0 - mtctr r3 - mr r1, r5 - bctrl -#endif lwz r5,0(r5) -#if 1 + mtctr r5 bctrl -#else - bl ftrace_stub -#endif + nop lwz r6, 8(r1) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 5a4993f..69ed412 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -51,10 +51,16 @@ notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static unsigned int op; + /* + * It would be nice to just use create_function_call, but that will + * update the code itself. Here we need to just return the + * instruction that is going to be modified, without modifying the + * code. 
+ */ addr = GET_ADDR(addr); /* Set to "bl addr" */ - op = 0x48000001 | (ftrace_calc_offset(ip, addr) & 0x03fffffe); + op = 0x48000001 | (ftrace_calc_offset(ip, addr) & 0x03fffffc); /* * No locking needed, this must be called via kstop_machine diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index cf6b5a7..4300db5 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -43,6 +43,7 @@ #include #include #include +#include #ifdef CONFIG_PPC32 extern void transfer_to_handler(void); @@ -68,6 +69,10 @@ EXPORT_SYMBOL(single_step_exception); EXPORT_SYMBOL(sys_sigreturn); #endif +#ifdef CONFIG_FTRACE +EXPORT_SYMBOL(_mcount); +#endif + EXPORT_SYMBOL(strcpy); EXPORT_SYMBOL(strncpy); EXPORT_SYMBOL(strcat); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 22f8e2b..19e8fcb 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -47,11 +47,6 @@ #include #endif -#ifdef CONFIG_FTRACE -extern void _mcount(void); -EXPORT_SYMBOL(_mcount); -#endif - extern void bootx_init(unsigned long r4, unsigned long phys); int boot_cpuid; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 277bf18..098fd96 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -85,11 +85,6 @@ struct ppc64_caches ppc64_caches = { }; EXPORT_SYMBOL_GPL(ppc64_caches); -#ifdef CONFIG_FTRACE -extern void _mcount(void); -EXPORT_SYMBOL(_mcount); -#endif - /* * These are used in binfmt_elf.c to put aux entries on the stack * for each elf executable being started. diff --git a/include/asm-powerpc/ftrace.h b/include/asm-powerpc/ftrace.h new file mode 100644 index 0000000..b1bfa70 --- /dev/null +++ b/include/asm-powerpc/ftrace.h @@ -0,0 +1,6 @@ +#ifndef _ASM_POWERPC_FTRACE +#define _ASM_POWERPC_FTRACE + +extern void _mcount(void); + +#endif -- cgit v0.10.2 From ffdaa3582b6b39d625d585d07e329ffdc925e971 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 24 May 2008 23:45:02 +0530 Subject: ftrace: safe traversal of ftrace_hash hlist Hi Steven, I noticed that concurrent instances of ftrace_record_ip() have a race between ftrace_hash list traversal during ftrace_ip_in_hash() (before acquiring ftrace_shutdown_lock) and ftrace_add_hash(). If it's so then this should fix it. Signed-off-by: Abhishek Sagar Cc: rostedt@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2552454..9b7c54f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -201,7 +201,7 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key) struct hlist_node *t; int found = 0; - hlist_for_each_entry(p, t, &ftrace_hash[key], node) { + hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) { if (p->ip == ip) { found = 1; break; @@ -214,7 +214,7 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key) static inline void ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) { - hlist_add_head(&node->node, &ftrace_hash[key]); + hlist_add_head_rcu(&node->node, &ftrace_hash[key]); } static void ftrace_free_rec(struct dyn_ftrace *rec) -- cgit v0.10.2 From 492a7ea5bcf263ee02a9eb6a3ab0222a1946fade Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 25 May 2008 00:10:04 +0530 Subject: ftrace: fix updating of ftrace_update_cnt Hi Ingo/Steven, Ftrace currently maintains an update count which includes false updates, i.e, updates which failed. 
If anything, such failures should be tracked by some separate variable, but this patch provides a minimal fix. Signed-off-by: Abhishek Sagar Cc: rostedt@goodmis.org Signed-off-by: Thomas Gleixner diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9b7c54f..1843edc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -453,7 +453,7 @@ static void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static void +static int ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; @@ -469,7 +469,9 @@ ftrace_code_disable(struct dyn_ftrace *rec) if (failed) { rec->flags |= FTRACE_FL_FAILED; ftrace_free_rec(rec); + return 0; } + return 1; } static int __ftrace_modify_code(void *data) @@ -617,8 +619,8 @@ static int __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p); - ftrace_update_cnt++; + if (ftrace_code_disable(p)) + ftrace_update_cnt++; } } -- cgit v0.10.2 From b1829d2705daa7cb72eb1e08bdc8b7e9fad34266 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2008 01:22:08 +0200 Subject: ftrace: fix merge Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ffbbd54..b482fe8 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -122,7 +122,7 @@ static inline void tracer_disable(void) # define trace_preempt_off(a0, a1) do { } while (0) #endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER +#ifdef CONFIG_TRACING extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); #else -- cgit v0.10.2 From 014c257cce65e9d1cd2d28ec1c89a37c536b151d Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 31 May 2008 14:23:50 +0530 Subject: ftrace: core support for ARM Core ftrace support for the ARM architecture, which includes support for dynamic function tracing. Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b786e68..3845e5c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -14,6 +14,8 @@ config ARM select HAVE_OPROFILE select HAVE_KPROBES if (!XIP_KERNEL) select HAVE_KRETPROBES if (HAVE_KPROBES) + select HAVE_FTRACE if (!XIP_KERNEL) + select HAVE_DYNAMIC_FTRACE if (HAVE_FTRACE) help The ARM series is a line of low-power-consumption RISC chip designs licensed by ARM Ltd and targeted at embedded applications and diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index de9d9ee..95baac4 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -69,6 +69,12 @@ SEDFLAGS = s/TEXT_START/$(ZTEXTADDR)/;s/BSS_START/$(ZBSSADDR)/ targets := vmlinux vmlinux.lds piggy.gz piggy.o font.o font.c \ head.o misc.o $(OBJS) + +ifeq ($(CONFIG_FTRACE),y) +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) +endif + EXTRA_CFLAGS := -fpic -fno-builtin EXTRA_AFLAGS := diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index ad455ff..eb9092c 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -4,6 +4,10 @@ AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) +ifdef CONFIG_DYNAMIC_FTRACE +CFLAGS_REMOVE_ftrace.o = -pg +endif + # Object file lists. 
obj-y := compat.o entry-armv.o entry-common.o irq.o \ @@ -18,6 +22,7 @@ obj-$(CONFIG_ARTHUR) += arthur.o obj-$(CONFIG_ISA_DMA) += dma-isa.o obj-$(CONFIG_PCI) += bios32.o isa.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_KPROBES) += kprobes.o kprobes-decode.o obj-$(CONFIG_ATAGS_PROC) += atags.o diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 688b7b1..3b13221 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -48,6 +48,11 @@ extern void __aeabi_ulcmp(void); extern void fpundefinstr(void); extern void fp_enter(void); +#ifdef CONFIG_FTRACE +extern void mcount(void); +EXPORT_SYMBOL(mcount); +#endif + /* * This has a special calling convention; it doesn't * modify any of the usual registers, except for LR. diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 597ed00..8f79a47 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -99,6 +99,53 @@ ENTRY(ret_from_fork) #undef CALL #define CALL(x) .long x +#ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + stmdb sp!, {r0-r3, lr} + mov r0, lr + + .globl mcount_call +mcount_call: + bl ftrace_stub + ldmia sp!, {r0-r3, pc} + +ENTRY(ftrace_caller) + stmdb sp!, {r0-r3, lr} + ldr r1, [fp, #-4] + mov r0, lr + + .globl ftrace_call +ftrace_call: + bl ftrace_stub + ldmia sp!, {r0-r3, pc} + +#else + +ENTRY(mcount) + stmdb sp!, {r0-r3, lr} + ldr r0, =ftrace_trace_function + ldr r2, [r0] + adr r0, ftrace_stub + cmp r0, r2 + bne trace + ldmia sp!, {r0-r3, pc} + +trace: + ldr r1, [fp, #-4] + mov r0, lr + mov lr, pc + mov pc, r2 + ldmia sp!, {r0-r3, pc} + +#endif /* CONFIG_DYNAMIC_FTRACE */ + + .globl ftrace_stub +ftrace_stub: + mov pc, lr + +#endif /* CONFIG_FTRACE */ + /*============================================================================= * SWI handler *----------------------------------------------------------------------------- diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c new file mode 100644 index 0000000..f4cb4cc --- /dev/null +++ b/arch/arm/kernel/ftrace.c @@ -0,0 +1,128 @@ +/* + * Dynamic function tracing support. + * + * Copyright (C) 2008 Abhishek Sagar + * + * For licencing details, see COPYING. + * + * Defines low-level handling of mcount calls when the kernel + * is compiled with the -pg flag. When using dynamic ftrace, the + * mcount call-sites get patched lazily with NOP till they are + * enabled. All code mutation routines here take effect atomically. + */ + +#include +#include + +#define INSN_SIZE 4 +#define PC_OFFSET 8 +#define BL_OPCODE 0xeb000000 +#define BL_OFFSET_MASK 0x00ffffff + +static unsigned long bl_insn; +static const unsigned long NOP = 0xe1a00000; /* mov r0, r0 */ + +/* return true if mcount call site is already patched/no-op'ed */ +int ftrace_ip_converted(unsigned long pc) +{ + unsigned long save; + + pc -= INSN_SIZE; + save = *(unsigned long *)pc; + return save == NOP; +} + +unsigned char *ftrace_nop_replace(void) +{ + return (char *)&NOP; +} + +/* construct a branch (BL) instruction to addr */ +unsigned char *ftrace_call_replace(unsigned long pc, unsigned long addr) +{ + long offset; + + offset = (long)addr - (long)(pc - INSN_SIZE + PC_OFFSET); + if (unlikely(offset < -33554432 || offset > 33554428)) { + /* Can't generate branches that far (from ARM ARM). Ftrace + * doesn't generate branches outside of core kernel text. 
+ */ + WARN_ON_ONCE(1); + return NULL; + } + offset = (offset >> 2) & BL_OFFSET_MASK; + bl_insn = BL_OPCODE | offset; + return (unsigned char *)&bl_insn; +} + +int ftrace_modify_code(unsigned long pc, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned long err = 0, replaced = 0, old, new; + + old = *(unsigned long *)old_code; + new = *(unsigned long *)new_code; + pc -= INSN_SIZE; + + __asm__ __volatile__ ( + "1: ldr %1, [%2] \n" + " cmp %1, %4 \n" + "2: streq %3, [%2] \n" + " cmpne %1, %3 \n" + " movne %0, #2 \n" + "3:\n" + + ".section .fixup, \"ax\"\n" + "4: mov %0, #1 \n" + " b 3b \n" + ".previous\n" + + ".section __ex_table, \"a\"\n" + " .long 1b, 4b \n" + " .long 2b, 4b \n" + ".previous\n" + + : "=r"(err), "=r"(replaced) + : "r"(pc), "r"(new), "r"(old), "0"(err), "1"(replaced) + : "memory"); + + if (!err && (replaced == old)) + flush_icache_range(pc, pc + INSN_SIZE); + + return err; +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + int ret; + unsigned long pc, old; + unsigned char *new; + + pc = (unsigned long)&ftrace_call; + pc += INSN_SIZE; + memcpy(&old, &ftrace_call, INSN_SIZE); + new = ftrace_call_replace(pc, (unsigned long)func); + ret = ftrace_modify_code(pc, (unsigned char *)&old, new); + return ret; +} + +int ftrace_mcount_set(unsigned long *data) +{ + unsigned long pc, old; + unsigned long *addr = data; + unsigned char *new; + + pc = (unsigned long)&mcount_call; + pc += INSN_SIZE; + memcpy(&old, &mcount_call, INSN_SIZE); + new = ftrace_call_replace(pc, *addr); + *addr = ftrace_modify_code(pc, (unsigned char *)&old, new); + return 0; +} + +/* run from kstop_machine */ +int __init ftrace_dyn_arch_init(void *data) +{ + ftrace_mcount_set(data); + return 0; +} -- cgit v0.10.2 From 76094a2cf46e4ab776055d4086615b884408568c Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Wed, 28 May 2008 00:03:18 +0530 Subject: ftrace: distinguish kretprobe'd functions in trace logs Tracing functions via ftrace which have a kretprobe installed on them can produce misleading output in their trace logs. E.g., consider the correct trace of the following sequence: do_IRQ() { ~ irq_enter(); ~ } Trace log (sample): -0 [00] 4154504455.781616: irq_enter <- do_IRQ But if irq_enter() has a kretprobe installed on it, the return address stored on the stack at each invocation is modified to divert the return to a kprobe trampoline function called kretprobe_trampoline(). So with this the trace would (currently) look like: -0 [00] 4154504455.781616: irq_enter <- kretprobe_trampoline Now this is quite misleading to the end user, as it suggests something that didn't actually happen.
So just to avoid such misinterpretations, the inlined patch aims to output such a log as: -0 [00] 4154504455.781616: irq_enter <- [unknown/kretprobe'd] Signed-off-by: Abhishek Sagar Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0feae23..12f5e81 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1199,6 +1200,20 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&trace_types_lock); } +#define KRETPROBE_MSG "[unknown/kretprobe'd]" + +#ifdef CONFIG_KRETPROBES +static inline int kretprobed(unsigned long addr) +{ + return addr == (unsigned long)kretprobe_trampoline; +} +#else +static inline int kretprobed(unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_KRETPROBES */ + static int seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { @@ -1434,7 +1449,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) case TRACE_FN: seq_print_ip_sym(s, entry->fn.ip, sym_flags); trace_seq_puts(s, " ("); - seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + if (kretprobed(entry->fn.parent_ip)) + trace_seq_puts(s, KRETPROBE_MSG); + else + seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); trace_seq_puts(s, ")\n"); break; case TRACE_CTX: @@ -1514,8 +1532,11 @@ static int print_trace_fmt(struct trace_iterator *iter) ret = trace_seq_printf(s, " <-"); if (!ret) return 0; - ret = seq_print_ip_sym(s, entry->fn.parent_ip, - sym_flags); + if (kretprobed(entry->fn.parent_ip)) + ret = trace_seq_puts(s, KRETPROBE_MSG); + else + ret = seq_print_ip_sym(s, entry->fn.parent_ip, + sym_flags); if (!ret) return 0; } -- cgit v0.10.2 From ad90c0e3ce8d20d6873b57e36181ef6d7a0097fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2008 20:48:37 -0400 Subject: ftrace: user update and disable dynamic ftrace daemon In dynamic ftrace, the mcount function starts off pointing to a stub function that just returns. On start up, the call to the stub is modified to point to a "record_ip" function. The job of the record_ip function is to add the function to a pre-allocated hash list. If the function is already there, it simply is ignored, otherwise it is added to the list. Later, a ftraced daemon wakes up and calls kstop_machine if any functions have been recorded, and changes the calls to the recorded functions to a simple nop. If no functions were recorded, the daemon goes back to sleep. The daemon wakes up once a second to see if it needs to update any newly recorded functions into nops. Usually it does not, but if a lot of code has been executed for the first time in the kernel, the ftraced daemon will call kstop_machine to update those into nops. The problem currently is that there's no way to stop the daemon from doing this, and it can cause unneeded latencies (800us, which for some is bothersome). This patch adds a new file /debugfs/tracing/ftraced_enabled. Reading this file returns "enabled\n" while the daemon is active and "disabled\n" when it is not running. To disable the daemon, the user can echo "0" or "disable" into this file, and "1" or "enable" to re-enable the daemon. Since the daemon is used to convert the functions into nops to increase the performance of the system, I also added that anytime something is written into the ftraced_enabled file, kstop_machine will run if there are new functions that have been detected that need to be converted.
This way the user can disable the daemon but still be able to control the conversion of the mcount calls to nops by simply running "echo 0 > /debugfs/tracing/ftraced_enabled" when they need to do more conversions. To see the number of converted functions: "cat /debugfs/tracing/dyn_ftrace_total_info" Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b482fe8..6238194 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -72,9 +72,15 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); + +void ftrace_disable_daemon(void); +void ftrace_enable_daemon(void); + #else # define ftrace_force_update() ({ 0; }) # define ftrace_set_filter(buf, len, reset) do { } while (0) +# define ftrace_disable_daemon() do { } while (0) +# define ftrace_enable_daemon() do { } while (0) #endif /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1843edc..f762f5a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,8 +151,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE static struct task_struct *ftraced_task; -static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters); -static unsigned long ftraced_iteration_counter; enum { FTRACE_ENABLE_CALLS = (1 << 0), @@ -189,6 +187,7 @@ static struct ftrace_page *ftrace_pages; static int ftraced_trigger; static int ftraced_suspend; +static int ftraced_stop; static int ftrace_record_suspend; @@ -474,14 +473,21 @@ ftrace_code_disable(struct dyn_ftrace *rec) return 1; } +static int __ftrace_update_code(void *ignore); + static int __ftrace_modify_code(void *data) { unsigned long addr; int *command = data; - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_ENABLE_CALLS) { + /* + * Update any recorded ips now that we have the + * machine stopped + */ + __ftrace_update_code(NULL); ftrace_replace_code(1); - else if (*command & FTRACE_DISABLE_CALLS) + } else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); if (*command & FTRACE_UPDATE_TRACE_FUNC) @@ -503,6 +509,25 @@ static void ftrace_run_update_code(int command) stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } +void ftrace_disable_daemon(void) +{ + /* Stop the daemon from calling kstop_machine */ + mutex_lock(&ftraced_lock); + ftraced_stop = 1; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +void ftrace_enable_daemon(void) +{ + mutex_lock(&ftraced_lock); + ftraced_stop = 0; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + static ftrace_func_t saved_ftrace_func; static void ftrace_startup(void) @@ -603,6 +628,7 @@ static int __ftrace_update_code(void *ignore) int i; /* Don't be recording funcs now */ + ftrace_record_suspend++; save_ftrace_enabled = ftrace_enabled; ftrace_enabled = 0; @@ -628,18 +654,23 @@ static int __ftrace_update_code(void *ignore) stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; + ftraced_trigger = 0; ftrace_enabled = save_ftrace_enabled; + ftrace_record_suspend--; return 0; } -static void ftrace_update_code(void) +static int ftrace_update_code(void) { - if (unlikely(ftrace_disabled)) - return; + if (unlikely(ftrace_disabled) || + !ftrace_enabled || !ftraced_trigger) + return 0; stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + + return 1; } static int ftraced(void
*ignore) @@ -658,14 +689,13 @@ static int ftraced(void *ignore) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { - ftrace_record_suspend++; - ftrace_update_code(); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { usecs = nsecs_to_usecs(ftrace_update_time); if (ftrace_update_tot_cnt > 100000) { ftrace_update_tot_cnt = 0; pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", + " (%lu total) in %lu usec%s\n", ftrace_update_cnt, ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, @@ -673,15 +703,10 @@ static int ftraced(void *ignore) ftrace_disabled = 1; WARN_ON_ONCE(1); } - ftraced_trigger = 0; - ftrace_record_suspend--; } - ftraced_iteration_counter++; mutex_unlock(&ftraced_lock); mutex_unlock(&ftrace_sysctl_lock); - wake_up_interruptible(&ftraced_waiters); - ftrace_shutdown_replenish(); } __set_current_state(TASK_RUNNING); @@ -1219,6 +1244,55 @@ ftrace_notrace_release(struct inode *inode, struct file *file) return ftrace_regex_release(inode, file, 0); } +static ssize_t +ftraced_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* don't worry about races */ + char *buf = ftraced_stop ? "disabled\n" : "enabled\n"; + int r = strlen(buf); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +ftraced_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + if (strncmp(buf, "enable", 6) == 0) + val = 1; + else if (strncmp(buf, "disable", 7) == 0) + val = 0; + else { + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + } + + if (val) + ftrace_enable_daemon(); + else + ftrace_disable_daemon(); + + filp->f_pos += cnt; + + return cnt; +} + static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1242,51 +1316,34 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; +static struct file_operations ftraced_fops = { + .open = tracing_open_generic, + .read = ftraced_read, + .write = ftraced_write, +}; + /** * ftrace_force_update - force an update to all recording ftrace functions - * - * The ftrace dynamic update daemon only wakes up once a second. - * There may be cases where an update needs to be done immediately - * for tests or internal kernel tracing to begin. This function - * wakes the daemon to do an update and will not return until the - * update is complete. */ int ftrace_force_update(void) { - unsigned long last_counter; - DECLARE_WAITQUEUE(wait, current); int ret = 0; if (unlikely(ftrace_disabled)) return -ENODEV; + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - last_counter = ftraced_iteration_counter; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ftraced_waiters, &wait); - if (unlikely(!ftraced_task)) { - ret = -ENODEV; - goto out; - } - - do { - mutex_unlock(&ftraced_lock); - wake_up_process(ftraced_task); - schedule(); - mutex_lock(&ftraced_lock); - if (signal_pending(current)) { - ret = -EINTR; - break; - } - set_current_state(TASK_INTERRUPTIBLE); - } while (last_counter == ftraced_iteration_counter); + /* + * If ftraced_trigger is not set, then there is nothing + * to update. 
+ */ + if (ftraced_trigger && !ftrace_update_code()) + ret = -EBUSY; - out: mutex_unlock(&ftraced_lock); - remove_wait_queue(&ftraced_waiters, &wait); - set_current_state(TASK_RUNNING); + mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1331,6 +1388,12 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); + + entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, + NULL, &ftraced_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ftraced_enabled' entry\n"); return 0; } -- cgit v0.10.2 From e0773410247f1e5fc6f7c52a4c5f3c6c9873d527 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 31 May 2008 14:24:02 +0530 Subject: ftrace: export kretprobe_trampoline for function tracer Follow suit from kprobe implementations on other archs and make kretprobe_trampoline non-static. Ftrace implementation (more specifically, kernel/trace/trace.c) requires access to it (see http://kerneltrap.org/mailarchive/linux-kernel/2008/5/27/1955234). Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar diff --git a/arch/arm/kernel/kprobes.c b/arch/arm/kernel/kprobes.c index 5593dd2..5ee39e1 100644 --- a/arch/arm/kernel/kprobes.c +++ b/arch/arm/kernel/kprobes.c @@ -274,7 +274,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, * for kretprobe handlers which should normally be interested in r0 only * anyway. */ -static void __attribute__((naked)) __kprobes kretprobe_trampoline(void) +void __naked __kprobes kretprobe_trampoline(void) { __asm__ __volatile__ ( "stmdb sp!, {r0 - r11} \n\t" diff --git a/include/asm-arm/kprobes.h b/include/asm-arm/kprobes.h index c042194..b1a3787 100644 --- a/include/asm-arm/kprobes.h +++ b/include/asm-arm/kprobes.h @@ -59,6 +59,7 @@ struct kprobe_ctlblk { }; void arch_remove_kprobe(struct kprobe *); +void kretprobe_trampoline(void); int kprobe_trap_handler(struct pt_regs *regs, unsigned int instr); int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr); -- cgit v0.10.2 From 0eb967012ea15e6e8cfab483d9fa37bc602d400c Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 1 Jun 2008 21:47:30 +0530 Subject: ftrace: prevent freeing of all failed updates Prevent freeing of records which cause problems and correspond to functions from core kernel text. A new flag, FTRACE_FL_CONVERTED, is used to mark a record as "converted". All other records are patched lazily to NOPs. Failed records now also remain on the ftrace_hash table. Each invocation of ftrace_record_ip now checks whether the traced function has ever been recorded (including past failures) and doesn't re-record it.
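In outline, the conversion step described above now reduces to roughly the following (a simplified sketch of the logic, not the literal hunks below):

	/* sketch: convert one recorded call-site; keep failures around */
	if (ftrace_code_disable(rec)) {
		rec->flags |= FTRACE_FL_CONVERTED;	/* site patched to a NOP */
		ftrace_update_cnt++;
	} else {
		rec->flags |= FTRACE_FL_FAILED;		/* stays on ftrace_hash, never re-recorded */
	}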
Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6238194..20e14d0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -49,6 +49,7 @@ enum { FTRACE_FL_FILTER = (1 << 2), FTRACE_FL_ENABLED = (1 << 3), FTRACE_FL_NOTRACE = (1 << 4), + FTRACE_FL_CONVERTED = (1 << 5), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f762f5a..ec54cb7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -216,6 +216,12 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head_rcu(&node->node, &ftrace_hash[key]); } +/* called from kstop_machine */ +static inline void ftrace_del_hash(struct dyn_ftrace *node) +{ + hlist_del(&node->node); +} + static void ftrace_free_rec(struct dyn_ftrace *rec) { /* no locking, only called from kstop_machine */ @@ -332,12 +338,11 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(ftrace_caller)) #define MCOUNT_ADDR ((long)(mcount)) -static void +static int __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char *old, unsigned char *new, int enable) { unsigned long ip, fl; - int failed; ip = rec->ip; @@ -364,7 +369,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) - return; + return 0; /* * If it is enabled disable it, @@ -388,7 +393,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, */ fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); if (fl == FTRACE_FL_NOTRACE) - return; + return 0; new = ftrace_call_replace(ip, FTRACE_ADDR); } else @@ -396,34 +401,24 @@ __ftrace_replace_code(struct dyn_ftrace *rec, if (enable) { if (rec->flags & FTRACE_FL_ENABLED) - return; + return 0; rec->flags |= FTRACE_FL_ENABLED; } else { if (!(rec->flags & FTRACE_FL_ENABLED)) - return; + return 0; rec->flags &= ~FTRACE_FL_ENABLED; } } - failed = ftrace_modify_code(ip, old, new); - if (failed) { - unsigned long key; - /* It is possible that the function hasn't been converted yet */ - key = hash_long(ip, FTRACE_HASHBITS); - if (!ftrace_ip_in_hash(ip, key)) { - rec->flags |= FTRACE_FL_FAILED; - ftrace_free_rec(rec); - } - - } + return ftrace_modify_code(ip, old, new); } static void ftrace_replace_code(int enable) { + int i, failed; unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - int i; if (enable) old = ftrace_nop_replace(); @@ -438,7 +433,15 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FAILED) continue; - __ftrace_replace_code(rec, old, new, enable); + failed = __ftrace_replace_code(rec, old, new, enable); + if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { + rec->flags |= FTRACE_FL_FAILED; + if ((system_state == SYSTEM_BOOTING) || + !kernel_text_address(rec->ip)) { + ftrace_del_hash(rec); + ftrace_free_rec(rec); + } + } } } } @@ -467,7 +470,6 @@ ftrace_code_disable(struct dyn_ftrace *rec) failed = ftrace_modify_code(ip, call, nop); if (failed) { rec->flags |= FTRACE_FL_FAILED; - ftrace_free_rec(rec); return 0; } return 1; @@ -621,8 +623,7 @@ unsigned long ftrace_update_tot_cnt; static int __ftrace_update_code(void *ignore) { struct dyn_ftrace *p; - struct hlist_head head; - struct hlist_node *t; + struct hlist_node *t, *n; int save_ftrace_enabled; cycle_t start, stop; int i; @@ -637,18 +638,33 @@ static int __ftrace_update_code(void *ignore) /* No locks needed, the machine is stopped! 
*/ for (i = 0; i < FTRACE_HASHSIZE; i++) { - if (hlist_empty(&ftrace_hash[i])) - continue; + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry_safe(p, t, n, &ftrace_hash[i], node) { + /* Skip over failed records which have not been + * freed. */ + if (p->flags & FTRACE_FL_FAILED) + continue; - head = ftrace_hash[i]; - INIT_HLIST_HEAD(&ftrace_hash[i]); + /* Unconverted records are always at the head of the + * hash bucket. Once we encounter a converted record, + * simply skip over to the next bucket. Saves ftraced + * some processor cycles (ftrace does its bid for + * global warming :-p ). */ + if (p->flags & (FTRACE_FL_CONVERTED)) + break; - /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry(p, t, &head, node) { - if (ftrace_code_disable(p)) + if (ftrace_code_disable(p)) { + p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; - } + } else { + if ((system_state == SYSTEM_BOOTING) || + !kernel_text_address(p->ip)) { + ftrace_del_hash(p); + ftrace_free_rec(p); + } + } + } } stop = ftrace_now(raw_smp_processor_id()); -- cgit v0.10.2 From 1d74f2a0f64b4091e5e91b55ac1b17dff93f4b59 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 1 Jun 2008 21:47:42 +0530 Subject: ftrace: remove ftrace_ip_converted() Remove the unneeded function ftrace_ip_converted(). Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index f4cb4cc..22f3d6e 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -22,16 +22,6 @@ static unsigned long bl_insn; static const unsigned long NOP = 0xe1a00000; /* mov r0, r0 */ -/* return true if mcount call site is already patched/no-op'ed */ -int ftrace_ip_converted(unsigned long pc) -{ - unsigned long save; - - pc -= INSN_SIZE; - save = *(unsigned long *)pc; - return save == NOP; -} - unsigned char *ftrace_nop_replace(void) { return (char *)&NOP; diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 69ed412..e12c593 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -27,16 +27,6 @@ static unsigned int ftrace_nop = 0x60000000; # define GET_ADDR(addr) *(unsigned long *)addr #endif -notrace int ftrace_ip_converted(unsigned long ip) -{ - unsigned int save; - - ip -= CALL_BACK; - save = *(unsigned int *)ip; - - return save == ftrace_nop; -} - static unsigned int notrace ftrace_calc_offset(long ip, long addr) { return (int)((addr + CALL_BACK) - ip); diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c index f449e6d..c173731 100644 --- a/arch/sparc64/kernel/ftrace.c +++ b/arch/sparc64/kernel/ftrace.c @@ -7,13 +7,6 @@ static const u32 ftrace_nop = 0x01000000; -notrace int ftrace_ip_converted(unsigned long ip) -{ - u32 insn = *(u32 *) ip; - - return (insn == ftrace_nop); -} - notrace unsigned char *ftrace_nop_replace(void) { return (char *)&ftrace_nop; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 498608c..bc5cf8d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -31,16 +31,6 @@ union ftrace_code_union { } __attribute__((packed)); }; -notrace int ftrace_ip_converted(unsigned long ip) -{ - unsigned long save; - - ip -= CALL_BACK; - save = *(long *)ip; - - return save == *ftrace_nop; -} - static int notrace ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ec54cb7..a8929e4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -306,13 
+306,6 @@ ftrace_record_ip(unsigned long ip) if (ftrace_ip_in_hash(ip, key)) goto out_unlock; - /* - * There's a slight race that the ftraced will update the - * hash and reset here. If it is already converted, skip it. - */ - if (ftrace_ip_converted(ip)) - goto out_unlock; - node = ftrace_alloc_dyn_node(ip); if (!node) goto out_unlock; -- cgit v0.10.2 From eb9a7bf09172f409c10ec9560adeea95bb4045f5 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 1 Jun 2008 21:47:54 +0530 Subject: ftrace: add debugfs entry 'failures' Identify functions whose mcount call-site updates failed. This can help us track functions which ftrace shouldn't fiddle with, and are thus not being traced. If there is no race with any external agent which is modifying the mcount call-site, then this file displays no entries (normal case). Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a8929e4..ad568c7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -768,6 +768,7 @@ enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_CONT = (1 << 1), FTRACE_ITER_NOTRACE = (1 << 2), + FTRACE_ITER_FAILURES = (1 << 3), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -799,9 +800,16 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((rec->flags & FTRACE_FL_FAILED) || + if ((!(iter->flags & FTRACE_ITER_FAILURES) && + (rec->flags & FTRACE_FL_FAILED)) || + + ((iter->flags & FTRACE_ITER_FAILURES) && + (!(rec->flags & FTRACE_FL_FAILED) || + (rec->flags & FTRACE_FL_FREE))) || + ((iter->flags & FTRACE_ITER_FILTER) && !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))) { rec = NULL; @@ -896,6 +904,24 @@ int ftrace_avail_release(struct inode *inode, struct file *file) return 0; } +static int +ftrace_failures_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + struct ftrace_iterator *iter; + + ret = ftrace_avail_open(inode, file); + if (!ret) { + m = (struct seq_file *)file->private_data; + iter = (struct ftrace_iterator *)m->private; + iter->flags = FTRACE_ITER_FAILURES; + } + + return ret; +} + + static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; @@ -1309,6 +1335,13 @@ static struct file_operations ftrace_avail_fops = { .release = ftrace_avail_release, }; +static struct file_operations ftrace_failures_fops = { + .open = ftrace_failures_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, +}; + static struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = ftrace_regex_read, @@ -1386,6 +1419,11 @@ static __init int ftrace_init_debugfs(void) pr_warning("Could not create debugfs " "'available_filter_functions' entry\n"); + entry = debugfs_create_file("failures", 0444, + d_tracer, NULL, &ftrace_failures_fops); + if (!entry) + pr_warning("Could not create debugfs 'failures' entry\n"); + entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, NULL, &ftrace_filter_fops); if (!entry) -- cgit v0.10.2 From 34078a5e44db3cbed2e0ed580c29a39d94e0cd97 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Tue, 3 Jun 2008 08:33:41 +0530 Subject: ftrace: prevent freeing of all failed updates Steven Rostedt wrote: > If we unload a module and reload it, will it ever get converted again? The intent was always to filter core kernel functions to prevent their freeing.
Here's a fix which should allow re-recording of module call-sites. Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ad568c7..0118979 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -430,7 +430,7 @@ static void ftrace_replace_code(int enable) if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || - !kernel_text_address(rec->ip)) { + !core_kernel_text(rec->ip)) { ftrace_del_hash(rec); ftrace_free_rec(rec); } @@ -651,10 +651,9 @@ static int __ftrace_update_code(void *ignore) ftrace_update_cnt++; } else { if ((system_state == SYSTEM_BOOTING) || - !kernel_text_address(p->ip)) { + !core_kernel_text(p->ip)) { ftrace_del_hash(p); ftrace_free_rec(p); - } } } -- cgit v0.10.2 From 040ec23d07f95285e9777a85cda29cb339a3065b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jun 2008 01:45:29 -0700 Subject: sched: sched_clock() lockdep fix Sitsofe Wheeler bisected the following commit to cause lockdep to warn about itself and turn itself off: > commit c6531cce6e6e4b99bcda46b6268d6f2d9e30aea4 > Author: Ingo Molnar > Date: Mon May 12 21:21:14 2008 +0200 > > sched: do not trace sched_clock do not use raw irq flags in cpu_clock() as it causes lockdep to lose track of the true state of the IRQ flag. Reported-and-bisected-by: Sitsofe Wheeler Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton diff --git a/kernel/sched.c b/kernel/sched.c index 6590a82..b8c9fe6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -889,7 +889,7 @@ unsigned long long notrace cpu_clock(int cpu) unsigned long long prev_cpu_time, time, delta_time; unsigned long flags; - raw_local_irq_save(flags); + local_irq_save(flags); prev_cpu_time = per_cpu(prev_cpu_time, cpu); time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); delta_time = time-prev_cpu_time; @@ -898,7 +898,7 @@ unsigned long long notrace cpu_clock(int cpu) time = __sync_cpu_clock(time, cpu); per_cpu(prev_cpu_time, cpu) = time; } - raw_local_irq_restore(flags); + local_irq_restore(flags); return time; } -- cgit v0.10.2 From 2b1bce1787700768cbc87c8509851c6f49d252dc Mon Sep 17 00:00:00 2001 From: Ankita Garg Date: Mon, 9 Jun 2008 14:10:25 +0530 Subject: ftrace: disable tracing when current_tracer is set to "none" Found that in spite of setting the current_tracer to "none", trace from the previous trace type continued to be collected. The patch below fixes this and causes the trace to be disabled when the "none" type is selected. Compile and boot tested the patch for functionality.
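In outline, the fix gives the dummy "none" tracer a real init callback (a condensed sketch of the patch below):

	/* sketch: selecting "none" now resets the buffers and stops collection */
	static struct tracer no_tracer __read_mostly = {
		.name = "none",
		.init = no_trace_init,	/* resets per-cpu buffers, clears tracer_enabled */
	};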
Signed-off-by: Ankita Garg Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 12f5e81..dde6f0a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -43,11 +43,6 @@ static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu_mask(cpu, tracing_buffer_mask) -/* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = { - .name = "none", -}; - static int trace_alloc_page(void); static int trace_free_page(void); @@ -135,6 +130,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds iter_ctrl options */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; +static notrace void no_trace_init(struct trace_array *tr) +{ + int cpu; + + if(tr->ctrl) + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); + tracer_enabled = 0; +} + +/* dummy trace to disable tracing */ +static struct tracer no_tracer __read_mostly = { + .name = "none", + .init = no_trace_init +}; + + /** * trace_wake_up - wake up tasks waiting for trace input * -- cgit v0.10.2 From 20764ff1efb440640353053ec83263e69e1259e0 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Jun 2008 11:27:03 +0200 Subject: ftrace: fix printout Do not print loglevel before "entries of %ld bytes". Move it to the previous pr_info. Signed-off-by: Jiri Slaby Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dde6f0a..6e9dae7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3051,9 +3051,8 @@ __init static int tracer_alloc_buffers(void) } max_tr.entries = global_trace.entries; - pr_info("tracer: %d pages allocated for %ld", - pages, trace_nr_entries); - pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE); + pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n", + pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); pr_info(" actual entries %ld\n", global_trace.entries); tracer_init_debugfs(); -- cgit v0.10.2 From a4500b84c51645bbc86be3ca84f2252b7ada060f Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 14 Jun 2008 11:59:39 +0530 Subject: ftrace: fix "notrace" filtering priority This is a fix to give notrace filter rules priority over "set_ftrace_filter" rules. This fix ensures that functions which are set to be filtered and are concurrently marked as "notrace" don't get recorded. As of now, if a record is marked as FTRACE_FL_FILTER and is enabled, then the notrace flag is not checked. Tested on x86-32. Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0118979..b532e4a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -355,20 +355,26 @@ __ftrace_replace_code(struct dyn_ftrace *rec, * If this record is set not to trace then * do nothing. * + * If this record is set not to trace and + * it is enabled then disable it. + * * If this record is not set to be filtered and * it is enabled, disable it. */ - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); + + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | + FTRACE_FL_ENABLED); if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) + (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || + !fl || (fl == FTRACE_FL_NOTRACE)) return 0; /* * If it is enabled disable it, * otherwise enable it! 
*/ - if (fl == FTRACE_FL_ENABLED) { + if (fl & FTRACE_FL_ENABLED) { /* swap new and old */ new = old; old = ftrace_call_replace(ip, FTRACE_ADDR); -- cgit v0.10.2 From f22529351f7060d61eff3b76d7c9706f90aaedf3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 22 May 2008 10:37:48 +0200 Subject: namespacecheck: fixes Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b532e4a..0d5bcf6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -50,7 +50,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index c16935d..93a6620 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -196,7 +196,7 @@ static void tracing_sched_unregister(void) &ctx_trace); } -void tracing_start_sched_switch(void) +static void tracing_start_sched_switch(void) { long ref; @@ -205,7 +205,7 @@ void tracing_start_sched_switch(void) tracing_sched_register(); } -void tracing_stop_sched_switch(void) +static void tracing_stop_sched_switch(void) { long ref; -- cgit v0.10.2 From ee4311adf105f4d740f52e3948acc1d81598afcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Jun 2008 17:43:02 +0200 Subject: ftrace: build fix with gcc 4.3 fix: arch/x86/kernel/ftrace.c: Assembler messages: arch/x86/kernel/ftrace.c:82: Error: bad register name `%sil' make[1]: *** [arch/x86/kernel/ftrace.o] Error 1 Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bc5cf8d..5582814 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -88,7 +88,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, ".previous\n" _ASM_EXTABLE(1b, 3b) : "=r"(faulted), "=a"(replaced) - : "r"(ip), "r"(new), "r"(newch), + : "r"(ip), "r"(new), "c"(newch), "0"(faulted), "a"(old) : "memory"); sync_core(); -- cgit v0.10.2 From 395a59d0f8e86bb39cd700c3d185d30c670bb958 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:27 +0530 Subject: ftrace: store mcount address in rec->ip Record the address of the mcount call-site. Currently all archs except sparc64 record the address of the instruction following the mcount call-site. Some general cleanups are entailed. Storing mcount addresses in rec->ip enables looking them up in the kprobe hash table later on to check if they're kprobe'd. Signed-off-by: Abhishek Sagar Cc: davem@davemloft.net Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 3b13221..cc7b246 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * libgcc functions - functions that are used internally by the @@ -48,11 +49,6 @@ extern void __aeabi_ulcmp(void); extern void fpundefinstr(void); extern void fp_enter(void); -#ifdef CONFIG_FTRACE -extern void mcount(void); -EXPORT_SYMBOL(mcount); -#endif - /* * This has a special calling convention; it doesn't * modify any of the usual registers, except for LR. 
@@ -186,3 +182,7 @@ EXPORT_SYMBOL(_find_next_bit_be); #endif EXPORT_SYMBOL(copy_page); + +#ifdef CONFIG_FTRACE +EXPORT_SYMBOL(mcount); +#endif diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 8f79a47..84694e8 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -9,6 +9,7 @@ */ #include +#include #include #include "entry-header.S" @@ -104,6 +105,7 @@ ENTRY(ret_from_fork) ENTRY(mcount) stmdb sp!, {r0-r3, lr} mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE .globl mcount_call mcount_call: @@ -114,6 +116,7 @@ ENTRY(ftrace_caller) stmdb sp!, {r0-r3, lr} ldr r1, [fp, #-4] mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: @@ -134,6 +137,7 @@ ENTRY(mcount) trace: ldr r1, [fp, #-4] mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE mov lr, pc mov pc, r2 ldmia sp!, {r0-r3, pc} diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 22f3d6e..76d50e6 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -12,9 +12,10 @@ */ #include + #include +#include -#define INSN_SIZE 4 #define PC_OFFSET 8 #define BL_OPCODE 0xeb000000 #define BL_OFFSET_MASK 0x00ffffff @@ -32,10 +33,10 @@ unsigned char *ftrace_call_replace(unsigned long pc, unsigned long addr) { long offset; - offset = (long)addr - (long)(pc - INSN_SIZE + PC_OFFSET); + offset = (long)addr - (long)(pc + PC_OFFSET); if (unlikely(offset < -33554432 || offset > 33554428)) { /* Can't generate branches that far (from ARM ARM). Ftrace - * doesn't generate branches outside of core kernel text. + * doesn't generate branches outside of kernel text. */ WARN_ON_ONCE(1); return NULL; @@ -52,7 +53,6 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code, old = *(unsigned long *)old_code; new = *(unsigned long *)new_code; - pc -= INSN_SIZE; __asm__ __volatile__ ( "1: ldr %1, [%2] \n" @@ -77,7 +77,7 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code, : "memory"); if (!err && (replaced == old)) - flush_icache_range(pc, pc + INSN_SIZE); + flush_icache_range(pc, pc + MCOUNT_INSN_SIZE); return err; } @@ -89,8 +89,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned char *new; pc = (unsigned long)&ftrace_call; - pc += INSN_SIZE; - memcpy(&old, &ftrace_call, INSN_SIZE); + memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(pc, (unsigned long)func); ret = ftrace_modify_code(pc, (unsigned char *)&old, new); return ret; @@ -103,8 +102,7 @@ int ftrace_mcount_set(unsigned long *data) unsigned char *new; pc = (unsigned long)&mcount_call; - pc += INSN_SIZE; - memcpy(&old, &mcount_call, INSN_SIZE); + memcpy(&old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(pc, *addr); *addr = ftrace_modify_code(pc, (unsigned char *)&old, new); return 0; diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 3b1dd29..7231a70 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -30,6 +30,7 @@ #include #include #include +#include #undef SHOW_SYSCALLS #undef SHOW_SYSCALLS_TASK @@ -1053,6 +1054,7 @@ _GLOBAL(_mcount) stw r10,40(r1) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl mcount_call mcount_call: bl ftrace_stub @@ -1090,6 +1092,7 @@ _GLOBAL(ftrace_caller) stw r10,40(r1) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: bl ftrace_stub @@ -1128,6 +1131,7 @@ _GLOBAL(_mcount) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE LOAD_REG_ADDR(r5, ftrace_trace_function) lwz r5,0(r5) diff 
--git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 2c4d9e0..2f511a9 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -31,6 +31,7 @@ #include #include #include +#include /* * System calls. @@ -879,6 +880,7 @@ _GLOBAL(_mcount) mflr r3 stdu r1, -112(r1) std r3, 128(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl mcount_call mcount_call: bl ftrace_stub @@ -895,6 +897,7 @@ _GLOBAL(ftrace_caller) stdu r1, -112(r1) std r3, 128(r1) ld r4, 16(r11) + subi r3, r3, MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: bl ftrace_stub @@ -916,7 +919,7 @@ _GLOBAL(_mcount) std r3, 128(r1) ld r4, 16(r11) - + subi r3, r3, MCOUNT_INSN_SIZE LOAD_REG_ADDR(r5,ftrace_trace_function) ld r5,0(r5) ld r5,0(r5) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index e12c593..3855ceb 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -15,8 +15,8 @@ #include #include +#include -#define CALL_BACK 4 static unsigned int ftrace_nop = 0x60000000; @@ -27,9 +27,10 @@ static unsigned int ftrace_nop = 0x60000000; # define GET_ADDR(addr) *(unsigned long *)addr #endif + static unsigned int notrace ftrace_calc_offset(long ip, long addr) { - return (int)((addr + CALL_BACK) - ip); + return (int)(addr - ip); } notrace unsigned char *ftrace_nop_replace(void) @@ -76,9 +77,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned new = *(unsigned *)new_code; int faulted = 0; - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -118,12 +116,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; int ret; - ip += CALL_BACK; - - memcpy(old, &ftrace_call, 4); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); ret = ftrace_modify_code(ip, old, new); @@ -134,16 +130,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[4], *new; - - /* ip is at the location, but modify code will subtact this */ - ip += CALL_BACK; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. 
*/ - memcpy(old, &mcount_call, 4); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c index c173731..4298d0a 100644 --- a/arch/sparc64/kernel/ftrace.c +++ b/arch/sparc64/kernel/ftrace.c @@ -5,6 +5,8 @@ #include #include +#include + static const u32 ftrace_nop = 0x01000000; notrace unsigned char *ftrace_nop_replace(void) @@ -60,9 +62,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; - memcpy(old, &ftrace_call, 4); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); return ftrace_modify_code(ip, old, new); } @@ -71,13 +73,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. */ - memcpy(old, &mcount_call, 4); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 8ac0b99..b80d982 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -53,6 +53,7 @@ #include #include #include +#include struct poll { int fd; @@ -112,7 +113,6 @@ EXPORT_SYMBOL(smp_call_function); #endif /* CONFIG_SMP */ #if defined(CONFIG_MCOUNT) -extern void _mcount(void); EXPORT_SYMBOL(_mcount); #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 04ea83c..95e6bbe 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -51,6 +51,7 @@ #include #include #include +#include #include "irq_vectors.h" /* @@ -1118,6 +1119,7 @@ ENTRY(mcount) pushl %ecx pushl %edx movl 0xc(%esp), %eax + subl $MCOUNT_INSN_SIZE, %eax .globl mcount_call mcount_call: @@ -1136,6 +1138,7 @@ ENTRY(ftrace_caller) pushl %edx movl 0xc(%esp), %eax movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call ftrace_call: @@ -1166,6 +1169,7 @@ trace: pushl %edx movl 0xc(%esp), %eax movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax call *ftrace_trace_function diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fe25e5f..b0f7308 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -51,6 +51,7 @@ #include #include #include +#include .code64 @@ -68,6 +69,7 @@ ENTRY(mcount) movq %r9, 48(%rsp) movq 0x38(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi .globl mcount_call mcount_call: @@ -99,6 +101,7 @@ ENTRY(ftrace_caller) movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi .globl ftrace_call ftrace_call: @@ -139,6 +142,7 @@ trace: movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi call *ftrace_trace_function diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5582814..ab115cd 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -17,20 +17,21 @@ #include #include +#include -#define CALL_BACK 5 /* Long is fine, even if it is only 4 bytes ;-) */ static long *ftrace_nop; union ftrace_code_union { - char code[5]; + char code[MCOUNT_INSN_SIZE]; struct { char e8; int offset; } 
__attribute__((packed)); }; + static int notrace ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); @@ -46,7 +47,7 @@ notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) static union ftrace_code_union calc; calc.e8 = 0xe8; - calc.offset = ftrace_calc_offset(ip, addr); + calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); /* * No locking needed, this must be called via kstop_machine @@ -65,9 +66,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char newch = new_code[4]; int faulted = 0; - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -102,12 +100,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[5], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; int ret; - ip += CALL_BACK; - - memcpy(old, &ftrace_call, 5); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); ret = ftrace_modify_code(ip, old, new); @@ -118,16 +114,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[5], *new; - - /* ip is at the location, but modify code will subtact this */ - ip += CALL_BACK; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. */ - memcpy(old, &mcount_call, 5); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); @@ -142,8 +135,7 @@ int __init ftrace_dyn_arch_init(void *data) ftrace_mcount_set(data); - ftrace_nop = (unsigned long *)noptable[CALL_BACK]; + ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; return 0; } - diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 29999db..dd7ebee 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,9 +1,9 @@ -#include #include #include #include #include +#include #ifdef CONFIG_FTRACE /* mcount is defined in assembly */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 122885b..16ff4bf 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,7 +1,6 @@ /* Exports for assembly files. All C exports should go in the respective C files. 
*/ -#include #include @@ -11,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_FTRACE /* mcount is defined in assembly */ diff --git a/include/asm-arm/ftrace.h b/include/asm-arm/ftrace.h new file mode 100644 index 0000000..584ef9a --- /dev/null +++ b/include/asm-arm/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_ARM_FTRACE +#define _ASM_ARM_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void mcount(void); +#endif + +#endif + +#endif /* _ASM_ARM_FTRACE */ diff --git a/include/asm-powerpc/ftrace.h b/include/asm-powerpc/ftrace.h index b1bfa70..de92132 100644 --- a/include/asm-powerpc/ftrace.h +++ b/include/asm-powerpc/ftrace.h @@ -1,6 +1,14 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ extern void _mcount(void); +#endif #endif + +#endif /* _ASM_POWERPC_FTRACE */ diff --git a/include/asm-sparc64/ftrace.h b/include/asm-sparc64/ftrace.h new file mode 100644 index 0000000..f76a40a --- /dev/null +++ b/include/asm-sparc64/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_SPARC64_FTRACE +#define _ASM_SPARC64_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void _mcount(void); +#endif + +#endif + +#endif /* _ASM_SPARC64_FTRACE */ diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h new file mode 100644 index 0000000..c184441 --- /dev/null +++ b/include/asm-x86/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_X86_FTRACE +#define _ASM_X86_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void mcount(void); +#endif + +#endif /* CONFIG_FTRACE */ + +#endif /* _ASM_X86_FTRACE */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 20e14d0..366098d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,7 +31,6 @@ int unregister_ftrace_function(struct ftrace_ops *ops); void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1); -extern void mcount(void); #else /* !CONFIG_FTRACE */ # define register_ftrace_function(ops) do { } while (0) @@ -54,7 +53,7 @@ enum { struct dyn_ftrace { struct hlist_node node; - unsigned long ip; + unsigned long ip; /* address of mcount call-site */ unsigned long flags; }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d5bcf6..f1e9e5c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,8 @@ #include #include +#include + #include "trace.h" /* ftrace_enabled is a method to turn ftrace on or off */ @@ -329,7 +331,6 @@ ftrace_record_ip(unsigned long ip) } #define FTRACE_ADDR ((long)(ftrace_caller)) -#define MCOUNT_ADDR ((long)(mcount)) static int __ftrace_replace_code(struct dyn_ftrace *rec, -- cgit v0.10.2 From 785656a41f9a9c0e843a23d1ae05d900b5158f8f Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:39 +0530 Subject: kprobes: enable clean usage of get_kprobe Allow clean use of get_kprobe() outside of core kprobe code. Ftrace makes use of get_kprobe to identify probes installed on mcount call-sites.
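The intended call pattern on the ftrace side is a simple ownership test before patching a call-site (sketch; the follow-up patches below use exactly this check):

	/* sketch: leave an mcount site untouched while a kprobe owns it */
	if (get_kprobe((void *)rec->ip))
		continue;	/* skip patching this record */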
Signed-off-by: Abhishek Sagar Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu Cc: jkenisto@us.ibm.com Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1036631..04a3556 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -259,6 +259,10 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); struct jprobe; struct kretprobe; +static inline struct kprobe *get_kprobe(void *addr) +{ + return NULL; +} static inline struct kprobe *kprobe_running(void) { return NULL; -- cgit v0.10.2 From ecea656d1d5e912d2f3d332657ea4a6d8380f891 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:53 +0530 Subject: ftrace: freeze kprobe'd records Let records identified as being kprobe'd be marked as "frozen". The trouble with records which have a kprobe installed on their mcount call-site is that they don't get updated. So if such a function which is currently being traced gets its tracing disabled due to a new filter rule (or because it was added to the notrace list) then it won't be updated and will continue being traced. This patch allows scanning of all frozen records during tracing to check if they should be traced. Signed-off-by: Abhishek Sagar Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 366098d..3121b95 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -49,6 +49,7 @@ enum { FTRACE_FL_ENABLED = (1 << 3), FTRACE_FL_NOTRACE = (1 << 4), FTRACE_FL_CONVERTED = (1 << 5), + FTRACE_FL_FROZEN = (1 << 6), }; struct dyn_ftrace { @@ -73,15 +74,18 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +extern int skip_trace(unsigned long ip); + void ftrace_disable_daemon(void); void ftrace_enable_daemon(void); #else +# define skip_trace(ip) ({ 0; }) # define ftrace_force_update() ({ 0; }) # define ftrace_set_filter(buf, len, reset) do { } while (0) # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1e9e5c..d123816 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -163,6 +163,8 @@ enum { }; static int ftrace_filtered; +static int tracing_on; +static int frozen_record_count; static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; @@ -195,6 +197,71 @@ static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; + +#ifdef CONFIG_KPROBES +static inline void freeze_record(struct dyn_ftrace *rec) +{ + if (!(rec->flags & FTRACE_FL_FROZEN)) { + rec->flags |= FTRACE_FL_FROZEN; + frozen_record_count++; + } +} + +static inline void unfreeze_record(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_FROZEN) { + rec->flags &= ~FTRACE_FL_FROZEN; + frozen_record_count--; + } +} + +static inline int record_frozen(struct dyn_ftrace *rec) +{ + return rec->flags & FTRACE_FL_FROZEN; +} +#else +# define freeze_record(rec) ({ 0; }) +# define unfreeze_record(rec) ({ 0; }) +# define record_frozen(rec) ({ 0; }) +#endif /* CONFIG_KPROBES */ + +int skip_trace(unsigned long ip) +{ + unsigned long fl; + struct dyn_ftrace *rec; + struct hlist_node *t; + struct hlist_head *head; + + if (frozen_record_count == 0) + return 0; + + head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)]; + hlist_for_each_entry_rcu(rec, t,
head, node) { + if (rec->ip == ip) { + if (record_frozen(rec)) { + if (rec->flags & FTRACE_FL_FAILED) + return 1; + + if (!(rec->flags & FTRACE_FL_CONVERTED)) + return 1; + + if (!tracing_on || !ftrace_enabled) + return 1; + + if (ftrace_filtered) { + fl = rec->flags & (FTRACE_FL_FILTER | + FTRACE_FL_NOTRACE); + if (!fl || (fl & FTRACE_FL_NOTRACE)) + return 1; + } + } + break; + } + } + + return 0; +} + static inline int ftrace_ip_in_hash(unsigned long ip, unsigned long key) { @@ -489,8 +556,11 @@ static int __ftrace_modify_code(void *data) */ __ftrace_update_code(NULL); ftrace_replace_code(1); - } else if (*command & FTRACE_DISABLE_CALLS) + tracing_on = 1; + } else if (*command & FTRACE_DISABLE_CALLS) { ftrace_replace_code(0); + tracing_on = 0; + } if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6e9dae7..9ade793 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -988,6 +988,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!tracer_enabled)) return; + if (skip_trace(ip)) + return; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; -- cgit v0.10.2 From f22f9a89ce6857d377bf22dba4c1a8cd256c5136 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:50:29 +0530 Subject: ftrace: avoid modifying kprobe'd records Avoid modifying the mcount call-site if there is a kprobe installed on it. These records are not marked as failed, however; this allows the filter rules on them to remain up-to-date. Whenever the kprobe on the corresponding record is removed, the record gets updated as normal. Signed-off-by: Abhishek Sagar Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d123816..85e8413 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -500,6 +501,10 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FAILED) continue; + /* ignore updates to this record's mcount site */ + if (get_kprobe((void *)rec->ip)) + continue; + failed = __ftrace_replace_code(rec, old, new, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; @@ -692,11 +697,11 @@ unsigned long ftrace_update_tot_cnt; static int __ftrace_update_code(void *ignore) { + int i, save_ftrace_enabled; + cycle_t start, stop; struct dyn_ftrace *p; struct hlist_node *t, *n; - int save_ftrace_enabled; - cycle_t start, stop; - int i; + struct hlist_head *head, temp_list; /* Don't be recording funcs now */ ftrace_record_suspend++; @@ -708,8 +713,11 @@ static int __ftrace_update_code(void *ignore) /* No locks needed, the machine is stopped! */ for (i = 0; i < FTRACE_HASHSIZE; i++) { + INIT_HLIST_HEAD(&temp_list); + head = &ftrace_hash[i]; + /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry_safe(p, t, n, &ftrace_hash[i], node) { + hlist_for_each_entry_safe(p, t, n, head, node) { /* Skip over failed records which have not been * freed. */ if (p->flags & FTRACE_FL_FAILED) @@ -723,6 +731,19 @@ static int __ftrace_update_code(void *ignore) if (p->flags & (FTRACE_FL_CONVERTED)) break; + /* Ignore updates to this record's mcount site. + * Reintroduce this record at the head of this + * bucket to attempt to "convert" it again if + * the kprobe on it is unregistered before the + * next run.
*/ + if (get_kprobe((void *)p->ip)) { + ftrace_del_hash(p); + INIT_HLIST_NODE(&p->node); + hlist_add_head(&p->node, &temp_list); + continue; + } + + /* convert record (i.e, patch mcount-call with NOP) */ if (ftrace_code_disable(p)) { p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; @@ -734,6 +755,12 @@ static int __ftrace_update_code(void *ignore) } } } + + hlist_for_each_entry_safe(p, t, n, &temp_list, node) { + hlist_del(&p->node); + INIT_HLIST_NODE(&p->node); + hlist_add_head(&p->node, head); + } } stop = ftrace_now(raw_smp_processor_id()); -- cgit v0.10.2 From 3e61e0c976532a542b23bbb74c8f631815171078 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 30 Jun 2008 23:48:37 +0300 Subject: mmiotrace broken in linux-next (8-bit writes only) The moment mmiotrace is enabled, I hit a NULL deref in: IP: [] __trace_special+0x17c/0x23a Call Trace: [] ftrace_special+0x6f/0x9a [] down+0x19/0x4a [] acquire_console_sem+0x42/0x58 [] con_flush_chars+0x28/0x43 [] write_chan+0x22e/0x334 [] ? default_wake_function+0x0/0xf [] tty_write+0x195/0x228 [] ? write_chan+0x0/0x334 [] vfs_write+0xae/0x137 [] sys_write+0x47/0x70 [] system_call_after_swapgs+0x7b/0x80 which means 'entry' in __trace_special() is NULL. [ mingo@elte.hu: that ftrace_special() was a leftover. ] Signed-off-by: Pekka Paalanen Cc: Steven Rostedt Cc: proski@gnu.org Cc: "Vegard Nossum" Signed-off-by: Ingo Molnar diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 1a064ad..aaaeae8 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -54,7 +54,6 @@ void down(struct semaphore *sem) { unsigned long flags; - ftrace_special(sem->count, 0, __LINE__); spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; -- cgit v0.10.2 From 760378e1497841246ea7e42abad617d8a8ac0bcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 1 Jul 2008 17:35:06 +0200 Subject: fix "ftrace: store mcount address in rec->ip" Alexander Beregalov reported this build failure: $ make CROSS_COMPILE=sparc64-unknown-linux-gnu- image modules && sudo make modules_install CHK include/linux/version.h CHK include/linux/utsrelease.h CALL scripts/checksyscalls.sh CHK include/linux/compile.h dnsdomainname: Unknown host CC arch/sparc64/kernel/sparc64_ksyms.o arch/sparc64/kernel/sparc64_ksyms.c:116: error: '_mcount' undeclared here (not in a function) cc1: warnings being treated as errors arch/sparc64/kernel/sparc64_ksyms.c:116: error: type defaults to 'int' in declaration of '_mcount' And bisected it back to: | commit 395a59d0f8e86bb39cd700c3d185d30c670bb958 | Author: Abhishek Sagar | Date: Sat Jun 21 23:47:27 2008 +0530 | | ftrace: store mcount address in rec->ip the mcount prototype is only available under CONFIG_FTRACE, extend it to CONFIG_MCOUNT as well. 
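A condensed sketch of the failure mode (illustrative only; it assumes sparc64 can have CONFIG_MCOUNT set without CONFIG_FTRACE, e.g. for stack debugging):

	/* include/asm-sparc64/ftrace.h, before this fix: */
	#ifdef CONFIG_FTRACE		/* guard is too narrow */
	extern void _mcount(void);
	#endif

	/* arch/sparc64/kernel/sparc64_ksyms.c (condensed): */
	#ifdef CONFIG_MCOUNT
	EXPORT_SYMBOL(_mcount);		/* CONFIG_MCOUNT && !CONFIG_FTRACE:
					 * '_mcount' undeclared -> build error */
	#endif
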
Reported-and-bisected-by: Alexander Beregalov Signed-off-by: Ingo Molnar diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index b80d982..49d3ea5 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -112,7 +112,7 @@ EXPORT_SYMBOL(__write_trylock); EXPORT_SYMBOL(smp_call_function); #endif /* CONFIG_SMP */ -#if defined(CONFIG_MCOUNT) +#ifdef CONFIG_MCOUNT EXPORT_SYMBOL(_mcount); #endif diff --git a/include/asm-sparc64/ftrace.h b/include/asm-sparc64/ftrace.h index f76a40a..d27716c 100644 --- a/include/asm-sparc64/ftrace.h +++ b/include/asm-sparc64/ftrace.h @@ -1,7 +1,7 @@ #ifndef _ASM_SPARC64_FTRACE #define _ASM_SPARC64_FTRACE -#ifdef CONFIG_FTRACE +#ifdef CONFIG_MCOUNT #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ -- cgit v0.10.2 From 98a05ed4bd7774f533ab185fe0bf2fdc58292d7c Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Thu, 26 Jun 2008 22:51:51 +0530 Subject: ftrace: prevent ftrace modifications while being kprobe'd, v2 add two missing chunks for ftrace+kprobe. Signed-off-by: Abhishek Sagar Cc: Steven Rostedt Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 85e8413..0f271c4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -502,8 +502,12 @@ static void ftrace_replace_code(int enable) continue; /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) + if (get_kprobe((void *)rec->ip)) { + freeze_record(rec); continue; + } else { + unfreeze_record(rec); + } failed = __ftrace_replace_code(rec, old, new, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { @@ -740,7 +744,10 @@ static int __ftrace_update_code(void *ignore) ftrace_del_hash(p); INIT_HLIST_NODE(&p->node); hlist_add_head(&p->node, &temp_list); + freeze_record(p); continue; + } else { + unfreeze_record(p); } /* convert record (i.e, patch mcount-call with NOP) */ -- cgit v0.10.2 From 007c05d4d2ce42fabd58cb54ed98e0a1714d9d86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:09 -0400 Subject: ftrace: move sched_switch enable after markers We have two markers now that are enabled on sched_switch. One that records the context switching and the other that records task wake ups. Currently we enable the tracing first and then set the markers. This causes some confusing traces: # tracer: sched_switch # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | trace-cmd-3973 [00] 115.834817: 3973:120:R + 3: 0:S trace-cmd-3973 [01] 115.834910: 3973:120:R + 6: 0:S trace-cmd-3973 [02] 115.834910: 3973:120:R + 9: 0:S trace-cmd-3973 [03] 115.834910: 3973:120:R + 12: 0:S trace-cmd-3973 [02] 115.834910: 3973:120:R + 9: 0:S -0 [02] 115.834910: 0:140:R ==> 3973:120:R Here we see that trace-cmd with PID 3973 wakes up task 9 but the next line shows the idle task doing a context switch to task 3973. 
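The inversion is an ordering problem, not a clock problem: the tracer was switched on before the marker probes were registered, so the first recorded events raced with probe setup. A sketch of the old startup order (the names match the diff below):

	/* start_sched_trace(), before this patch: */
	sched_switch_reset(tr);
	tracer_enabled = 1;		/* events recorded from here on... */
	tracing_start_cmdline_record();	/* ...while markers are still being set */
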
Moving the enabling of the tracing to _after_ the markers are set creates much saner output: # tracer: sched_switch # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | -0 [02] 7922.634225: 0:140:R ==> 4790:120:R trace-cmd-4789 [03] 7922.634225: 0:140:R + 4790:120:R Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 93a6620..cb817a2 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -227,14 +227,14 @@ void tracing_stop_cmdline_record(void) static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); - tracer_enabled = 1; tracing_start_cmdline_record(); + tracer_enabled = 1; } static void stop_sched_trace(struct trace_array *tr) { - tracing_stop_cmdline_record(); tracer_enabled = 0; + tracing_stop_cmdline_record(); } static void sched_switch_trace_init(struct trace_array *tr) -- cgit v0.10.2 From 001b6767b1d0c89e458e5ddb039245b268f569fb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:10 -0400 Subject: ftrace: define function trace nop When CONFIG_FTRACE is not enabled, the tracing_start_function_trace and tracing_stop_function_trace should be nops. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6b8bd88..df840f4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -216,8 +216,6 @@ void trace_function(struct trace_array *tr, unsigned long parent_ip, unsigned long flags); -void tracing_start_function_trace(void); -void tracing_stop_function_trace(void); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); int register_tracer(struct tracer *type); @@ -234,6 +232,14 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); +#ifdef CONFIG_FTRACE +void tracing_start_function_trace(void); +void tracing_stop_function_trace(void); +#else +# define tracing_start_function_trace() do { } while (0) +# define tracing_stop_function_trace() do { } while (0) +#endif + #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, -- cgit v0.10.2 From 1e16c0a081f6c93f04c6af784d6a160955269f91 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:11 -0400 Subject: ftrace: trace schedule After the sched_clock code has been removed from sched.c we can now trace the scheduler. The scheduler has a lot of functions that would be worth tracing. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/Makefile b/kernel/Makefile index ca2433e..4809762 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,7 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -pg -mno-spe +CFLAGS_REMOVE_sched.o = -mno-spe ifdef CONFIG_FTRACE # Do not trace debug files and internal ftrace files -- cgit v0.10.2 From b5c21b4514b38f450848feb432f7120376d01ffe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:12 -0400 Subject: ftrace: check proper config for preempt type There is no CONFIG_PREEMPT_DESKTOP. Use the proper entry CONFIG_PREEMPT.
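The resulting chain in print_trace_header() then maps each real preemption model to its label (sketch; the first branch lies outside this hunk and is assumed to test CONFIG_PREEMPT_NONE):

	#if defined(CONFIG_PREEMPT_NONE)
		"server",
	#elif defined(CONFIG_PREEMPT_VOLUNTARY)
		"desktop",
	#elif defined(CONFIG_PREEMPT)
		"preempt",
	#else
		"unknown",
	#endif
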
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ade793..f8fdb9c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1341,7 +1341,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "server", #elif defined(CONFIG_PREEMPT_VOLUNTARY) "desktop", -#elif defined(CONFIG_PREEMPT_DESKTOP) +#elif defined(CONFIG_PREEMPT) "preempt", #else "unknown", -- cgit v0.10.2 From ad591240ceadcaf41b2a88855ca5f1c77c5a0298 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:13 -0400 Subject: ftrace: start wakeup tracing after setting function tracer Enabling the wakeup tracer before enabling the function tracing causes some strange results due to the dynamic enabling of the functions. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index bf7e91c..3c8d61d 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -352,9 +352,10 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - tracer_enabled = 1; register_ftrace_function(&trace_ops); + tracer_enabled = 1; + return; fail_deprobe_wake_new: marker_probe_unregister("kernel_sched_wakeup_new", -- cgit v0.10.2 From 26bc83f4cb911a0b4dabfe23b700aaf3235f2955 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:14 -0400 Subject: ftrace: use current CPU for function startup This is more of a cleanup. Currently the function tracer initializes the tracer with whichever CPU was last used for tracing. This value isn't really useful for function tracing, but at least it should be something other than a random number. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 7ee7dcd..3121448 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -28,7 +28,10 @@ static void function_reset(struct trace_array *tr) static void start_function_trace(struct trace_array *tr) { + tr->cpu = get_cpu(); function_reset(tr); + put_cpu(); + tracing_start_cmdline_record(); tracing_start_function_trace(); } -- cgit v0.10.2 From a2bb6a3d85ef3124cd336403a95abc0540d3fbe2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:15 -0400 Subject: ftrace: add ftrace_kill_atomic It has been suggested that I add a way to disable the function tracer on an oops. This code adds an ftrace_kill_atomic(). It is not meant to be used in normal situations. It will disable the ftrace tracer, but will not perform the nice shutdown that requires scheduling.
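A usage sketch (the caller below is hypothetical, not part of this patch): code running in atomic or oops context uses the new atomic variant, while ordinary teardown keeps using ftrace_kill():

	#include <linux/ftrace.h>

	/* Hypothetical oops hook: may not schedule, so no clean shutdown. */
	static void example_oops_enter(void)
	{
		ftrace_kill_atomic();	/* disables ftrace; cannot be re-enabled */
	}
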
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3121b95..f368d04 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -89,6 +89,7 @@ void ftrace_enable_daemon(void); /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); +void ftrace_kill_atomic(void); static inline void tracer_disable(void) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0f271c4..1359632 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1602,6 +1602,21 @@ core_initcall(ftrace_dynamic_init); #endif /* CONFIG_DYNAMIC_FTRACE */ /** + * ftrace_kill_atomic - kill ftrace from critical sections + * + * This function should be used by panic code. It stops ftrace + * but in a not so nice way. If you need to simply kill ftrace + * from a non-atomic section, use ftrace_kill. + */ +void ftrace_kill_atomic(void) +{ + ftrace_disabled = 1; + ftrace_enabled = 0; + ftraced_suspend = -1; + clear_ftrace_function(); +} + +/** * ftrace_kill - totally shutdown ftrace * * This is a safety measure. If something was detected that seems -- cgit v0.10.2 From 60bc080090e3bf6afa29c62cb25f913706551010 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:16 -0400 Subject: ftrace: separate out the function enabled variable Currently the function tracer uses the global tracer_enabled variable that is used to keep track if the tracer is enabled or not. The function tracing startup needs to be separated out, otherwise the internal happenings of the tracer startup are also recorded. This patch creates an ftrace_function_enabled variable to allow the starting of the function traces to happen after everything has been started. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8fdb9c..2e37857 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -96,6 +96,9 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; +/* function tracing enabled */ +int ftrace_function_enabled; + /* * trace_nr_entries is the number of entries that is allocated * for a buffer.
Note, the number of entries is always rounded @@ -134,6 +137,7 @@ static notrace void no_trace_init(struct trace_array *tr) { int cpu; + ftrace_function_enabled = 0; if(tr->ctrl) for_each_online_cpu(cpu) tracing_reset(tr->data[cpu]); @@ -985,7 +989,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (unlikely(!tracer_enabled)) + if (unlikely(!ftrace_function_enabled)) return; if (skip_trace(ip)) @@ -1010,11 +1014,15 @@ static struct ftrace_ops trace_ops __read_mostly = void tracing_start_function_trace(void) { + ftrace_function_enabled = 0; register_ftrace_function(&trace_ops); + if (tracer_enabled) + ftrace_function_enabled = 1; } void tracing_stop_function_trace(void) { + ftrace_function_enabled = 0; unregister_ftrace_function(&trace_ops); } #endif @@ -1850,8 +1858,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) m->private = iter; /* stop the trace while dumping */ - if (iter->tr->ctrl) + if (iter->tr->ctrl) { tracer_enabled = 0; + ftrace_function_enabled = 0; + } if (iter->trace && iter->trace->open) iter->trace->open(iter); @@ -1884,8 +1894,14 @@ int tracing_release(struct inode *inode, struct file *file) iter->trace->close(iter); /* reenable tracing if it was previously enabled */ - if (iter->tr->ctrl) + if (iter->tr->ctrl) { tracer_enabled = 1; + /* + * It is safe to enable function tracing even if it + * isn't used + */ + ftrace_function_enabled = 1; + } mutex_unlock(&trace_types_lock); seq_release(inode, file); -- cgit v0.10.2 From b2613e370dbeb69edbff989382fa54f2395aa471 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Jul 2008 16:44:27 +0200 Subject: ftrace: build fix for ftraced_suspend fix: kernel/trace/ftrace.c:1615: error: 'ftraced_suspend' undeclared (first use in this function) kernel/trace/ftrace.c:1615: error: (Each undeclared identifier is reported only once kernel/trace/ftrace.c:1615: error: for each function it appears in.) Signed-off-by: Ingo Molnar diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1359632..4231a3d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1612,7 +1612,9 @@ void ftrace_kill_atomic(void) { ftrace_disabled = 1; ftrace_enabled = 0; +#ifdef CONFIG_DYNAMIC_FTRACE ftraced_suspend = -1; +#endif clear_ftrace_function(); } -- cgit v0.10.2