summaryrefslogtreecommitdiff
path: root/Documentation/trace
diff options
context:
space:
mode:
authorMichal Marek <mmarek@suse.cz>2010-10-27 22:15:57 (GMT)
committerMichal Marek <mmarek@suse.cz>2010-10-27 22:15:57 (GMT)
commitb74b953b998bcc2db91b694446f3a2619ec32de6 (patch)
tree6ce24caabd730f6ae9287ed0676ec32e6ff31e9d /Documentation/trace
parentabb438526201c6a79949ad45375c051b6681c253 (diff)
parentf6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)
downloadlinux-fsl-qoriq-b74b953b998bcc2db91b694446f3a2619ec32de6.tar.xz
Merge commit 'v2.6.36' into kbuild/misc
Update to be able to fix a recent change to scripts/basic/docproc.c (commit eda603f).
Diffstat (limited to 'Documentation/trace')
-rw-r--r--Documentation/trace/events.txt11
-rw-r--r--Documentation/trace/ftrace-design.txt182
-rw-r--r--Documentation/trace/ftrace.txt54
-rw-r--r--Documentation/trace/kmemtrace.txt126
-rw-r--r--Documentation/trace/kprobetrace.txt61
-rw-r--r--Documentation/trace/postprocess/trace-vmscan-postprocess.pl686
6 files changed, 949 insertions, 171 deletions
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 02ac6ed..09bd8e9 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option:
trace_event=[event-list]
-The format of this boot option is the same as described in section 2.1.
+event-list is a comma separated list of events. See section 2.1 for event
+format.
3. Defining an event-enabled tracepoint
=======================================
@@ -238,7 +239,7 @@ subsystem's filter file.
For convenience, filters for every event in a subsystem can be set or
cleared as a group by writing a filter expression into the filter file
-at the root of the subsytem. Note however, that if a filter for any
+at the root of the subsystem. Note however, that if a filter for any
event within the subsystem lacks a field specified in the subsystem
filter, or if the filter can't be applied for any other reason, the
filter for that event will retain its previous setting. This can
@@ -250,7 +251,7 @@ fields can be guaranteed to propagate successfully to all events.
Here are a few subsystem filter examples that also illustrate the
above points:
-Clear the filters on all events in the sched subsytem:
+Clear the filters on all events in the sched subsystem:
# cd /sys/kernel/debug/tracing/events/sched
# echo 0 > filter
@@ -260,7 +261,7 @@ none
none
Set a filter using only common fields for all events in the sched
-subsytem (all events end up with the same filter):
+subsystem (all events end up with the same filter):
# cd /sys/kernel/debug/tracing/events/sched
# echo common_pid == 0 > filter
@@ -270,7 +271,7 @@ common_pid == 0
common_pid == 0
Attempt to set a filter using a non-common field for all events in the
-sched subsytem (all events but those that have a prev_pid field retain
+sched subsystem (all events but those that have a prev_pid field retain
their old filters):
# cd /sys/kernel/debug/tracing/events/sched
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index 239f14b..dc52bd4 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -1,5 +1,6 @@
function tracer guts
====================
+ By Mike Frysinger
Introduction
------------
@@ -12,6 +13,9 @@ Note that this focuses on architecture implementation details only. If you
want more explanation of a feature in terms of common code, review the common
ftrace.txt file.
+Ideally, everyone who wishes to retain performance while supporting tracing in
+their kernel should make it all the way to dynamic ftrace support.
+
Prerequisites
-------------
@@ -173,14 +177,16 @@ void ftrace_graph_caller(void)
unsigned long *frompc = &...;
unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
- prepare_ftrace_return(frompc, selfpc);
+ /* passing frame pointer up is optional -- see below */
+ prepare_ftrace_return(frompc, selfpc, frame_pointer);
/* restore all state needed by the ABI */
}
#endif
-For information on how to implement prepare_ftrace_return(), simply look at
-the x86 version. The only architecture-specific piece in it is the setup of
+For information on how to implement prepare_ftrace_return(), simply look at the
+x86 version (the frame pointer passing is optional; see the next section for
+more information). The only architecture-specific piece in it is the setup of
the fault recovery table (the asm(...) code). The rest should be the same
across architectures.
@@ -205,6 +211,23 @@ void return_to_handler(void)
#endif
+HAVE_FUNCTION_GRAPH_FP_TEST
+---------------------------
+
+An arch may pass in a unique value (frame pointer) to both the entering and
+exiting of a function. On exit, the value is compared and if it does not
+match, then it will panic the kernel. This is largely a sanity check for bad
+code generation with gcc. If gcc for your port sanely updates the frame
+pointer under different optimization levels, then ignore this option.
+
+However, adding support for it isn't terribly difficult. In your assembly code
+that calls prepare_ftrace_return(), pass the frame pointer as the 3rd argument.
+Then in the C version of that function, do what the x86 port does and pass it
+along to ftrace_push_return_trace() instead of a stub value of 0.
+
+Similarly, when you call ftrace_return_to_handler(), pass it the frame pointer.
+
+
HAVE_FTRACE_NMI_ENTER
---------------------
@@ -214,15 +237,14 @@ If you can't trace NMI functions, then skip this option.
HAVE_SYSCALL_TRACEPOINTS
----------------------
+------------------------
You need very few things to get the syscalls tracing in an arch.
+- Support HAVE_ARCH_TRACEHOOK (see arch/Kconfig).
- Have a NR_syscalls variable in <asm/unistd.h> that provides the number
of syscalls supported by the arch.
-- Implement arch_syscall_addr() that resolves a syscall address from a
- syscall number.
-- Support the TIF_SYSCALL_TRACEPOINT thread flags
+- Support the TIF_SYSCALL_TRACEPOINT thread flags.
- Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace
in the ptrace syscalls tracing path.
- Tag this arch as HAVE_SYSCALL_TRACEPOINTS.
@@ -231,12 +253,152 @@ You need very few things to get the syscalls tracing in an arch.
HAVE_FTRACE_MCOUNT_RECORD
-------------------------
-See scripts/recordmcount.pl for more info.
+See scripts/recordmcount.pl for more info. Just fill in the arch-specific
+details for how to locate the addresses of mcount call sites via objdump.
+This option doesn't make much sense without also implementing dynamic ftrace.
+
+HAVE_DYNAMIC_FTRACE
+-------------------
+
+You will first need HAVE_FTRACE_MCOUNT_RECORD and HAVE_FUNCTION_TRACER, so
+scroll your reader back up if you got over eager.
+
+Once those are out of the way, you will need to implement:
+ - asm/ftrace.h:
+ - MCOUNT_ADDR
+ - ftrace_call_adjust()
+ - struct dyn_arch_ftrace{}
+ - asm code:
+ - mcount() (new stub)
+ - ftrace_caller()
+ - ftrace_call()
+ - ftrace_stub()
+ - C code:
+ - ftrace_dyn_arch_init()
+ - ftrace_make_nop()
+ - ftrace_make_call()
+ - ftrace_update_ftrace_func()
+
+First you will need to fill out some arch details in your asm/ftrace.h.
+
+Define MCOUNT_ADDR as the address of your mcount symbol similar to:
+ #define MCOUNT_ADDR ((unsigned long)mcount)
+Since no one else will have a decl for that function, you will need to:
+ extern void mcount(void);
+
+You will also need the helper function ftrace_call_adjust(). Most people
+will be able to stub it out like so:
+ static inline unsigned long ftrace_call_adjust(unsigned long addr)
+ {
+ return addr;
+ }
<details to be filled>
+Lastly you will need the custom dyn_arch_ftrace structure. If you need
+some extra state when runtime patching arbitrary call sites, this is the
+place. For now though, create an empty struct:
+ struct dyn_arch_ftrace {
+ /* No extra data needed */
+ };
+
+With the header out of the way, we can fill out the assembly code. While we
+did already create a mcount() function earlier, dynamic ftrace only wants a
+stub function. This is because the mcount() will only be used during boot
+and then all references to it will be patched out never to return. Instead,
+the guts of the old mcount() will be used to create a new ftrace_caller()
+function. Because the two are hard to merge, it will most likely be a lot
+easier to have two separate definitions split up by #ifdefs. Same goes for
+the ftrace_stub() as that will now be inlined in ftrace_caller().
+
+Before we get confused anymore, let's check out some pseudo code so you can
+implement your own stuff in assembly:
-HAVE_DYNAMIC_FTRACE
----------------------
+void mcount(void)
+{
+ return;
+}
+
+void ftrace_caller(void)
+{
+ /* implement HAVE_FUNCTION_TRACE_MCOUNT_TEST if you desire */
+
+ /* save all state needed by the ABI (see paragraph above) */
+
+ unsigned long frompc = ...;
+ unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
+
+ftrace_call:
+ ftrace_stub(frompc, selfpc);
+
+ /* restore all state needed by the ABI */
+
+ftrace_stub:
+ return;
+}
+
+This might look a little odd at first, but keep in mind that we will be runtime
+patching multiple things. First, only functions that we actually want to trace
+will be patched to call ftrace_caller(). Second, since we only have one tracer
+active at a time, we will patch the ftrace_caller() function itself to call the
+specific tracer in question. That is the point of the ftrace_call label.
+
+With that in mind, let's move on to the C code that will actually be doing the
+runtime patching. You'll need a little knowledge of your arch's opcodes in
+order to make it through the next section.
+
+Every arch has an init callback function. If you need to do something early on
+to initialize some state, this is the time to do that. Otherwise, this simple
+function below should be sufficient for most people:
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+ /* return value is done indirectly via data */
+ *(unsigned long *)data = 0;
+
+ return 0;
+}
+
+There are two functions that are used to do runtime patching of arbitrary
+functions. The first is used to turn the mcount call site into a nop (which
+is what helps us retain runtime performance when not tracing). The second is
+used to turn the mcount call site into a call to an arbitrary location (but
+typically that is ftracer_caller()). See the general function definition in
+linux/ftrace.h for the functions:
+ ftrace_make_nop()
+ ftrace_make_call()
+The rec->ip value is the address of the mcount call site that was collected
+by the scripts/recordmcount.pl during build time.
+
+The last function is used to do runtime patching of the active tracer. This
+will be modifying the assembly code at the location of the ftrace_call symbol
+inside of the ftrace_caller() function. So you should have sufficient padding
+at that location to support the new function calls you'll be inserting. Some
+people will be using a "call" type instruction while others will be using a
+"branch" type instruction. Specifically, the function is:
+ ftrace_update_ftrace_func()
+
+
+HAVE_DYNAMIC_FTRACE + HAVE_FUNCTION_GRAPH_TRACER
+------------------------------------------------
+
+The function grapher needs a few tweaks in order to work with dynamic ftrace.
+Basically, you will need to:
+ - update:
+ - ftrace_caller()
+ - ftrace_graph_call()
+ - ftrace_graph_caller()
+ - implement:
+ - ftrace_enable_ftrace_graph_caller()
+ - ftrace_disable_ftrace_graph_caller()
<details to be filled>
+Quick notes:
+ - add a nop stub after the ftrace_call location named ftrace_graph_call;
+ stub needs to be large enough to support a call to ftrace_graph_caller()
+ - update ftrace_graph_caller() to work with being called by the new
+ ftrace_caller() since some semantics may have changed
+ - ftrace_enable_ftrace_graph_caller() will runtime patch the
+ ftrace_graph_call location with a call to ftrace_graph_caller()
+ - ftrace_disable_ftrace_graph_caller() will runtime patch the
+ ftrace_graph_call location with nops
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 8179692..557c1ed 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files:
to be traced. Echoing names of functions into this file
will limit the trace to only those functions.
+ This interface also allows for commands to be used. See the
+ "Filter commands" section for more details.
+
set_ftrace_notrace:
This has an effect opposite to that of
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
can either use the sysctl function or set it via the proc system
interface.
- sysctl kernel.ftrace_dump_on_oops=1
+ sysctl kernel.ftrace_dump_on_oops=n
or
- echo 1 > /proc/sys/kernel/ftrace_dump_on_oops
+ echo n > /proc/sys/kernel/ftrace_dump_on_oops
+If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will
+only dump the buffer of the CPU that triggered the oops.
Here's an example of such a dump after a null pointer
dereference in a kernel module:
@@ -1588,7 +1593,7 @@ module author does not need to worry about it.
When tracing is enabled, kstop_machine is called to prevent
races with the CPUS executing code being modified (which can
-cause the CPU to do undesireable things), and the nops are
+cause the CPU to do undesirable things), and the nops are
patched back to calls. But this time, they do not call mcount
(which is just a function stub). They now call into the ftrace
infrastructure.
@@ -1625,7 +1630,7 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt:
# echo sys_nanosleep hrtimer_interrupt \
> set_ftrace_filter
- # echo ftrace > current_tracer
+ # echo function > current_tracer
# echo 1 > tracing_enabled
# usleep 1
# echo 0 > tracing_enabled
@@ -1822,6 +1827,47 @@ this special filter via:
echo > set_graph_function
+Filter commands
+---------------
+
+A few commands are supported by the set_ftrace_filter interface.
+Trace commands have the following format:
+
+<function>:<command>:<parameter>
+
+The following commands are supported:
+
+- mod
+ This command enables function filtering per module. The
+ parameter defines the module. For example, if only the write*
+ functions in the ext3 module are desired, run:
+
+ echo 'write*:mod:ext3' > set_ftrace_filter
+
+ This command interacts with the filter in the same way as
+ filtering based on function names. Thus, adding more functions
+ in a different module is accomplished by appending (>>) to the
+ filter file. Remove specific module functions by prepending
+ '!':
+
+ echo '!writeback*:mod:ext3' >> set_ftrace_filter
+
+- traceon/traceoff
+ These commands turn tracing on and off when the specified
+ functions are hit. The parameter determines how many times the
+ tracing system is turned on and off. If unspecified, there is
+ no limit. For example, to disable tracing when a schedule bug
+ is hit the first 5 times, run:
+
+ echo '__schedule_bug:traceoff:5' > set_ftrace_filter
+
+ These commands are cumulative whether or not they are appended
+ to set_ftrace_filter. To remove a command, prepend it by '!'
+ and drop the parameter:
+
+ echo '!__schedule_bug:traceoff' > set_ftrace_filter
+
+
trace_pipe
----------
diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
deleted file mode 100644
index 6308735..0000000
--- a/Documentation/trace/kmemtrace.txt
+++ /dev/null
@@ -1,126 +0,0 @@
- kmemtrace - Kernel Memory Tracer
-
- by Eduard - Gabriel Munteanu
- <eduard.munteanu@linux360.ro>
-
-I. Introduction
-===============
-
-kmemtrace helps kernel developers figure out two things:
-1) how different allocators (SLAB, SLUB etc.) perform
-2) how kernel code allocates memory and how much
-
-To do this, we trace every allocation and export information to the userspace
-through the relay interface. We export things such as the number of requested
-bytes, the number of bytes actually allocated (i.e. including internal
-fragmentation), whether this is a slab allocation or a plain kmalloc() and so
-on.
-
-The actual analysis is performed by a userspace tool (see section III for
-details on where to get it from). It logs the data exported by the kernel,
-processes it and (as of writing this) can provide the following information:
-- the total amount of memory allocated and fragmentation per call-site
-- the amount of memory allocated and fragmentation per allocation
-- total memory allocated and fragmentation in the collected dataset
-- number of cross-CPU allocation and frees (makes sense in NUMA environments)
-
-Moreover, it can potentially find inconsistent and erroneous behavior in
-kernel code, such as using slab free functions on kmalloc'ed memory or
-allocating less memory than requested (but not truly failed allocations).
-
-kmemtrace also makes provisions for tracing on some arch and analysing the
-data on another.
-
-II. Design and goals
-====================
-
-kmemtrace was designed to handle rather large amounts of data. Thus, it uses
-the relay interface to export whatever is logged to userspace, which then
-stores it. Analysis and reporting is done asynchronously, that is, after the
-data is collected and stored. By design, it allows one to log and analyse
-on different machines and different arches.
-
-As of writing this, the ABI is not considered stable, though it might not
-change much. However, no guarantees are made about compatibility yet. When
-deemed stable, the ABI should still allow easy extension while maintaining
-backward compatibility. This is described further in Documentation/ABI.
-
-Summary of design goals:
- - allow logging and analysis to be done across different machines
- - be fast and anticipate usage in high-load environments (*)
- - be reasonably extensible
- - make it possible for GNU/Linux distributions to have kmemtrace
- included in their repositories
-
-(*) - one of the reasons Pekka Enberg's original userspace data analysis
- tool's code was rewritten from Perl to C (although this is more than a
- simple conversion)
-
-
-III. Quick usage guide
-======================
-
-1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable
-CONFIG_KMEMTRACE).
-
-2) Get the userspace tool and build it:
-$ git clone git://repo.or.cz/kmemtrace-user.git # current repository
-$ cd kmemtrace-user/
-$ ./autogen.sh
-$ ./configure
-$ make
-
-3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the
-'single' runlevel (so that relay buffers don't fill up easily), and run
-kmemtrace:
-# '$' does not mean user, but root here.
-$ mount -t debugfs none /sys/kernel/debug
-$ mount -t proc none /proc
-$ cd path/to/kmemtrace-user/
-$ ./kmemtraced
-Wait a bit, then stop it with CTRL+C.
-$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't
- # overrun, should
- # be zero.
-$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to
- check its correctness]
-$ ./kmemtrace-report
-
-Now you should have a nice and short summary of how the allocator performs.
-
-IV. FAQ and known issues
-========================
-
-Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix
-this? Should I worry?
-A: If it's non-zero, this affects kmemtrace's accuracy, depending on how
-large the number is. You can fix it by supplying a higher
-'kmemtrace.subbufs=N' kernel parameter.
----
-
-Q: kmemtrace_check reports errors, how do I fix this? Should I worry?
-A: This is a bug and should be reported. It can occur for a variety of
-reasons:
- - possible bugs in relay code
- - possible misuse of relay by kmemtrace
- - timestamps being collected unorderly
-Or you may fix it yourself and send us a patch.
----
-
-Q: kmemtrace_report shows many errors, how do I fix this? Should I worry?
-A: This is a known issue and I'm working on it. These might be true errors
-in kernel code, which may have inconsistent behavior (e.g. allocating memory
-with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed
-out this behavior may work with SLAB, but may fail with other allocators.
-
-It may also be due to lack of tracing in some unusual allocator functions.
-
-We don't want bug reports regarding this issue yet.
----
-
-V. See also
-===========
-
-Documentation/kernel-parameters.txt
-Documentation/ABI/testing/debugfs-kmemtrace
-
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 47aabee..5f77d94 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -24,6 +24,7 @@ Synopsis of kprobe_events
-------------------------
p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe
r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe
+ -:[GRP/]EVENT : Clear a probe
GRP : Group name. If omitted, use "kprobes" for it.
EVENT : Event name. If omitted, the event name is generated
@@ -37,15 +38,14 @@ Synopsis of kprobe_events
@SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
$stackN : Fetch Nth entry of stack (N >= 0)
$stack : Fetch stack address.
- $argN : Fetch function argument. (N >= 0)(*)
- $retval : Fetch return value.(**)
- +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
- NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
+ $retval : Fetch return value.(*)
+ +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
+ NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
+ FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
+ (u8/u16/u32/u64/s8/s16/s32/s64) and string are supported.
- (*) aN may not correct on asmlinkaged functions and at the middle of
- function body.
- (**) only for return probe.
- (***) this is useful for fetching a field of data structures.
+ (*) only for return probe.
+ (**) this is useful for fetching a field of data structures.
Per-Probe Event Filtering
@@ -82,13 +82,16 @@ Usage examples
To add a probe as a new event, write a new definition to kprobe_events
as below.
- echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events
+ echo 'p:myprobe do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack)' > /sys/kernel/debug/tracing/kprobe_events
This sets a kprobe on the top of do_sys_open() function with recording
-1st to 4th arguments as "myprobe" event. As this example shows, users can
-choose more familiar names for each arguments.
+1st to 4th arguments as "myprobe" event. Note, which register/stack entry is
+assigned to each function argument depends on arch-specific ABI. If you unsure
+the ABI, please try to use probe subcommand of perf-tools (you can find it
+under tools/perf/).
+As this example shows, users can choose more familiar names for each arguments.
- echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events
+ echo 'r:myretprobe do_sys_open $retval' >> /sys/kernel/debug/tracing/kprobe_events
This sets a kretprobe on the return point of do_sys_open() function with
recording return value as "myretprobe" event.
@@ -97,23 +100,24 @@ recording return value as "myretprobe" event.
cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
name: myprobe
-ID: 75
+ID: 780
format:
- field:unsigned short common_type; offset:0; size:2;
- field:unsigned char common_flags; offset:2; size:1;
- field:unsigned char common_preempt_count; offset:3; size:1;
- field:int common_pid; offset:4; size:4;
- field:int common_tgid; offset:8; size:4;
+ field:unsigned short common_type; offset:0; size:2; signed:0;
+ field:unsigned char common_flags; offset:2; size:1; signed:0;
+ field:unsigned char common_preempt_count; offset:3; size:1;signed:0;
+ field:int common_pid; offset:4; size:4; signed:1;
+ field:int common_lock_depth; offset:8; size:4; signed:1;
- field: unsigned long ip; offset:16;tsize:8;
- field: int nargs; offset:24;tsize:4;
- field: unsigned long dfd; offset:32;tsize:8;
- field: unsigned long filename; offset:40;tsize:8;
- field: unsigned long flags; offset:48;tsize:8;
- field: unsigned long mode; offset:56;tsize:8;
+ field:unsigned long __probe_ip; offset:12; size:4; signed:0;
+ field:int __probe_nargs; offset:16; size:4; signed:1;
+ field:unsigned long dfd; offset:20; size:4; signed:0;
+ field:unsigned long filename; offset:24; size:4; signed:0;
+ field:unsigned long flags; offset:28; size:4; signed:0;
+ field:unsigned long mode; offset:32; size:4; signed:0;
-print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode
+print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->__probe_ip,
+REC->dfd, REC->filename, REC->flags, REC->mode
You can see that the event has 4 arguments as in the expressions you specified.
@@ -121,6 +125,12 @@ print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, R
This clears all probe points.
+ Or,
+
+ echo -:myprobe >> kprobe_events
+
+ This clears probe points selectively.
+
Right after definition, each event is disabled by default. For tracing these
events, you need to enable it.
@@ -146,4 +156,3 @@ events, you need to enable it.
returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel
returns from do_sys_open to sys_open+0x1b).
-
diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
new file mode 100644
index 0000000..1b55146
--- /dev/null
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -0,0 +1,686 @@
+#!/usr/bin/perl
+# This is a POC for reading the text representation of trace output related to
+# page reclaim. It makes an attempt to extract some high-level information on
+# what is going on. The accuracy of the parser may vary
+#
+# Example usage: trace-vmscan-postprocess.pl < /sys/kernel/debug/tracing/trace_pipe
+# other options
+# --read-procstat If the trace lacks process info, get it from /proc
+# --ignore-pid Aggregate processes of the same name together
+#
+# Copyright (c) IBM Corporation 2009
+# Author: Mel Gorman <mel@csn.ul.ie>
+use strict;
+use Getopt::Long;
+
+# Tracepoint events
+use constant MM_VMSCAN_DIRECT_RECLAIM_BEGIN => 1;
+use constant MM_VMSCAN_DIRECT_RECLAIM_END => 2;
+use constant MM_VMSCAN_KSWAPD_WAKE => 3;
+use constant MM_VMSCAN_KSWAPD_SLEEP => 4;
+use constant MM_VMSCAN_LRU_SHRINK_ACTIVE => 5;
+use constant MM_VMSCAN_LRU_SHRINK_INACTIVE => 6;
+use constant MM_VMSCAN_LRU_ISOLATE => 7;
+use constant MM_VMSCAN_WRITEPAGE_FILE_SYNC => 8;
+use constant MM_VMSCAN_WRITEPAGE_ANON_SYNC => 9;
+use constant MM_VMSCAN_WRITEPAGE_FILE_ASYNC => 10;
+use constant MM_VMSCAN_WRITEPAGE_ANON_ASYNC => 11;
+use constant MM_VMSCAN_WRITEPAGE_ASYNC => 12;
+use constant EVENT_UNKNOWN => 13;
+
+# Per-order events
+use constant MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER => 11;
+use constant MM_VMSCAN_WAKEUP_KSWAPD_PERORDER => 12;
+use constant MM_VMSCAN_KSWAPD_WAKE_PERORDER => 13;
+use constant HIGH_KSWAPD_REWAKEUP_PERORDER => 14;
+
+# Constants used to track state
+use constant STATE_DIRECT_BEGIN => 15;
+use constant STATE_DIRECT_ORDER => 16;
+use constant STATE_KSWAPD_BEGIN => 17;
+use constant STATE_KSWAPD_ORDER => 18;
+
+# High-level events extrapolated from tracepoints
+use constant HIGH_DIRECT_RECLAIM_LATENCY => 19;
+use constant HIGH_KSWAPD_LATENCY => 20;
+use constant HIGH_KSWAPD_REWAKEUP => 21;
+use constant HIGH_NR_SCANNED => 22;
+use constant HIGH_NR_TAKEN => 23;
+use constant HIGH_NR_RECLAIM => 24;
+use constant HIGH_NR_CONTIG_DIRTY => 25;
+
+my %perprocesspid;
+my %perprocess;
+my %last_procmap;
+my $opt_ignorepid;
+my $opt_read_procstat;
+
+my $total_wakeup_kswapd;
+my ($total_direct_reclaim, $total_direct_nr_scanned);
+my ($total_direct_latency, $total_kswapd_latency);
+my ($total_direct_writepage_file_sync, $total_direct_writepage_file_async);
+my ($total_direct_writepage_anon_sync, $total_direct_writepage_anon_async);
+my ($total_kswapd_nr_scanned, $total_kswapd_wake);
+my ($total_kswapd_writepage_file_sync, $total_kswapd_writepage_file_async);
+my ($total_kswapd_writepage_anon_sync, $total_kswapd_writepage_anon_async);
+
+# Catch sigint and exit on request
+my $sigint_report = 0;
+my $sigint_exit = 0;
+my $sigint_pending = 0;
+my $sigint_received = 0;
+sub sigint_handler {
+ my $current_time = time;
+ if ($current_time - 2 > $sigint_received) {
+ print "SIGINT received, report pending. Hit ctrl-c again to exit\n";
+ $sigint_report = 1;
+ } else {
+ if (!$sigint_exit) {
+ print "Second SIGINT received quickly, exiting\n";
+ }
+ $sigint_exit++;
+ }
+
+ if ($sigint_exit > 3) {
+ print "Many SIGINTs received, exiting now without report\n";
+ exit;
+ }
+
+ $sigint_received = $current_time;
+ $sigint_pending = 1;
+}
+$SIG{INT} = "sigint_handler";
+
+# Parse command line options
+GetOptions(
+ 'ignore-pid' => \$opt_ignorepid,
+ 'read-procstat' => \$opt_read_procstat,
+);
+
+# Defaults for dynamically discovered regex's
+my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flags=([A-Z_|]*)';
+my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
+my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
+my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
+my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
+my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) contig_taken=([0-9]*) contig_dirty=([0-9]*) contig_failed=([0-9]*)';
+my $regex_lru_shrink_inactive_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*)';
+my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
+my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
+
+# Dyanically discovered regex
+my $regex_direct_begin;
+my $regex_direct_end;
+my $regex_kswapd_wake;
+my $regex_kswapd_sleep;
+my $regex_wakeup_kswapd;
+my $regex_lru_isolate;
+my $regex_lru_shrink_inactive;
+my $regex_lru_shrink_active;
+my $regex_writepage;
+
+# Static regex used. Specified like this for readability and for use with /o
+# (process_pid) (cpus ) ( time ) (tpoint ) (details)
+my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)';
+my $regex_statname = '[-0-9]*\s\((.*)\).*';
+my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*';
+
+sub generate_traceevent_regex {
+ my $event = shift;
+ my $default = shift;
+ my $regex;
+
+ # Read the event format or use the default
+ if (!open (FORMAT, "/sys/kernel/debug/tracing/events/$event/format")) {
+ print("WARNING: Event $event format string not found\n");
+ return $default;
+ } else {
+ my $line;
+ while (!eof(FORMAT)) {
+ $line = <FORMAT>;
+ $line =~ s/, REC->.*//;
+ if ($line =~ /^print fmt:\s"(.*)".*/) {
+ $regex = $1;
+ $regex =~ s/%s/\([0-9a-zA-Z|_]*\)/g;
+ $regex =~ s/%p/\([0-9a-f]*\)/g;
+ $regex =~ s/%d/\([-0-9]*\)/g;
+ $regex =~ s/%ld/\([-0-9]*\)/g;
+ $regex =~ s/%lu/\([0-9]*\)/g;
+ }
+ }
+ }
+
+ # Can't handle the print_flags stuff but in the context of this
+ # script, it really doesn't matter
+ $regex =~ s/\(REC.*\) \? __print_flags.*//;
+
+ # Verify fields are in the right order
+ my $tuple;
+ foreach $tuple (split /\s/, $regex) {
+ my ($key, $value) = split(/=/, $tuple);
+ my $expected = shift;
+ if ($key ne $expected) {
+ print("WARNING: Format not as expected for event $event '$key' != '$expected'\n");
+ $regex =~ s/$key=\((.*)\)/$key=$1/;
+ }
+ }
+
+ if (defined shift) {
+ die("Fewer fields than expected in format");
+ }
+
+ return $regex;
+}
+
+$regex_direct_begin = generate_traceevent_regex(
+ "vmscan/mm_vmscan_direct_reclaim_begin",
+ $regex_direct_begin_default,
+ "order", "may_writepage",
+ "gfp_flags");
+$regex_direct_end = generate_traceevent_regex(
+ "vmscan/mm_vmscan_direct_reclaim_end",
+ $regex_direct_end_default,
+ "nr_reclaimed");
+$regex_kswapd_wake = generate_traceevent_regex(
+ "vmscan/mm_vmscan_kswapd_wake",
+ $regex_kswapd_wake_default,
+ "nid", "order");
+$regex_kswapd_sleep = generate_traceevent_regex(
+ "vmscan/mm_vmscan_kswapd_sleep",
+ $regex_kswapd_sleep_default,
+ "nid");
+$regex_wakeup_kswapd = generate_traceevent_regex(
+ "vmscan/mm_vmscan_wakeup_kswapd",
+ $regex_wakeup_kswapd_default,
+ "nid", "zid", "order");
+$regex_lru_isolate = generate_traceevent_regex(
+ "vmscan/mm_vmscan_lru_isolate",
+ $regex_lru_isolate_default,
+ "isolate_mode", "order",
+ "nr_requested", "nr_scanned", "nr_taken",
+ "contig_taken", "contig_dirty", "contig_failed");
+$regex_lru_shrink_inactive = generate_traceevent_regex(
+ "vmscan/mm_vmscan_lru_shrink_inactive",
+ $regex_lru_shrink_inactive_default,
+ "nid", "zid",
+ "lru",
+ "nr_scanned", "nr_reclaimed", "priority");
+$regex_lru_shrink_active = generate_traceevent_regex(
+ "vmscan/mm_vmscan_lru_shrink_active",
+ $regex_lru_shrink_active_default,
+ "nid", "zid",
+ "lru",
+ "nr_scanned", "nr_rotated", "priority");
+$regex_writepage = generate_traceevent_regex(
+ "vmscan/mm_vmscan_writepage",
+ $regex_writepage_default,
+ "page", "pfn", "flags");
+
+sub read_statline($) {
+ my $pid = $_[0];
+ my $statline;
+
+ if (open(STAT, "/proc/$pid/stat")) {
+ $statline = <STAT>;
+ close(STAT);
+ }
+
+ if ($statline eq '') {
+ $statline = "-1 (UNKNOWN_PROCESS_NAME) R 0";
+ }
+
+ return $statline;
+}
+
+sub guess_process_pid($$) {
+ my $pid = $_[0];
+ my $statline = $_[1];
+
+ if ($pid == 0) {
+ return "swapper-0";
+ }
+
+ if ($statline !~ /$regex_statname/o) {
+ die("Failed to math stat line for process name :: $statline");
+ }
+ return "$1-$pid";
+}
+
+# Convert sec.usec timestamp format
+sub timestamp_to_ms($) {
+ my $timestamp = $_[0];
+
+ my ($sec, $usec) = split (/\./, $timestamp);
+ return ($sec * 1000) + ($usec / 1000);
+}
+
+sub process_events {
+ my $traceevent;
+ my $process_pid;
+ my $cpus;
+ my $timestamp;
+ my $tracepoint;
+ my $details;
+ my $statline;
+
+ # Read each line of the event log
+EVENT_PROCESS:
+ while ($traceevent = <STDIN>) {
+ if ($traceevent =~ /$regex_traceevent/o) {
+ $process_pid = $1;
+ $timestamp = $3;
+ $tracepoint = $4;
+
+ $process_pid =~ /(.*)-([0-9]*)$/;
+ my $process = $1;
+ my $pid = $2;
+
+ if ($process eq "") {
+ $process = $last_procmap{$pid};
+ $process_pid = "$process-$pid";
+ }
+ $last_procmap{$pid} = $process;
+
+ if ($opt_read_procstat) {
+ $statline = read_statline($pid);
+ if ($opt_read_procstat && $process eq '') {
+ $process_pid = guess_process_pid($pid, $statline);
+ }
+ }
+ } else {
+ next;
+ }
+
+ # Perl Switch() sucks majorly
+ if ($tracepoint eq "mm_vmscan_direct_reclaim_begin") {
+ $timestamp = timestamp_to_ms($timestamp);
+ $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}++;
+ $perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN} = $timestamp;
+
+ $details = $5;
+ if ($details !~ /$regex_direct_begin/o) {
+ print "WARNING: Failed to parse mm_vmscan_direct_reclaim_begin as expected\n";
+ print " $details\n";
+ print " $regex_direct_begin\n";
+ next;
+ }
+ my $order = $1;
+ $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order]++;
+ $perprocesspid{$process_pid}->{STATE_DIRECT_ORDER} = $order;
+ } elsif ($tracepoint eq "mm_vmscan_direct_reclaim_end") {
+ # Count the event itself
+ my $index = $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_END};
+ $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_END}++;
+
+ # Record how long direct reclaim took this time
+ if (defined $perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN}) {
+ $timestamp = timestamp_to_ms($timestamp);
+ my $order = $perprocesspid{$process_pid}->{STATE_DIRECT_ORDER};
+ my $latency = ($timestamp - $perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN});
+ $perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index] = "$order-$latency";
+ }
+ } elsif ($tracepoint eq "mm_vmscan_kswapd_wake") {
+ $details = $5;
+ if ($details !~ /$regex_kswapd_wake/o) {
+ print "WARNING: Failed to parse mm_vmscan_kswapd_wake as expected\n";
+ print " $details\n";
+ print " $regex_kswapd_wake\n";
+ next;
+ }
+
+ my $order = $2;
+ $perprocesspid{$process_pid}->{STATE_KSWAPD_ORDER} = $order;
+ if (!$perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN}) {
+ $timestamp = timestamp_to_ms($timestamp);
+ $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}++;
+ $perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN} = $timestamp;
+ $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order]++;
+ } else {
+ $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP}++;
+ $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP_PERORDER}[$order]++;
+ }
+ } elsif ($tracepoint eq "mm_vmscan_kswapd_sleep") {
+
+ # Count the event itself
+ my $index = $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_SLEEP};
+ $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_SLEEP}++;
+
+ # Record how long kswapd was awake
+ $timestamp = timestamp_to_ms($timestamp);
+ my $order = $perprocesspid{$process_pid}->{STATE_KSWAPD_ORDER};
+ my $latency = ($timestamp - $perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN});
+ $perprocesspid{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index] = "$order-$latency";
+ $perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN} = 0;
+ } elsif ($tracepoint eq "mm_vmscan_wakeup_kswapd") {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}++;
+
+ $details = $5;
+ if ($details !~ /$regex_wakeup_kswapd/o) {
+ print "WARNING: Failed to parse mm_vmscan_wakeup_kswapd as expected\n";
+ print " $details\n";
+ print " $regex_wakeup_kswapd\n";
+ next;
+ }
+ my $order = $3;
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order]++;
+ } elsif ($tracepoint eq "mm_vmscan_lru_isolate") {
+ $details = $5;
+ if ($details !~ /$regex_lru_isolate/o) {
+ print "WARNING: Failed to parse mm_vmscan_lru_isolate as expected\n";
+ print " $details\n";
+ print " $regex_lru_isolate/o\n";
+ next;
+ }
+ my $nr_scanned = $4;
+ my $nr_contig_dirty = $7;
+ $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned;
+ $perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty;
+ } elsif ($tracepoint eq "mm_vmscan_writepage") {
+ $details = $5;
+ if ($details !~ /$regex_writepage/o) {
+ print "WARNING: Failed to parse mm_vmscan_writepage as expected\n";
+ print " $details\n";
+ print " $regex_writepage\n";
+ next;
+ }
+
+ my $flags = $3;
+ my $file = 0;
+ my $sync_io = 0;
+ if ($flags =~ /RECLAIM_WB_FILE/) {
+ $file = 1;
+ }
+ if ($flags =~ /RECLAIM_WB_SYNC/) {
+ $sync_io = 1;
+ }
+ if ($sync_io) {
+ if ($file) {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}++;
+ } else {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}++;
+ }
+ } else {
+ if ($file) {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}++;
+ } else {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}++;
+ }
+ }
+ } else {
+ $perprocesspid{$process_pid}->{EVENT_UNKNOWN}++;
+ }
+
+ if ($sigint_pending) {
+ last EVENT_PROCESS;
+ }
+ }
+}
+
+sub dump_stats {
+ my $hashref = shift;
+ my %stats = %$hashref;
+
+ # Dump per-process stats
+ my $process_pid;
+ my $max_strlen = 0;
+
+ # Get the maximum process name
+ foreach $process_pid (keys %perprocesspid) {
+ my $len = length($process_pid);
+ if ($len > $max_strlen) {
+ $max_strlen = $len;
+ }
+ }
+ $max_strlen += 2;
+
+ # Work out latencies
+ printf("\n") if !$opt_ignorepid;
+ printf("Reclaim latencies expressed as order-latency_in_ms\n") if !$opt_ignorepid;
+ foreach $process_pid (keys %stats) {
+
+ if (!$stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[0] &&
+ !$stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[0]) {
+ next;
+ }
+
+ printf "%-" . $max_strlen . "s ", $process_pid if !$opt_ignorepid;
+ my $index = 0;
+ while (defined $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index] ||
+ defined $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]) {
+
+ if ($stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) {
+ printf("%s ", $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) if !$opt_ignorepid;
+ my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]);
+ $total_direct_latency += $latency;
+ } else {
+ printf("%s ", $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]) if !$opt_ignorepid;
+ my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]);
+ $total_kswapd_latency += $latency;
+ }
+ $index++;
+ }
+ print "\n" if !$opt_ignorepid;
+ }
+
+ # Print out process activity
+ printf("\n");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "Process", "Direct", "Wokeup", "Pages", "Pages", "Pages", "Time");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "details", "Rclms", "Kswapd", "Scanned", "Sync-IO", "ASync-IO", "Stalled");
+ foreach $process_pid (keys %stats) {
+
+ if (!$stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) {
+ next;
+ }
+
+ $total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN};
+ $total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD};
+ $total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED};
+ $total_direct_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC};
+ $total_direct_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC};
+ $total_direct_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC};
+
+ $total_direct_writepage_anon_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC};
+
+ my $index = 0;
+ my $this_reclaim_delay = 0;
+ while (defined $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) {
+ my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]);
+ $this_reclaim_delay += $latency;
+ $index++;
+ }
+
+ printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8u %8.3f",
+ $process_pid,
+ $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN},
+ $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD},
+ $stats{$process_pid}->{HIGH_NR_SCANNED},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC},
+ $this_reclaim_delay / 1000);
+
+ if ($stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) {
+ print " ";
+ for (my $order = 0; $order < 20; $order++) {
+ my $count = $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order];
+ if ($count != 0) {
+ print "direct-$order=$count ";
+ }
+ }
+ }
+ if ($stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}) {
+ print " ";
+ for (my $order = 0; $order < 20; $order++) {
+ my $count = $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order];
+ if ($count != 0) {
+ print "wakeup-$order=$count ";
+ }
+ }
+ }
+ if ($stats{$process_pid}->{HIGH_NR_CONTIG_DIRTY}) {
+ print " ";
+ my $count = $stats{$process_pid}->{HIGH_NR_CONTIG_DIRTY};
+ if ($count != 0) {
+ print "contig-dirty=$count ";
+ }
+ }
+
+ print "\n";
+ }
+
+ # Print out kswapd activity
+ printf("\n");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Kswapd", "Kswapd", "Order", "Pages", "Pages", "Pages");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Instance", "Wakeups", "Re-wakeup", "Scanned", "Sync-IO", "ASync-IO");
+ foreach $process_pid (keys %stats) {
+
+ if (!$stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) {
+ next;
+ }
+
+ $total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE};
+ $total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED};
+ $total_kswapd_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC};
+ $total_kswapd_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC};
+ $total_kswapd_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC};
+ $total_kswapd_writepage_anon_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC};
+
+ printf("%-" . $max_strlen . "s %8d %10d %8u %8i %8u",
+ $process_pid,
+ $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE},
+ $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP},
+ $stats{$process_pid}->{HIGH_NR_SCANNED},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC});
+
+ if ($stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) {
+ print " ";
+ for (my $order = 0; $order < 20; $order++) {
+ my $count = $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order];
+ if ($count != 0) {
+ print "wake-$order=$count ";
+ }
+ }
+ }
+ if ($stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP}) {
+ print " ";
+ for (my $order = 0; $order < 20; $order++) {
+ my $count = $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP_PERORDER}[$order];
+ if ($count != 0) {
+ print "rewake-$order=$count ";
+ }
+ }
+ }
+ printf("\n");
+ }
+
+ # Print out summaries
+ $total_direct_latency /= 1000;
+ $total_kswapd_latency /= 1000;
+ print "\nSummary\n";
+ print "Direct reclaims: $total_direct_reclaim\n";
+ print "Direct reclaim pages scanned: $total_direct_nr_scanned\n";
+ print "Direct reclaim write file sync I/O: $total_direct_writepage_file_sync\n";
+ print "Direct reclaim write anon sync I/O: $total_direct_writepage_anon_sync\n";
+ print "Direct reclaim write file async I/O: $total_direct_writepage_file_async\n";
+ print "Direct reclaim write anon async I/O: $total_direct_writepage_anon_async\n";
+ print "Wake kswapd requests: $total_wakeup_kswapd\n";
+ printf "Time stalled direct reclaim: %-1.2f seconds\n", $total_direct_latency;
+ print "\n";
+ print "Kswapd wakeups: $total_kswapd_wake\n";
+ print "Kswapd pages scanned: $total_kswapd_nr_scanned\n";
+ print "Kswapd reclaim write file sync I/O: $total_kswapd_writepage_file_sync\n";
+ print "Kswapd reclaim write anon sync I/O: $total_kswapd_writepage_anon_sync\n";
+ print "Kswapd reclaim write file async I/O: $total_kswapd_writepage_file_async\n";
+ print "Kswapd reclaim write anon async I/O: $total_kswapd_writepage_anon_async\n";
+ printf "Time kswapd awake: %-1.2f seconds\n", $total_kswapd_latency;
+}
+
+sub aggregate_perprocesspid() {
+ my $process_pid;
+ my $process;
+ undef %perprocess;
+
+ foreach $process_pid (keys %perprocesspid) {
+ $process = $process_pid;
+ $process =~ s/-([0-9])*$//;
+ if ($process eq '') {
+ $process = "NO_PROCESS_NAME";
+ }
+
+ $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN} += $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN};
+ $perprocess{$process}->{MM_VMSCAN_KSWAPD_WAKE} += $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE};
+ $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD};
+ $perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP};
+ $perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC};
+
+ for (my $order = 0; $order < 20; $order++) {
+ $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order];
+ $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order];
+ $perprocess{$process}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order];
+
+ }
+
+ # Aggregate direct reclaim latencies
+ my $wr_index = $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_END};
+ my $rd_index = 0;
+ while (defined $perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$rd_index]) {
+ $perprocess{$process}->{HIGH_DIRECT_RECLAIM_LATENCY}[$wr_index] = $perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$rd_index];
+ $rd_index++;
+ $wr_index++;
+ }
+ $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_END} = $wr_index;
+
+ # Aggregate kswapd latencies
+ my $wr_index = $perprocess{$process}->{MM_VMSCAN_KSWAPD_SLEEP};
+ my $rd_index = 0;
+ while (defined $perprocesspid{$process_pid}->{HIGH_KSWAPD_LATENCY}[$rd_index]) {
+ $perprocess{$process}->{HIGH_KSWAPD_LATENCY}[$wr_index] = $perprocesspid{$process_pid}->{HIGH_KSWAPD_LATENCY}[$rd_index];
+ $rd_index++;
+ $wr_index++;
+ }
+ $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_END} = $wr_index;
+ }
+}
+
+sub report() {
+ if (!$opt_ignorepid) {
+ dump_stats(\%perprocesspid);
+ } else {
+ aggregate_perprocesspid();
+ dump_stats(\%perprocess);
+ }
+}
+
+# Process events or signals until neither is available
+sub signal_loop() {
+ my $sigint_processed;
+ do {
+ $sigint_processed = 0;
+ process_events();
+
+ # Handle pending signals if any
+ if ($sigint_pending) {
+ my $current_time = time;
+
+ if ($sigint_exit) {
+ print "Received exit signal\n";
+ $sigint_pending = 0;
+ }
+ if ($sigint_report) {
+ if ($current_time >= $sigint_received + 2) {
+ report();
+ $sigint_report = 0;
+ $sigint_pending = 0;
+ $sigint_processed = 1;
+ }
+ }
+ }
+ } while ($sigint_pending || $sigint_processed);
+}
+
+signal_loop();
+report();