Merge commit 'v2.6.36' into kbuild/misc

Update to be able to fix a recent change to scripts/basic/docproc.c (commit eda603f).
author: Michal Marek <mmarek@suse.cz> 2010-10-27 22:15:57 (GMT)
committer: Michal Marek <mmarek@suse.cz> 2010-10-27 22:15:57 (GMT)
commit: b74b953b998bcc2db91b694446f3a2619ec32de6 (patch)
tree: 6ce24caabd730f6ae9287ed0676ec32e6ff31e9d /Documentation/trace
parent: abb438526201c6a79949ad45375c051b6681c253 (diff)
parent: f6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)
download: linux-fsl-qoriq-b74b953b998bcc2db91b694446f3a2619ec32de6.tar.xz
6 files changed, 949 insertions, 171 deletions
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 02ac6ed..09bd8e9 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option:
 
 	trace_event=[event-list]
 
-The format of this boot option is the same as described in section 2.1.
+event-list is a comma separated list of events. See section 2.1 for event
+format.
 
 3. Defining an event-enabled tracepoint
 =======================================
@@ -238,7 +239,7 @@ subsystem's filter file.
 
 For convenience, filters for every event in a subsystem can be set or
 cleared as a group by writing a filter expression into the filter file
-at the root of the subsytem.  Note however, that if a filter for any
+at the root of the subsystem.  Note however, that if a filter for any
 event within the subsystem lacks a field specified in the subsystem
 filter, or if the filter can't be applied for any other reason, the
 filter for that event will retain its previous setting.  This can
@@ -250,7 +251,7 @@ fields can be guaranteed to propagate successfully to all events.
 Here are a few subsystem filter examples that also illustrate the
 above points:
 
-Clear the filters on all events in the sched subsytem:
+Clear the filters on all events in the sched subsystem:
 
 # cd /sys/kernel/debug/tracing/events/sched
 # echo 0 > filter
@@ -260,7 +261,7 @@ none
 none
 
 Set a filter using only common fields for all events in the sched
-subsytem (all events end up with the same filter):
+subsystem (all events end up with the same filter):
 
 # cd /sys/kernel/debug/tracing/events/sched
 # echo common_pid == 0 > filter
@@ -270,7 +271,7 @@ common_pid == 0
 common_pid == 0
 
 Attempt to set a filter using a non-common field for all events in the
-sched subsytem (all events but those that have a prev_pid field retain
+sched subsystem (all events but those that have a prev_pid field retain
 their old filters):
 
 # cd /sys/kernel/debug/tracing/events/sched
diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt
index 239f14b..dc52bd4 100644
--- a/Documentation/trace/ftrace-design.txt
+++ b/Documentation/trace/ftrace-design.txt
@@ -1,5 +1,6 @@
 		function tracer guts
 		====================
+		By Mike Frysinger
 
 Introduction
 ------------
@@ -12,6 +13,9 @@ Note that this focuses on architecture implementation details only.  If you
 want more explanation of a feature in terms of common code, review the common
 ftrace.txt file.
 
+Ideally, everyone who wishes to retain performance while supporting tracing in
+their kernel should make it all the way to dynamic ftrace support.
+
 
 Prerequisites
 -------------
@@ -173,14 +177,16 @@ void ftrace_graph_caller(void)
 
 	unsigned long *frompc = &...;
 	unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
-	prepare_ftrace_return(frompc, selfpc);
+	/* passing frame pointer up is optional -- see below */
+	prepare_ftrace_return(frompc, selfpc, frame_pointer);
 
 	/* restore all state needed by the ABI */
 }
 #endif
 
-For information on how to implement prepare_ftrace_return(), simply look at
-the x86 version.  The only architecture-specific piece in it is the setup of
+For information on how to implement prepare_ftrace_return(), simply look at the
+x86 version (the frame pointer passing is optional; see the next section for
+more information).  The only architecture-specific piece in it is the setup of
 the fault recovery table (the asm(...) code).  The rest should be the same
 across architectures.
 
@@ -205,6 +211,23 @@ void return_to_handler(void)
 #endif
 
 
+HAVE_FUNCTION_GRAPH_FP_TEST
+---------------------------
+
+An arch may pass in a unique value (frame pointer) to both the entering and
+exiting of a function.  On exit, the value is compared and if it does not
+match, then it will panic the kernel.  This is largely a sanity check for bad
+code generation with gcc.  If gcc for your port sanely updates the frame
+pointer under different optimization levels, then ignore this option.
+
+However, adding support for it isn't terribly difficult.  In your assembly code
+that calls prepare_ftrace_return(), pass the frame pointer as the 3rd argument.
+Then in the C version of that function, do what the x86 port does and pass it
+along to ftrace_push_return_trace() instead of a stub value of 0.
+
+Similarly, when you call ftrace_return_to_handler(), pass it the frame pointer.
+
+
 HAVE_FTRACE_NMI_ENTER
 ---------------------
 
@@ -214,15 +237,14 @@ If you can't trace NMI functions, then skip this option.
 
 
 HAVE_SYSCALL_TRACEPOINTS
----------------------
+------------------------
 
 You need very few things to get the syscalls tracing in an arch.
 
+- Support HAVE_ARCH_TRACEHOOK (see arch/Kconfig).
 - Have a NR_syscalls variable in <asm/unistd.h> that provides the number
   of syscalls supported by the arch.
-- Implement arch_syscall_addr() that resolves a syscall address from a
-  syscall number.
-- Support the TIF_SYSCALL_TRACEPOINT thread flags
+- Support the TIF_SYSCALL_TRACEPOINT thread flags.
 - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace
   in the ptrace syscalls tracing path.
 - Tag this arch as HAVE_SYSCALL_TRACEPOINTS.
@@ -231,12 +253,152 @@ You need very few things to get the syscalls tracing in an arch.
 HAVE_FTRACE_MCOUNT_RECORD
 -------------------------
 
-See scripts/recordmcount.pl for more info.
+See scripts/recordmcount.pl for more info.  Just fill in the arch-specific
+details for how to locate the addresses of mcount call sites via objdump.
+This option doesn't make much sense without also implementing dynamic ftrace.
 
+
+HAVE_DYNAMIC_FTRACE
+-------------------
+
+You will first need HAVE_FTRACE_MCOUNT_RECORD and HAVE_FUNCTION_TRACER, so
+scroll your reader back up if you got over eager.
+
+Once those are out of the way, you will need to implement:
+	- asm/ftrace.h:
+		- MCOUNT_ADDR
+		- ftrace_call_adjust()
+		- struct dyn_arch_ftrace{}
+	- asm code:
+		- mcount() (new stub)
+		- ftrace_caller()
+		- ftrace_call()
+		- ftrace_stub()
+	- C code:
+		- ftrace_dyn_arch_init()
+		- ftrace_make_nop()
+		- ftrace_make_call()
+		- ftrace_update_ftrace_func()
+
+First you will need to fill out some arch details in your asm/ftrace.h.
+
+Define MCOUNT_ADDR as the address of your mcount symbol similar to:
+	#define MCOUNT_ADDR ((unsigned long)mcount)
+Since no one else will have a decl for that function, you will need to:
+	extern void mcount(void);
+
+You will also need the helper function ftrace_call_adjust().  Most people
+will be able to stub it out like so:
+	static inline unsigned long ftrace_call_adjust(unsigned long addr)
+	{
+		return addr;
+	}
 <details to be filled>
 
+Lastly you will need the custom dyn_arch_ftrace structure.  If you need
+some extra state when runtime patching arbitrary call sites, this is the
+place.  For now though, create an empty struct:
+	struct dyn_arch_ftrace {
+		/* No extra data needed */
+	};
+
+With the header out of the way, we can fill out the assembly code.  While we
+did already create a mcount() function earlier, dynamic ftrace only wants a
+stub function.  This is because the mcount() will only be used during boot
+and then all references to it will be patched out never to return.  Instead,
+the guts of the old mcount() will be used to create a new ftrace_caller()
+function.  Because the two are hard to merge, it will most likely be a lot
+easier to have two separate definitions split up by #ifdefs.  Same goes for
+the ftrace_stub() as that will now be inlined in ftrace_caller().
+
+Before we get confused anymore, let's check out some pseudo code so you can
+implement your own stuff in assembly:
 
-HAVE_DYNAMIC_FTRACE
----------------------
+void mcount(void)
+{
+	return;
+}
+
+void ftrace_caller(void)
+{
+	/* implement HAVE_FUNCTION_TRACE_MCOUNT_TEST if you desire */
+
+	/* save all state needed by the ABI (see paragraph above) */
+
+	unsigned long frompc = ...;
+	unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
+
+ftrace_call:
+	ftrace_stub(frompc, selfpc);
+
+	/* restore all state needed by the ABI */
+
+ftrace_stub:
+	return;
+}
+
+This might look a little odd at first, but keep in mind that we will be runtime
+patching multiple things.  First, only functions that we actually want to trace
+will be patched to call ftrace_caller().  Second, since we only have one tracer
+active at a time, we will patch the ftrace_caller() function itself to call the
+specific tracer in question.  That is the point of the ftrace_call label.
+
+With that in mind, let's move on to the C code that will actually be doing the
+runtime patching.  You'll need a little knowledge of your arch's opcodes in
+order to make it through the next section.
+
+Every arch has an init callback function.  If you need to do something early on
+to initialize some state, this is the time to do that.  Otherwise, this simple
+function below should be sufficient for most people:
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+	/* return value is done indirectly via data */
+	*(unsigned long *)data = 0;
+
+	return 0;
+}
+
+There are two functions that are used to do runtime patching of arbitrary
+functions.  The first is used to turn the mcount call site into a nop (which
+is what helps us retain runtime performance when not tracing).  The second is
+used to turn the mcount call site into a call to an arbitrary location (but
+typically that is ftracer_caller()).  See the general function definition in
+linux/ftrace.h for the functions:
+	ftrace_make_nop()
+	ftrace_make_call()
+The rec->ip value is the address of the mcount call site that was collected
+by the scripts/recordmcount.pl during build time.
+
+The last function is used to do runtime patching of the active tracer.  This
+will be modifying the assembly code at the location of the ftrace_call symbol
+inside of the ftrace_caller() function.  So you should have sufficient padding
+at that location to support the new function calls you'll be inserting.  Some
+people will be using a "call" type instruction while others will be using a
+"branch" type instruction.  Specifically, the function is:
+	ftrace_update_ftrace_func()
+
+
+HAVE_DYNAMIC_FTRACE + HAVE_FUNCTION_GRAPH_TRACER
+------------------------------------------------
+
+The function grapher needs a few tweaks in order to work with dynamic ftrace.
+Basically, you will need to:
+	- update:
+		- ftrace_caller()
+		- ftrace_graph_call()
+		- ftrace_graph_caller()
+	- implement:
+		- ftrace_enable_ftrace_graph_caller()
+		- ftrace_disable_ftrace_graph_caller()
 
 <details to be filled>
+Quick notes:
+	- add a nop stub after the ftrace_call location named ftrace_graph_call;
+	  stub needs to be large enough to support a call to ftrace_graph_caller()
+	- update ftrace_graph_caller() to work with being called by the new
+	  ftrace_caller() since some semantics may have changed
+	- ftrace_enable_ftrace_graph_caller() will runtime patch the
+	  ftrace_graph_call location with a call to ftrace_graph_caller()
+	- ftrace_disable_ftrace_graph_caller() will runtime patch the
+	  ftrace_graph_call location with nops
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 8179692..557c1ed 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files:
 	to be traced. Echoing names of functions into this file
 	will limit the trace to only those functions.
 
+	This interface also allows for commands to be used. See the
+	"Filter commands" section for more details.
+
   set_ftrace_notrace:
 
 	This has an effect opposite to that of
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
 can either use the sysctl function or set it via the proc system
 interface.
 
-  sysctl kernel.ftrace_dump_on_oops=1
+  sysctl kernel.ftrace_dump_on_oops=n
 
 or
 
-  echo 1 > /proc/sys/kernel/ftrace_dump_on_oops
+  echo n > /proc/sys/kernel/ftrace_dump_on_oops
 
+If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will
+only dump the buffer of the CPU that triggered the oops.
 
 Here's an example of such a dump after a null pointer
 dereference in a kernel module:
@@ -1588,7 +1593,7 @@ module author does not need to worry about it.
 
 When tracing is enabled, kstop_machine is called to prevent
 races with the CPUS executing code being modified (which can
-cause the CPU to do undesireable things), and the nops are
+cause the CPU to do undesirable things), and the nops are
 patched back to calls. But this time, they do not call mcount
 (which is just a function stub). They now call into the ftrace
 infrastructure.
@@ -1625,7 +1630,7 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt:
 
  # echo sys_nanosleep hrtimer_interrupt \
 		> set_ftrace_filter
- # echo ftrace > current_tracer
+ # echo function > current_tracer
  # echo 1 > tracing_enabled
  # usleep 1
  # echo 0 > tracing_enabled
@@ -1822,6 +1827,47 @@ this special filter via:
  echo > set_graph_function
 
 
+Filter commands
+---------------
+
+A few commands are supported by the set_ftrace_filter interface.
+Trace commands have the following format:
+
+<function>:<command>:<parameter>
+
+The following commands are supported:
+
+- mod
+  This command enables function filtering per module. The
+  parameter defines the module. For example, if only the write*
+  functions in the ext3 module are desired, run:
+
+   echo 'write*:mod:ext3' > set_ftrace_filter
+
+  This command interacts with the filter in the same way as
+  filtering based on function names. Thus, adding more functions
+  in a different module is accomplished by appending (>>) to the
+  filter file. Remove specific module functions by prepending
+  '!':
+
+   echo '!writeback*:mod:ext3' >> set_ftrace_filter
+
+- traceon/traceoff
+  These commands turn tracing on and off when the specified
+  functions are hit. The parameter determines how many times the
+  tracing system is turned on and off. If unspecified, there is
+  no limit. For example, to disable tracing when a schedule bug
+  is hit the first 5 times, run:
+
+   echo '__schedule_bug:traceoff:5' > set_ftrace_filter
+
+  These commands are cumulative whether or not they are appended
+  to set_ftrace_filter. To remove a command, prepend it by '!'
+  and drop the parameter:
+
+   echo '!__schedule_bug:traceoff' > set_ftrace_filter
+
+
 trace_pipe
 ----------
 
diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
deleted file mode 100644
index 6308735..0000000
--- a/Documentation/trace/kmemtrace.txt
+++ /dev/null
@@ -1,126 +0,0 @@
-			kmemtrace - Kernel Memory Tracer
-
-			  by Eduard - Gabriel Munteanu
-			     <eduard.munteanu@linux360.ro>
-
-I. Introduction
-===============
-
-kmemtrace helps kernel developers figure out two things:
-1) how different allocators (SLAB, SLUB etc.) perform
-2) how kernel code allocates memory and how much
-
-To do this, we trace every allocation and export information to the userspace
-through the relay interface. We export things such as the number of requested
-bytes, the number of bytes actually allocated (i.e. including internal
-fragmentation), whether this is a slab allocation or a plain kmalloc() and so
-on.
-
-The actual analysis is performed by a userspace tool (see section III for
-details on where to get it from). It logs the data exported by the kernel,
-processes it and (as of writing this) can provide the following information:
-- the total amount of memory allocated and fragmentation per call-site
-- the amount of memory allocated and fragmentation per allocation
-- total memory allocated and fragmentation in the collected dataset
-- number of cross-CPU allocation and frees (makes sense in NUMA environments)
-
-Moreover, it can potentially find inconsistent and erroneous behavior in
-kernel code, such as using slab free functions on kmalloc'ed memory or
-allocating less memory than requested (but not truly failed allocations).
-
-kmemtrace also makes provisions for tracing on some arch and analysing the
-data on another.
-
-II. Design and goals
-====================
-
-kmemtrace was designed to handle rather large amounts of data. Thus, it uses
-the relay interface to export whatever is logged to userspace, which then
-stores it. Analysis and reporting is done asynchronously, that is, after the
-data is collected and stored. By design, it allows one to log and analyse
-on different machines and different arches.
-
-As of writing this, the ABI is not considered stable, though it might not
-change much. However, no guarantees are made about compatibility yet. When
-deemed stable, the ABI should still allow easy extension while maintaining
-backward compatibility. This is described further in Documentation/ABI.
-
-Summary of design goals:
-	- allow logging and analysis to be done across different machines
-	- be fast and anticipate usage in high-load environments (*)
-	- be reasonably extensible
-	- make it possible for GNU/Linux distributions to have kmemtrace
-	included in their repositories
-
-(*) - one of the reasons Pekka Enberg's original userspace data analysis
-    tool's code was rewritten from Perl to C (although this is more than a
-    simple conversion)
-
-
-III. Quick usage guide
-======================
-
-1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable
-CONFIG_KMEMTRACE).
-
-2) Get the userspace tool and build it:
-$ git clone git://repo.or.cz/kmemtrace-user.git		# current repository
-$ cd kmemtrace-user/
-$ ./autogen.sh
-$ ./configure
-$ make
-
-3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the
-'single' runlevel (so that relay buffers don't fill up easily), and run
-kmemtrace:
-# '$' does not mean user, but root here.
-$ mount -t debugfs none /sys/kernel/debug
-$ mount -t proc none /proc
-$ cd path/to/kmemtrace-user/
-$ ./kmemtraced
-Wait a bit, then stop it with CTRL+C.
-$ cat /sys/kernel/debug/kmemtrace/total_overruns	# Check if we didn't
-							# overrun, should
-							# be zero.
-$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to
-		check its correctness]
-$ ./kmemtrace-report
-
-Now you should have a nice and short summary of how the allocator performs.
-
-IV. FAQ and known issues
-========================
-
-Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix
-this? Should I worry?
-A: If it's non-zero, this affects kmemtrace's accuracy, depending on how
-large the number is. You can fix it by supplying a higher
-'kmemtrace.subbufs=N' kernel parameter.
----
-
-Q: kmemtrace_check reports errors, how do I fix this? Should I worry?
-A: This is a bug and should be reported. It can occur for a variety of
-reasons:
-	- possible bugs in relay code
-	- possible misuse of relay by kmemtrace
-	- timestamps being collected unorderly
-Or you may fix it yourself and send us a patch.
----
-
-Q: kmemtrace_report shows many errors, how do I fix this? Should I worry?
-A: This is a known issue and I'm working on it. These might be true errors
-in kernel code, which may have inconsistent behavior (e.g. allocating memory
-with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed
-out this behavior may work with SLAB, but may fail with other allocators.
-
-It may also be due to lack of tracing in some unusual allocator functions.
-
-We don't want bug reports regarding this issue yet.
----
-
-V. See also
-===========
-
-Documentation/kernel-parameters.txt
-Documentation/ABI/testing/debugfs-kmemtrace
-
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 47aabee..5f77d94 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -24,6 +24,7 @@ Synopsis of kprobe_events
 -------------------------
   p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS]	: Set a probe
   r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS]		: Set a return probe
+  -:[GRP/]EVENT						: Clear a probe
 
  GRP		: Group name. If omitted, use "kprobes" for it.
  EVENT		: Event name. If omitted, the event name is generated
@@ -37,15 +38,14 @@ Synopsis of kprobe_events
   @SYM[+|-offs]	: Fetch memory at SYM +|- offs (SYM should be a data symbol)
   $stackN	: Fetch Nth entry of stack (N >= 0)
   $stack	: Fetch stack address.
-  $argN		: Fetch function argument. (N >= 0)(*)
-  $retval	: Fetch return value.(**)
-  +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
-  NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
+  $retval	: Fetch return value.(*)
+  +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
+  NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
+  FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
+		  (u8/u16/u32/u64/s8/s16/s32/s64) and string are supported.
 
-  (*) aN may not correct on asmlinkaged functions and at the middle of
-      function body.
-  (**) only for return probe.
-  (***) this is useful for fetching a field of data structures.
+  (*) only for return probe.
+  (**) this is useful for fetching a field of data structures.
 
 
 Per-Probe Event Filtering
@@ -82,13 +82,16 @@ Usage examples
 To add a probe as a new event, write a new definition to kprobe_events
 as below.
 
-  echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events
+  echo 'p:myprobe do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack)' > /sys/kernel/debug/tracing/kprobe_events
 
  This sets a kprobe on the top of do_sys_open() function with recording
-1st to 4th arguments as "myprobe" event. As this example shows, users can
-choose more familiar names for each arguments.
+1st to 4th arguments as "myprobe" event. Note, which register/stack entry is
+assigned to each function argument depends on arch-specific ABI. If you unsure
+the ABI, please try to use probe subcommand of perf-tools (you can find it
+under tools/perf/).
+As this example shows, users can choose more familiar names for each arguments.
 
-  echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events
+  echo 'r:myretprobe do_sys_open $retval' >> /sys/kernel/debug/tracing/kprobe_events
 
  This sets a kretprobe on the return point of do_sys_open() function with
 recording return value as "myretprobe" event.
@@ -97,23 +100,24 @@ recording return value as "myretprobe" event.
 
   cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
 name: myprobe
-ID: 75
+ID: 780
 format:
-	field:unsigned short common_type;	offset:0;	size:2;
-	field:unsigned char common_flags;	offset:2;	size:1;
-	field:unsigned char common_preempt_count;	offset:3;	size:1;
-	field:int common_pid;	offset:4;	size:4;
-	field:int common_tgid;	offset:8;	size:4;
+        field:unsigned short common_type;       offset:0;       size:2; signed:0;
+        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
+        field:unsigned char common_preempt_count;       offset:3; size:1;signed:0;
+        field:int common_pid;   offset:4;       size:4; signed:1;
+        field:int common_lock_depth;    offset:8;       size:4; signed:1;
 
-	field: unsigned long ip;	offset:16;tsize:8;
-	field: int nargs;	offset:24;tsize:4;
-	field: unsigned long dfd;	offset:32;tsize:8;
-	field: unsigned long filename;	offset:40;tsize:8;
-	field: unsigned long flags;	offset:48;tsize:8;
-	field: unsigned long mode;	offset:56;tsize:8;
+        field:unsigned long __probe_ip; offset:12;      size:4; signed:0;
+        field:int __probe_nargs;        offset:16;      size:4; signed:1;
+        field:unsigned long dfd;        offset:20;      size:4; signed:0;
+        field:unsigned long filename;   offset:24;      size:4; signed:0;
+        field:unsigned long flags;      offset:28;      size:4; signed:0;
+        field:unsigned long mode;       offset:32;      size:4; signed:0;
 
-print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode
 
+print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->__probe_ip,
+REC->dfd, REC->filename, REC->flags, REC->mode
 
  You can see that the event has 4 arguments as in the expressions you specified.
 
@@ -121,6 +125,12 @@ print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, R
 
  This clears all probe points.
 
+ Or,
+
+  echo -:myprobe >> kprobe_events
+
+ This clears probe points selectively.
+
  Right after definition, each event is disabled by default. For tracing these
 events, you need to enable it.
 
@@ -146,4 +156,3 @@ events, you need to enable it.
 returns from SYMBOL(e.g. "sys_open+0x1b/0x1d <- do_sys_open" means kernel
 returns from do_sys_open to sys_open+0x1b).
 
-
diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
new file mode 100644
index 0000000..1b55146
--- /dev/null
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -0,0 +1,686 @@
+#!/usr/bin/perl
+# This is a POC for reading the text representation of trace output related to
+# page reclaim. It makes an attempt to extract some high-level information on
+# what is going on. The accuracy of the parser may vary
+#
+# Example usage: trace-vmscan-postprocess.pl < /sys/kernel/debug/tracing/trace_pipe
+# other options
+#   --read-procstat	If the trace lacks process info, get it from /proc
+#   --ignore-pid	Aggregate processes of the same name together
+#
+# Copyright (c) IBM Corporation 2009
+# Author: Mel Gorman <mel@csn.ul.ie>
+use strict;
+use Getopt::Long;
+
+# Tracepoint events
+use constant MM_VMSCAN_DIRECT_RECLAIM_BEGIN	=> 1;
+use constant MM_VMSCAN_DIRECT_RECLAIM_END	=> 2;
+use constant MM_VMSCAN_KSWAPD_WAKE		=> 3;
+use constant MM_VMSCAN_KSWAPD_SLEEP		=> 4;
+use constant MM_VMSCAN_LRU_SHRINK_ACTIVE	=> 5;
+use constant MM_VMSCAN_LRU_SHRINK_INACTIVE	=> 6;
+use constant MM_VMSCAN_LRU_ISOLATE		=> 7;
+use constant MM_VMSCAN_WRITEPAGE_FILE_SYNC	=> 8;
+use constant MM_VMSCAN_WRITEPAGE_ANON_SYNC	=> 9;
+use constant MM_VMSCAN_WRITEPAGE_FILE_ASYNC	=> 10;
+use constant MM_VMSCAN_WRITEPAGE_ANON_ASYNC	=> 11;
+use constant MM_VMSCAN_WRITEPAGE_ASYNC		=> 12;
+use constant EVENT_UNKNOWN			=> 13;
+
+# Per-order events
+use constant MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER => 11;
+use constant MM_VMSCAN_WAKEUP_KSWAPD_PERORDER 	=> 12;
+use constant MM_VMSCAN_KSWAPD_WAKE_PERORDER	=> 13;
+use constant HIGH_KSWAPD_REWAKEUP_PERORDER	=> 14;
+
+# Constants used to track state
+use constant STATE_DIRECT_BEGIN 		=> 15;
+use constant STATE_DIRECT_ORDER 		=> 16;
+use constant STATE_KSWAPD_BEGIN			=> 17;
+use constant STATE_KSWAPD_ORDER			=> 18;
+
+# High-level events extrapolated from tracepoints
+use constant HIGH_DIRECT_RECLAIM_LATENCY	=> 19;
+use constant HIGH_KSWAPD_LATENCY		=> 20;
+use constant HIGH_KSWAPD_REWAKEUP		=> 21;
+use constant HIGH_NR_SCANNED			=> 22;
+use constant HIGH_NR_TAKEN			=> 23;
+use constant HIGH_NR_RECLAIM			=> 24;
+use constant HIGH_NR_CONTIG_DIRTY		=> 25;
+
+my %perprocesspid;
+my %perprocess;
+my %last_procmap;
+my $opt_ignorepid;
+my $opt_read_procstat;
+
+my $total_wakeup_kswapd;
+my ($total_direct_reclaim, $total_direct_nr_scanned);
+my ($total_direct_latency, $total_kswapd_latency);
+my ($total_direct_writepage_file_sync, $total_direct_writepage_file_async);
+my ($total_direct_writepage_anon_sync, $total_direct_writepage_anon_async);
+my ($total_kswapd_nr_scanned, $total_kswapd_wake);
+my ($total_kswapd_writepage_file_sync, $total_kswapd_writepage_file_async);
+my ($total_kswapd_writepage_anon_sync, $total_kswapd_writepage_anon_async);
+
+# Catch sigint and exit on request
+my $sigint_report = 0;
+my $sigint_exit = 0;
+my $sigint_pending = 0;
+my $sigint_received = 0;
+sub sigint_handler {
+	my $current_time = time;
+	if ($current_time - 2 > $sigint_received) {
+		print "SIGINT received, report pending. Hit ctrl-c again to exit\n";
+		$sigint_report = 1;
+	} else {
+		if (!$sigint_exit) {
+			print "Second SIGINT received quickly, exiting\n";
+		}
+		$sigint_exit++;
+	}
+
+	if ($sigint_exit > 3) {
+		print "Many SIGINTs received, exiting now without report\n";
+		exit;
+	}
+
+	$sigint_received = $current_time;
+	$sigint_pending = 1;
+}
+$SIG{INT} = "sigint_handler";
+
+# Parse command line options
+GetOptions(
+	'ignore-pid'	 =>	\$opt_ignorepid,
+	'read-procstat'	 =>	\$opt_read_procstat,
+);
+
+# Defaults for dynamically discovered regex's
+my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flags=([A-Z_|]*)';
+my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
+my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
+my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
+my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
+my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) contig_taken=([0-9]*) contig_dirty=([0-9]*) contig_failed=([0-9]*)';
+my $regex_lru_shrink_inactive_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*)';
+my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
+my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
+
+# Dyanically discovered regex
+my $regex_direct_begin;
+my $regex_direct_end;
+my $regex_kswapd_wake;
+my $regex_kswapd_sleep;
+my $regex_wakeup_kswapd;
+my $regex_lru_isolate;
+my $regex_lru_shrink_inactive;
+my $regex_lru_shrink_active;
+my $regex_writepage;
+
+# Static regex used. Specified like this for readability and for use with /o
+#                      (process_pid)     (cpus      )   ( time  )   (tpoint    ) (details)
+my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)';
+my $regex_statname = '[-0-9]*\s\((.*)\).*';
+my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*';
+
+sub generate_traceevent_regex {
+	my $event = shift;
+	my $default = shift;
+	my $regex;
+
+	# Read the event format or use the default
+	if (!open (FORMAT, "/sys/kernel/debug/tracing/events/$event/format")) {
+		print("WARNING: Event $event format string not found\n");
+		return $default;
+	} else {
+		my $line;
+		while (!eof(FORMAT)) {
+			$line = <FORMAT>;
+			$line =~ s/, REC->.*//;
+			if ($line =~ /^print fmt:\s"(.*)".*/) {
+				$regex = $1;
+				$regex =~ s/%s/\([0-9a-zA-Z|_]*\)/g;
+				$regex =~ s/%p/\([0-9a-f]*\)/g;
+				$regex =~ s/%d/\([-0-9]*\)/g;
+				$regex =~ s/%ld/\([-0-9]*\)/g;
+				$regex =~ s/%lu/\([0-9]*\)/g;
+			}
+		}
+	}
+
+	# Can't handle the print_flags stuff but in the context of this
+	# script, it really doesn't matter
+	$regex =~ s/\(REC.*\) \? __print_flags.*//;
+
+	# Verify fields are in the right order
+	my $tuple;
+	foreach $tuple (split /\s/, $regex) {
+		my ($key, $value) = split(/=/, $tuple);
+		my $expected = shift;
+		if ($key ne $expected) {
+			print("WARNING: Format not as expected for event $event '$key' != '$expected'\n");
+			$regex =~ s/$key=\((.*)\)/$key=$1/;
+		}
+	}
+
+	if (defined shift) {
+		die("Fewer fields than expected in format");
+	}
+
+	return $regex;
+}
+
+$regex_direct_begin = generate_traceevent_regex(
+			"vmscan/mm_vmscan_direct_reclaim_begin",
+			$regex_direct_begin_default,
+			"order", "may_writepage",
+			"gfp_flags");
+$regex_direct_end = generate_traceevent_regex(
+			"vmscan/mm_vmscan_direct_reclaim_end",
+			$regex_direct_end_default,
+			"nr_reclaimed");
+$regex_kswapd_wake = generate_traceevent_regex(
+			"vmscan/mm_vmscan_kswapd_wake",
+			$regex_kswapd_wake_default,
+			"nid", "order");
+$regex_kswapd_sleep = generate_traceevent_regex(
+			"vmscan/mm_vmscan_kswapd_sleep",
+			$regex_kswapd_sleep_default,
+			"nid");
+$regex_wakeup_kswapd = generate_traceevent_regex(
+			"vmscan/mm_vmscan_wakeup_kswapd",
+			$regex_wakeup_kswapd_default,
+			"nid", "zid", "order");
+$regex_lru_isolate = generate_traceevent_regex(
+			"vmscan/mm_vmscan_lru_isolate",
+			$regex_lru_isolate_default,
+			"isolate_mode", "order",
+			"nr_requested", "nr_scanned", "nr_taken",
+			"contig_taken", "contig_dirty", "contig_failed");
+$regex_lru_shrink_inactive = generate_traceevent_regex(
+			"vmscan/mm_vmscan_lru_shrink_inactive",
+			$regex_lru_shrink_inactive_default,
+			"nid", "zid",
+			"lru",
+			"nr_scanned", "nr_reclaimed", "priority");
+$regex_lru_shrink_active = generate_traceevent_regex(
+			"vmscan/mm_vmscan_lru_shrink_active",
+			$regex_lru_shrink_active_default,
+			"nid", "zid",
+			"lru",
+			"nr_scanned", "nr_rotated", "priority");
+$regex_writepage = generate_traceevent_regex(
+			"vmscan/mm_vmscan_writepage",
+			$regex_writepage_default,
+			"page", "pfn", "flags");
+
+sub read_statline($) {
+	my $pid = $_[0];
+	my $statline;
+
+	if (open(STAT, "/proc/$pid/stat")) {
+		$statline = <STAT>;
+		close(STAT);
+	}
+
+	if ($statline eq '') {
+		$statline = "-1 (UNKNOWN_PROCESS_NAME) R 0";
+	}
+
+	return $statline;
+}
+
+sub guess_process_pid($$) {
+	my $pid = $_[0];
+	my $statline = $_[1];
+
+	if ($pid == 0) {
+		return "swapper-0";
+	}
+
+	if ($statline !~ /$regex_statname/o) {
+		die("Failed to math stat line for process name :: $statline");
+	}
+	return "$1-$pid";
+}
+
+# Convert sec.usec timestamp format
+sub timestamp_to_ms($) {
+	my $timestamp = $_[0];
+
+	my ($sec, $usec) = split (/\./, $timestamp);
+	return ($sec * 1000) + ($usec / 1000);
+}
+
+sub process_events {
+	my $traceevent;
+	my $process_pid;
+	my $cpus;
+	my $timestamp;
+	my $tracepoint;
+	my $details;
+	my $statline;
+
+	# Read each line of the event log
+EVENT_PROCESS:
+	while ($traceevent = <STDIN>) {
+		if ($traceevent =~ /$regex_traceevent/o) {
+			$process_pid = $1;
+			$timestamp = $3;
+			$tracepoint = $4;
+
+			$process_pid =~ /(.*)-([0-9]*)$/;
+			my $process = $1;
+			my $pid = $2;
+
+			if ($process eq "") {
+				$process = $last_procmap{$pid};
+				$process_pid = "$process-$pid";
+			}
+			$last_procmap{$pid} = $process;
+
+			if ($opt_read_procstat) {
+				$statline = read_statline($pid);
+				if ($opt_read_procstat && $process eq '') {
+					$process_pid = guess_process_pid($pid, $statline);
+				}
+			}
+		} else {
+			next;
+		}
+
+		# Perl Switch() sucks majorly
+		if ($tracepoint eq "mm_vmscan_direct_reclaim_begin") {
+			$timestamp = timestamp_to_ms($timestamp);
+			$perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}++;
+			$perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN} = $timestamp;
+
+			$details = $5;
+			if ($details !~ /$regex_direct_begin/o) {
+				print "WARNING: Failed to parse mm_vmscan_direct_reclaim_begin as expected\n";
+				print "         $details\n";
+				print "         $regex_direct_begin\n";
+				next;
+			}
+			my $order = $1;
+			$perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order]++;
+			$perprocesspid{$process_pid}->{STATE_DIRECT_ORDER} = $order;
+		} elsif ($tracepoint eq "mm_vmscan_direct_reclaim_end") {
+			# Count the event itself
+			my $index = $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_END};
+			$perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_END}++;
+
+			# Record how long direct reclaim took this time
+			if (defined $perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN}) {
+				$timestamp = timestamp_to_ms($timestamp);
+				my $order = $perprocesspid{$process_pid}->{STATE_DIRECT_ORDER};
+				my $latency = ($timestamp - $perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN});
+				$perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index] = "$order-$latency";
+			}
+		} elsif ($tracepoint eq "mm_vmscan_kswapd_wake") {
+			$details = $5;
+			if ($details !~ /$regex_kswapd_wake/o) {
+				print "WARNING: Failed to parse mm_vmscan_kswapd_wake as expected\n";
+				print "         $details\n";
+				print "         $regex_kswapd_wake\n";
+				next;
+			}
+
+			my $order = $2;
+			$perprocesspid{$process_pid}->{STATE_KSWAPD_ORDER} = $order;
+			if (!$perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN}) {
+				$timestamp = timestamp_to_ms($timestamp);
+				$perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}++;
+				$perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN} = $timestamp;
+				$perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order]++;
+			} else {
+				$perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP}++;
+				$perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP_PERORDER}[$order]++;
+			}
+		} elsif ($tracepoint eq "mm_vmscan_kswapd_sleep") {
+
+			# Count the event itself
+			my $index = $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_SLEEP};
+			$perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_SLEEP}++;
+
+			# Record how long kswapd was awake
+			$timestamp = timestamp_to_ms($timestamp);
+			my $order = $perprocesspid{$process_pid}->{STATE_KSWAPD_ORDER};
+			my $latency = ($timestamp - $perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN});
+			$perprocesspid{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index] = "$order-$latency";
+			$perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN} = 0;
+		} elsif ($tracepoint eq "mm_vmscan_wakeup_kswapd") {
+			$perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}++;
+
+			$details = $5;
+			if ($details !~ /$regex_wakeup_kswapd/o) {
+				print "WARNING: Failed to parse mm_vmscan_wakeup_kswapd as expected\n";
+				print "         $details\n";
+				print "         $regex_wakeup_kswapd\n";
+				next;
+			}
+			my $order = $3;
+			$perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order]++;
+		} elsif ($tracepoint eq "mm_vmscan_lru_isolate") {
+			$details = $5;
+			if ($details !~ /$regex_lru_isolate/o) {
+				print "WARNING: Failed to parse mm_vmscan_lru_isolate as expected\n";
+				print "         $details\n";
+				print "         $regex_lru_isolate/o\n";
+				next;
+			}
+			my $nr_scanned = $4;
+			my $nr_contig_dirty = $7;
+			$perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned;
+			$perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty;
+		} elsif ($tracepoint eq "mm_vmscan_writepage") {
+			$details = $5;
+			if ($details !~ /$regex_writepage/o) {
+				print "WARNING: Failed to parse mm_vmscan_writepage as expected\n";
+				print "         $details\n";
+				print "         $regex_writepage\n";
+				next;
+			}
+
+			my $flags = $3;
+			my $file = 0;
+			my $sync_io = 0;
+			if ($flags =~ /RECLAIM_WB_FILE/) {
+				$file = 1;
+			}
+			if ($flags =~ /RECLAIM_WB_SYNC/) {
+				$sync_io = 1;
+			}
+			if ($sync_io) {
+				if ($file) {
+					$perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}++;
+				} else {
+					$perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}++;
+				}
+			} else {
+				if ($file) {
+					$perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}++;
+				} else {
+					$perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}++;
+				}
+			}
+		} else {
+			$perprocesspid{$process_pid}->{EVENT_UNKNOWN}++;
+		}
+
+		if ($sigint_pending) {
+			last EVENT_PROCESS;
+		}
+	}
+}
+
+sub dump_stats {
+	my $hashref = shift;
+	my %stats = %$hashref;
+
+	# Dump per-process stats
+	my $process_pid;
+	my $max_strlen = 0;
+
+	# Get the maximum process name
+	foreach $process_pid (keys %perprocesspid) {
+		my $len = length($process_pid);
+		if ($len > $max_strlen) {
+			$max_strlen = $len;
+		}
+	}
+	$max_strlen += 2;
+
+	# Work out latencies
+	printf("\n") if !$opt_ignorepid;
+	printf("Reclaim latencies expressed as order-latency_in_ms\n") if !$opt_ignorepid;
+	foreach $process_pid (keys %stats) {
+
+		if (!$stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[0] &&
+				!$stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[0]) {
+			next;
+		}
+
+		printf "%-" . $max_strlen . "s ", $process_pid if !$opt_ignorepid;
+		my $index = 0;
+		while (defined $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index] ||
+			defined $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]) {
+
+			if ($stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) {
+				printf("%s ", $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) if !$opt_ignorepid;
+				my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]);
+				$total_direct_latency += $latency;
+			} else {
+				printf("%s ", $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]) if !$opt_ignorepid;
+				my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]);
+				$total_kswapd_latency += $latency;
+			}
+			$index++;
+		}
+		print "\n" if !$opt_ignorepid;
+	}
+
+	# Print out process activity
+	printf("\n");
+	printf("%-" . $max_strlen . "s %8s %10s   %8s   %8s %8s %8s %8s\n", "Process", "Direct",  "Wokeup", "Pages",   "Pages",   "Pages",     "Time");
+	printf("%-" . $max_strlen . "s %8s %10s   %8s   %8s %8s %8s %8s\n", "details", "Rclms",   "Kswapd", "Scanned", "Sync-IO", "ASync-IO",  "Stalled");
+	foreach $process_pid (keys %stats) {
+
+		if (!$stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) {
+			next;
+		}
+
+		$total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN};
+		$total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD};
+		$total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED};
+		$total_direct_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC};
+		$total_direct_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC};
+		$total_direct_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC};
+
+		$total_direct_writepage_anon_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC};
+
+		my $index = 0;
+		my $this_reclaim_delay = 0;
+		while (defined $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) {
+			 my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]);
+			$this_reclaim_delay += $latency;
+			$index++;
+		}
+
+		printf("%-" . $max_strlen . "s %8d %10d   %8u   %8u %8u %8.3f",
+			$process_pid,
+			$stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN},
+			$stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD},
+			$stats{$process_pid}->{HIGH_NR_SCANNED},
+			$stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC},
+			$stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC},
+			$this_reclaim_delay / 1000);
+
+		if ($stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) {
+			print "      ";
+			for (my $order = 0; $order < 20; $order++) {
+				my $count = $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order];
+				if ($count != 0) {
+					print "direct-$order=$count ";
+				}
+			}
+		}
+		if ($stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}) {
+			print "      ";
+			for (my $order = 0; $order < 20; $order++) {
+				my $count = $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order];
+				if ($count != 0) {
+					print "wakeup-$order=$count ";
+				}
+			}
+		}
+		if ($stats{$process_pid}->{HIGH_NR_CONTIG_DIRTY}) {
+			print "      ";
+			my $count = $stats{$process_pid}->{HIGH_NR_CONTIG_DIRTY};
+			if ($count != 0) {
+				print "contig-dirty=$count ";
+			}
+		}
+
+		print "\n";
+	}
+
+	# Print out kswapd activity
+	printf("\n");
+	printf("%-" . $max_strlen . "s %8s %10s   %8s   %8s %8s %8s\n", "Kswapd",   "Kswapd",  "Order",     "Pages",   "Pages",  "Pages");
+	printf("%-" . $max_strlen . "s %8s %10s   %8s   %8s %8s %8s\n", "Instance", "Wakeups", "Re-wakeup", "Scanned", "Sync-IO", "ASync-IO");
+	foreach $process_pid (keys %stats) {
+
+		if (!$stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) {
+			next;
+		}
+
+		$total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE};
+		$total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED};
+		$total_kswapd_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC};
+		$total_kswapd_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC};
+		$total_kswapd_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC};
+		$total_kswapd_writepage_anon_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC};
+
+		printf("%-" . $max_strlen . "s %8d %10d   %8u   %8i %8u",
+			$process_pid,
+			$stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE},
+			$stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP},
+			$stats{$process_pid}->{HIGH_NR_SCANNED},
+			$stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC},
+			$stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC});
+
+		if ($stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) {
+			print "      ";
+			for (my $order = 0; $order < 20; $order++) {
+				my $count = $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order];
+				if ($count != 0) {
+					print "wake-$order=$count ";
+				}
+			}
+		}
+		if ($stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP}) {
+			print "      ";
+			for (my $order = 0; $order < 20; $order++) {
+				my $count = $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP_PERORDER}[$order];
+				if ($count != 0) {
+					print "rewake-$order=$count ";
+				}
+			}
+		}
+		printf("\n");
+	}
+
+	# Print out summaries
+	$total_direct_latency /= 1000;
+	$total_kswapd_latency /= 1000;
+	print "\nSummary\n";
+	print "Direct reclaims:     			$total_direct_reclaim\n";
+	print "Direct reclaim pages scanned:		$total_direct_nr_scanned\n";
+	print "Direct reclaim write file sync I/O:	$total_direct_writepage_file_sync\n";
+	print "Direct reclaim write anon sync I/O:	$total_direct_writepage_anon_sync\n";
+	print "Direct reclaim write file async I/O:	$total_direct_writepage_file_async\n";
+	print "Direct reclaim write anon async I/O:	$total_direct_writepage_anon_async\n";
+	print "Wake kswapd requests:			$total_wakeup_kswapd\n";
+	printf "Time stalled direct reclaim: 		%-1.2f seconds\n", $total_direct_latency;
+	print "\n";
+	print "Kswapd wakeups:				$total_kswapd_wake\n";
+	print "Kswapd pages scanned:			$total_kswapd_nr_scanned\n";
+	print "Kswapd reclaim write file sync I/O:	$total_kswapd_writepage_file_sync\n";
+	print "Kswapd reclaim write anon sync I/O:	$total_kswapd_writepage_anon_sync\n";
+	print "Kswapd reclaim write file async I/O:	$total_kswapd_writepage_file_async\n";
+	print "Kswapd reclaim write anon async I/O:	$total_kswapd_writepage_anon_async\n";
+	printf "Time kswapd awake:			%-1.2f seconds\n", $total_kswapd_latency;
+}
+
+sub aggregate_perprocesspid() {
+	my $process_pid;
+	my $process;
+	undef %perprocess;
+
+	foreach $process_pid (keys %perprocesspid) {
+		$process = $process_pid;
+		$process =~ s/-([0-9])*$//;
+		if ($process eq '') {
+			$process = "NO_PROCESS_NAME";
+		}
+
+		$perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN} += $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN};
+		$perprocess{$process}->{MM_VMSCAN_KSWAPD_WAKE} += $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE};
+		$perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD};
+		$perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP};
+		$perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED};
+		$perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC};
+		$perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC};
+		$perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC};
+		$perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC};
+
+		for (my $order = 0; $order < 20; $order++) {
+			$perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order];
+			$perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order];
+			$perprocess{$process}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order];
+
+		}
+
+		# Aggregate direct reclaim latencies
+		my $wr_index = $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_END};
+		my $rd_index = 0;
+		while (defined $perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$rd_index]) {
+			$perprocess{$process}->{HIGH_DIRECT_RECLAIM_LATENCY}[$wr_index] = $perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$rd_index];
+			$rd_index++;
+			$wr_index++;
+		}
+		$perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_END} = $wr_index;
+
+		# Aggregate kswapd latencies
+		my $wr_index = $perprocess{$process}->{MM_VMSCAN_KSWAPD_SLEEP};
+		my $rd_index = 0;
+		while (defined $perprocesspid{$process_pid}->{HIGH_KSWAPD_LATENCY}[$rd_index]) {
+			$perprocess{$process}->{HIGH_KSWAPD_LATENCY}[$wr_index] = $perprocesspid{$process_pid}->{HIGH_KSWAPD_LATENCY}[$rd_index];
+			$rd_index++;
+			$wr_index++;
+		}
+		$perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_END} = $wr_index;
+	}
+}
+
+sub report() {
+	if (!$opt_ignorepid) {
+		dump_stats(\%perprocesspid);
+	} else {
+		aggregate_perprocesspid();
+		dump_stats(\%perprocess);
+	}
+}
+
+# Process events or signals until neither is available
+sub signal_loop() {
+	my $sigint_processed;
+	do {
+		$sigint_processed = 0;
+		process_events();
+
+		# Handle pending signals if any
+		if ($sigint_pending) {
+			my $current_time = time;
+
+			if ($sigint_exit) {
+				print "Received exit signal\n";
+				$sigint_pending = 0;
+			}
+			if ($sigint_report) {
+				if ($current_time >= $sigint_received + 2) {
+					report();
+					$sigint_report = 0;
+					$sigint_pending = 0;
+					$sigint_processed = 1;
+				}
+			}
+		}
+	} while ($sigint_pending || $sigint_processed);
+}
+
+signal_loop();
+report();
author	Michal Marek <mmarek@suse.cz>	2010-10-27 22:15:57 (GMT)
committer	Michal Marek <mmarek@suse.cz>	2010-10-27 22:15:57 (GMT)
commit	b74b953b998bcc2db91b694446f3a2619ec32de6 (patch)
tree	6ce24caabd730f6ae9287ed0676ec32e6ff31e9d /Documentation/trace
parent	abb438526201c6a79949ad45375c051b6681c253 (diff)
parent	f6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)
download	linux-fsl-qoriq-b74b953b998bcc2db91b694446f3a2619ec32de6.tar.xz